path: root/fs/btrfs/inode.c
author     Chris Mason <chris.mason@oracle.com>    2008-11-06 22:02:51 -0500
committer  Chris Mason <chris.mason@oracle.com>    2008-11-06 22:02:51 -0500
commit     771ed689d2cd53439e28e095bc38fbe40a71429e (patch)
tree       518801f7141928e398d40c2b5955720d4346ce1a /fs/btrfs/inode.c
parent     4a69a41009c4ac691f7d9c289f5f37fabeddce46 (diff)
Btrfs: Optimize compressed writeback and reads
When reading compressed extents, try to put pages into the page cache for any
pages covered by the compressed extent that readpages didn't already preload.

Add an async work queue to handle transformations at delayed allocation
processing time. Right now this is just compression. The workflow is:

1) Find offsets in the file marked for delayed allocation
2) Lock the pages
3) Lock the state bits
4) Call the async delalloc code

The async delalloc code clears the state lock bits and delalloc bits. It is
important this happens before the range goes into the work queue because
otherwise it might deadlock with other work queue items that try to lock those
extent bits.

The file pages are compressed, and if the compression doesn't work the pages
are written back directly.

An ordered work queue is used to make sure the inodes are written in the same
order that pdflush or writepages sent them down.

This changes extent_write_cache_pages to let the writepage function update the
wbc nr_written count.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
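To make the workflow above concrete, here is a minimal user-space sketch in
plain C (not kernel code) of how the patch's cow_file_range_async() carves a
delalloc range into 512K async_cow work items and then runs the two phases:
compression, followed by submission in queue order. The 512K chunk size and
the async_cow_start/async_cow_submit split come from the patch below; the
queue, locking and actual compression are reduced to a plain array and printfs
so the example stays self-contained.

/*
 * Illustrative sketch only: splitting arithmetic and phase ordering are
 * taken from the patch, everything else is simplified for user space.
 */
#include <stdio.h>

#define CHUNK (512 * 1024ULL)          /* per work item, as in cow_file_range_async() */

struct async_cow {
	unsigned long long start;
	unsigned long long end;        /* inclusive, matching the kernel convention */
};

/* phase one: may run on any cpu, compresses the pages of [start, end] */
static void async_cow_start(const struct async_cow *cow)
{
	printf("compress [%llu, %llu]\n", cow->start, cow->end);
}

/* phase two: runs strictly in queue order, submits the (un)compressed IO */
static void async_cow_submit(const struct async_cow *cow)
{
	printf("submit   [%llu, %llu]\n", cow->start, cow->end);
}

int main(void)
{
	unsigned long long start = 0, end = 3 * CHUNK + 12345;  /* arbitrary range */
	struct async_cow queue[16];
	int n = 0;

	/* split the range into 512K work items, like the while(start < end) loop */
	while (start < end) {
		unsigned long long cur_end = end;
		if (cur_end > start + CHUNK - 1)
			cur_end = start + CHUNK - 1;
		queue[n].start = start;
		queue[n].end = cur_end;
		n++;
		start = cur_end + 1;
	}

	/* compression can be spread across cpus; submission keeps queue order */
	for (int i = 0; i < n; i++)
		async_cow_start(&queue[i]);
	for (int i = 0; i < n; i++)
		async_cow_submit(&queue[i]);
	return 0;
}

In the real patch the second loop is driven by the ordered work queue
(work.func / work.ordered_func), which is what guarantees that inodes hit the
disk in the same order pdflush or writepages handed them down.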
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--  fs/btrfs/inode.c  643
1 file changed, 500 insertions(+), 143 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df0ffad976e..e01c0d0310ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,6 +86,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
86 86
87static void btrfs_truncate(struct inode *inode); 87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
89 93
90/* 94/*
91 * a very lame attempt at stopping writes when the FS is 85% full. There 95 * a very lame attempt at stopping writes when the FS is 85% full. There
@@ -262,35 +266,72 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
262 return 0; 266 return 0;
263} 267}
264 268
269struct async_extent {
270 u64 start;
271 u64 ram_size;
272 u64 compressed_size;
273 struct page **pages;
274 unsigned long nr_pages;
275 struct list_head list;
276};
277
278struct async_cow {
279 struct inode *inode;
280 struct btrfs_root *root;
281 struct page *locked_page;
282 u64 start;
283 u64 end;
284 struct list_head extents;
285 struct btrfs_work work;
286};
287
288static noinline int add_async_extent(struct async_cow *cow,
289 u64 start, u64 ram_size,
290 u64 compressed_size,
291 struct page **pages,
292 unsigned long nr_pages)
293{
294 struct async_extent *async_extent;
295
296 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
297 async_extent->start = start;
298 async_extent->ram_size = ram_size;
299 async_extent->compressed_size = compressed_size;
300 async_extent->pages = pages;
301 async_extent->nr_pages = nr_pages;
302 list_add_tail(&async_extent->list, &cow->extents);
303 return 0;
304}
305
265/* 306/*
266 * when extent_io.c finds a delayed allocation range in the file, 307 * we create compressed extents in two phases. The first
267 * the call backs end up in this code. The basic idea is to 308 * phase compresses a range of pages that have already been
268 * allocate extents on disk for the range, and create ordered data structs 309 * locked (both pages and state bits are locked).
269 * in ram to track those extents.
270 * 310 *
271 * locked_page is the page that writepage had locked already. We use 311 * This is done inside an ordered work queue, and the compression
272 * it to make sure we don't do extra locks or unlocks. 312 * is spread across many cpus. The actual IO submission is step
313 * two, and the ordered work queue takes care of making sure that
314 * happens in the same order things were put onto the queue by
315 * writepages and friends.
273 * 316 *
274 * *page_started is set to one if we unlock locked_page and do everything 317 * If this code finds it can't get good compression, it puts an
275 * required to start IO on it. It may be clean and already done with 318 * entry onto the work queue to write the uncompressed bytes. This
276 * IO when we return. 319 * makes sure that both compressed inodes and uncompressed inodes
320 * are written in the same order that pdflush sent them down.
277 */ 321 */
278static int cow_file_range(struct inode *inode, struct page *locked_page, 322static noinline int compress_file_range(struct inode *inode,
279 u64 start, u64 end, int *page_started) 323 struct page *locked_page,
324 u64 start, u64 end,
325 struct async_cow *async_cow,
326 int *num_added)
280{ 327{
281 struct btrfs_root *root = BTRFS_I(inode)->root; 328 struct btrfs_root *root = BTRFS_I(inode)->root;
282 struct btrfs_trans_handle *trans; 329 struct btrfs_trans_handle *trans;
283 u64 alloc_hint = 0;
284 u64 num_bytes; 330 u64 num_bytes;
285 unsigned long ram_size;
286 u64 orig_start; 331 u64 orig_start;
287 u64 disk_num_bytes; 332 u64 disk_num_bytes;
288 u64 cur_alloc_size;
289 u64 blocksize = root->sectorsize; 333 u64 blocksize = root->sectorsize;
290 u64 actual_end; 334 u64 actual_end;
291 struct btrfs_key ins;
292 struct extent_map *em;
293 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
294 int ret = 0; 335 int ret = 0;
295 struct page **pages = NULL; 336 struct page **pages = NULL;
296 unsigned long nr_pages; 337 unsigned long nr_pages;
@@ -298,22 +339,12 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
298 unsigned long total_compressed = 0; 339 unsigned long total_compressed = 0;
299 unsigned long total_in = 0; 340 unsigned long total_in = 0;
300 unsigned long max_compressed = 128 * 1024; 341 unsigned long max_compressed = 128 * 1024;
301 unsigned long max_uncompressed = 256 * 1024; 342 unsigned long max_uncompressed = 128 * 1024;
302 int i; 343 int i;
303 int ordered_type;
304 int will_compress; 344 int will_compress;
305 345
306 trans = btrfs_join_transaction(root, 1);
307 BUG_ON(!trans);
308 btrfs_set_trans_block_group(trans, inode);
309 orig_start = start; 346 orig_start = start;
310 347
311 /*
312 * compression made this loop a bit ugly, but the basic idea is to
313 * compress some pages but keep the total size of the compressed
314 * extent relatively small. If compression is off, this goto target
315 * is never used.
316 */
317again: 348again:
318 will_compress = 0; 349 will_compress = 0;
319 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
@@ -324,7 +355,13 @@ again:
324 355
325 /* we want to make sure that amount of ram required to uncompress 356 /* we want to make sure that amount of ram required to uncompress
326 * an extent is reasonable, so we limit the total size in ram 357 * an extent is reasonable, so we limit the total size in ram
327 * of a compressed extent to 256k 358 * of a compressed extent to 128k. This is a crucial number
359 * because it also controls how easily we can spread reads across
360 * cpus for decompression.
361 *
362 * We also want to make sure the amount of IO required to do
363 * a random read is reasonably small, so we limit the size of
364 * a compressed extent to 128k.
328 */ 365 */
329 total_compressed = min(total_compressed, max_uncompressed); 366 total_compressed = min(total_compressed, max_uncompressed);
330 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 367 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -333,18 +370,16 @@ again:
333 total_in = 0; 370 total_in = 0;
334 ret = 0; 371 ret = 0;
335 372
336 /* we do compression for mount -o compress and when the 373 /*
337 * inode has not been flagged as nocompress 374 * we do compression for mount -o compress and when the
375 * inode has not been flagged as nocompress. This flag can
376 * change at any time if we discover bad compression ratios.
338 */ 377 */
339 if (!btrfs_test_flag(inode, NOCOMPRESS) && 378 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
340 btrfs_test_opt(root, COMPRESS)) { 379 btrfs_test_opt(root, COMPRESS)) {
341 WARN_ON(pages); 380 WARN_ON(pages);
342 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 381 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
343 382
344 /* we want to make sure the amount of IO required to satisfy
345 * a random read is reasonably small, so we limit the size
346 * of a compressed extent to 128k
347 */
348 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 383 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
349 total_compressed, pages, 384 total_compressed, pages,
350 nr_pages, &nr_pages_ret, 385 nr_pages, &nr_pages_ret,
@@ -371,26 +406,34 @@ again:
371 } 406 }
372 } 407 }
373 if (start == 0) { 408 if (start == 0) {
409 trans = btrfs_join_transaction(root, 1);
410 BUG_ON(!trans);
411 btrfs_set_trans_block_group(trans, inode);
412
374 /* lets try to make an inline extent */ 413 /* lets try to make an inline extent */
375 if (ret || total_in < (end - start + 1)) { 414 if (ret || total_in < (actual_end - start)) {
376 /* we didn't compress the entire range, try 415 /* we didn't compress the entire range, try
377 * to make an uncompressed inline extent. This 416 * to make an uncompressed inline extent.
378 * is almost sure to fail, but maybe inline sizes
379 * will get bigger later
380 */ 417 */
381 ret = cow_file_range_inline(trans, root, inode, 418 ret = cow_file_range_inline(trans, root, inode,
382 start, end, 0, NULL); 419 start, end, 0, NULL);
383 } else { 420 } else {
421 /* try making a compressed inline extent */
384 ret = cow_file_range_inline(trans, root, inode, 422 ret = cow_file_range_inline(trans, root, inode,
385 start, end, 423 start, end,
386 total_compressed, pages); 424 total_compressed, pages);
387 } 425 }
426 btrfs_end_transaction(trans, root);
388 if (ret == 0) { 427 if (ret == 0) {
428 /*
429 * inline extent creation worked, we don't need
430 * to create any more async work items. Unlock
431 * and free up our temp pages.
432 */
389 extent_clear_unlock_delalloc(inode, 433 extent_clear_unlock_delalloc(inode,
390 &BTRFS_I(inode)->io_tree, 434 &BTRFS_I(inode)->io_tree,
391 start, end, NULL, 435 start, end, NULL, 1, 0,
392 1, 1, 1); 436 0, 1, 1, 1);
393 *page_started = 1;
394 ret = 0; 437 ret = 0;
395 goto free_pages_out; 438 goto free_pages_out;
396 } 439 }
@@ -435,53 +478,280 @@ again:
435 /* flag the file so we don't compress in the future */ 478 /* flag the file so we don't compress in the future */
436 btrfs_set_flag(inode, NOCOMPRESS); 479 btrfs_set_flag(inode, NOCOMPRESS);
437 } 480 }
481 if (will_compress) {
482 *num_added += 1;
438 483
439 BUG_ON(disk_num_bytes > 484 /* the async work queues will take care of doing actual
440 btrfs_super_total_bytes(&root->fs_info->super_copy)); 485 * allocation on disk for these compressed pages,
486 * and will submit them to the elevator.
487 */
488 add_async_extent(async_cow, start, num_bytes,
489 total_compressed, pages, nr_pages_ret);
441 490
442 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 491 if (start + num_bytes < end) {
492 start += num_bytes;
493 pages = NULL;
494 cond_resched();
495 goto again;
496 }
497 } else {
498 /*
499 * No compression, but we still need to write the pages in
500 * the file we've been given so far. redirty the locked
501 * page if it corresponds to our extent and set things up
502 * for the async work queue to run cow_file_range to do
503 * the normal delalloc dance
504 */
505 if (page_offset(locked_page) >= start &&
506 page_offset(locked_page) <= end) {
507 __set_page_dirty_nobuffers(locked_page);
508 /* unlocked later on in the async handlers */
509 }
510 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511 *num_added += 1;
512 }
443 513
444 while(disk_num_bytes > 0) { 514out:
445 unsigned long min_bytes; 515 return 0;
516
517free_pages_out:
518 for (i = 0; i < nr_pages_ret; i++) {
519 WARN_ON(pages[i]->mapping);
520 page_cache_release(pages[i]);
521 }
522 if (pages)
523 kfree(pages);
524
525 goto out;
526}
527
528/*
529 * phase two of compressed writeback. This is the ordered portion
530 * of the code, which only gets called in the order the work was
531 * queued. We walk all the async extents created by compress_file_range
532 * and send them down to the disk.
533 */
534static noinline int submit_compressed_extents(struct inode *inode,
535 struct async_cow *async_cow)
536{
537 struct async_extent *async_extent;
538 u64 alloc_hint = 0;
539 struct btrfs_trans_handle *trans;
540 struct btrfs_key ins;
541 struct extent_map *em;
542 struct btrfs_root *root = BTRFS_I(inode)->root;
543 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
544 struct extent_io_tree *io_tree;
545 int ret;
546
547 if (list_empty(&async_cow->extents))
548 return 0;
549
550 trans = btrfs_join_transaction(root, 1);
551
552 while(!list_empty(&async_cow->extents)) {
553 async_extent = list_entry(async_cow->extents.next,
554 struct async_extent, list);
555 list_del(&async_extent->list);
446 556
557 io_tree = &BTRFS_I(inode)->io_tree;
558
559 /* did the compression code fall back to uncompressed IO? */
560 if (!async_extent->pages) {
561 int page_started = 0;
562 unsigned long nr_written = 0;
563
564 lock_extent(io_tree, async_extent->start,
565 async_extent->start + async_extent->ram_size - 1,
566 GFP_NOFS);
567
568 /* allocate blocks */
569 cow_file_range(inode, async_cow->locked_page,
570 async_extent->start,
571 async_extent->start +
572 async_extent->ram_size - 1,
573 &page_started, &nr_written, 0);
574
575 /*
576 * if page_started, cow_file_range inserted an
577 * inline extent and took care of all the unlocking
578 * and IO for us. Otherwise, we need to submit
579 * all those pages down to the drive.
580 */
581 if (!page_started)
582 extent_write_locked_range(io_tree,
583 inode, async_extent->start,
584 async_extent->start +
585 async_extent->ram_size - 1,
586 btrfs_get_extent,
587 WB_SYNC_ALL);
588 kfree(async_extent);
589 cond_resched();
590 continue;
591 }
592
593 lock_extent(io_tree, async_extent->start,
594 async_extent->start + async_extent->ram_size - 1,
595 GFP_NOFS);
447 /* 596 /*
448 * the max size of a compressed extent is pretty small, 597 * here we're doing allocation and writeback of the
449 * make the code a little less complex by forcing 598 * compressed pages
450 * the allocator to find a whole compressed extent at once
451 */ 599 */
452 if (will_compress) 600 btrfs_drop_extent_cache(inode, async_extent->start,
453 min_bytes = disk_num_bytes; 601 async_extent->start +
454 else 602 async_extent->ram_size - 1, 0);
455 min_bytes = root->sectorsize; 603
604 ret = btrfs_reserve_extent(trans, root,
605 async_extent->compressed_size,
606 async_extent->compressed_size,
607 0, alloc_hint,
608 (u64)-1, &ins, 1);
609 BUG_ON(ret);
610 em = alloc_extent_map(GFP_NOFS);
611 em->start = async_extent->start;
612 em->len = async_extent->ram_size;
613
614 em->block_start = ins.objectid;
615 em->block_len = ins.offset;
616 em->bdev = root->fs_info->fs_devices->latest_bdev;
617 set_bit(EXTENT_FLAG_PINNED, &em->flags);
618 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
619
620 while(1) {
621 spin_lock(&em_tree->lock);
622 ret = add_extent_mapping(em_tree, em);
623 spin_unlock(&em_tree->lock);
624 if (ret != -EEXIST) {
625 free_extent_map(em);
626 break;
627 }
628 btrfs_drop_extent_cache(inode, async_extent->start,
629 async_extent->start +
630 async_extent->ram_size - 1, 0);
631 }
632
633 ret = btrfs_add_ordered_extent(inode, async_extent->start,
634 ins.objectid,
635 async_extent->ram_size,
636 ins.offset,
637 BTRFS_ORDERED_COMPRESSED);
638 BUG_ON(ret);
639
640 btrfs_end_transaction(trans, root);
641
642 /*
643 * clear dirty, set writeback and unlock the pages.
644 */
645 extent_clear_unlock_delalloc(inode,
646 &BTRFS_I(inode)->io_tree,
647 async_extent->start,
648 async_extent->start +
649 async_extent->ram_size - 1,
650 NULL, 1, 1, 0, 1, 1, 0);
651
652 ret = btrfs_submit_compressed_write(inode,
653 async_extent->start,
654 async_extent->ram_size,
655 ins.objectid,
656 ins.offset, async_extent->pages,
657 async_extent->nr_pages);
658
659 BUG_ON(ret);
660 trans = btrfs_join_transaction(root, 1);
661 alloc_hint = ins.objectid + ins.offset;
662 kfree(async_extent);
663 cond_resched();
664 }
665
666 btrfs_end_transaction(trans, root);
667 return 0;
668}
669
670/*
671 * when extent_io.c finds a delayed allocation range in the file,
672 * the call backs end up in this code. The basic idea is to
673 * allocate extents on disk for the range, and create ordered data structs
674 * in ram to track those extents.
675 *
676 * locked_page is the page that writepage had locked already. We use
677 * it to make sure we don't do extra locks or unlocks.
678 *
679 * *page_started is set to one if we unlock locked_page and do everything
680 * required to start IO on it. It may be clean and already done with
681 * IO when we return.
682 */
683static noinline int cow_file_range(struct inode *inode,
684 struct page *locked_page,
685 u64 start, u64 end, int *page_started,
686 unsigned long *nr_written,
687 int unlock)
688{
689 struct btrfs_root *root = BTRFS_I(inode)->root;
690 struct btrfs_trans_handle *trans;
691 u64 alloc_hint = 0;
692 u64 num_bytes;
693 unsigned long ram_size;
694 u64 disk_num_bytes;
695 u64 cur_alloc_size;
696 u64 blocksize = root->sectorsize;
697 u64 actual_end;
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
456 706
707 actual_end = min_t(u64, i_size_read(inode), end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while(disk_num_bytes > 0) {
457 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
458 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
459 min_bytes, 0, alloc_hint, 739 root->sectorsize, 0, alloc_hint,
460 (u64)-1, &ins, 1); 740 (u64)-1, &ins, 1);
461 if (ret) { 741 if (ret) {
462 WARN_ON(1); 742 BUG();
463 goto free_pages_out_fail;
464 } 743 }
465 em = alloc_extent_map(GFP_NOFS); 744 em = alloc_extent_map(GFP_NOFS);
466 em->start = start; 745 em->start = start;
467 746
468 if (will_compress) { 747 ram_size = ins.offset;
469 ram_size = num_bytes; 748 em->len = ins.offset;
470 em->len = num_bytes;
471 } else {
472 /* ramsize == disk size */
473 ram_size = ins.offset;
474 em->len = ins.offset;
475 }
476 749
477 em->block_start = ins.objectid; 750 em->block_start = ins.objectid;
478 em->block_len = ins.offset; 751 em->block_len = ins.offset;
479 em->bdev = root->fs_info->fs_devices->latest_bdev; 752 em->bdev = root->fs_info->fs_devices->latest_bdev;
480 set_bit(EXTENT_FLAG_PINNED, &em->flags); 753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
481 754
482 if (will_compress)
483 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
484
485 while(1) { 755 while(1) {
486 spin_lock(&em_tree->lock); 756 spin_lock(&em_tree->lock);
487 ret = add_extent_mapping(em_tree, em); 757 ret = add_extent_mapping(em_tree, em);
@@ -495,10 +765,8 @@ again:
495 } 765 }
496 766
497 cur_alloc_size = ins.offset; 767 cur_alloc_size = ins.offset;
498 ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
499 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
500 ram_size, cur_alloc_size, 769 ram_size, cur_alloc_size, 0);
501 ordered_type);
502 BUG_ON(ret); 770 BUG_ON(ret);
503 771
504 if (disk_num_bytes < cur_alloc_size) { 772 if (disk_num_bytes < cur_alloc_size) {
@@ -506,82 +774,145 @@ again:
506 cur_alloc_size); 774 cur_alloc_size);
507 break; 775 break;
508 } 776 }
509
510 if (will_compress) {
511 /*
512 * we're doing compression, we and we need to
513 * submit the compressed extents down to the device.
514 *
515 * We lock down all the file pages, clearing their
516 * dirty bits and setting them writeback. Everyone
517 * that wants to modify the page will wait on the
518 * ordered extent above.
519 *
520 * The writeback bits on the file pages are
521 * cleared when the compressed pages are on disk
522 */
523 btrfs_end_transaction(trans, root);
524
525 if (start <= page_offset(locked_page) &&
526 page_offset(locked_page) < start + ram_size) {
527 *page_started = 1;
528 }
529
530 extent_clear_unlock_delalloc(inode,
531 &BTRFS_I(inode)->io_tree,
532 start,
533 start + ram_size - 1,
534 NULL, 1, 1, 0);
535
536 ret = btrfs_submit_compressed_write(inode, start,
537 ram_size, ins.objectid,
538 cur_alloc_size, pages,
539 nr_pages_ret);
540
541 BUG_ON(ret);
542 trans = btrfs_join_transaction(root, 1);
543 if (start + ram_size < end) {
544 start += ram_size;
545 alloc_hint = ins.objectid + ins.offset;
546 /* pages will be freed at end_bio time */
547 pages = NULL;
548 goto again;
549 } else {
550 /* we've written everything, time to go */
551 break;
552 }
553 }
554 /* we're not doing compressed IO, don't unlock the first 777 /* we're not doing compressed IO, don't unlock the first
555 * page (which the caller expects to stay locked), don't 778 * page (which the caller expects to stay locked), don't
556 * clear any dirty bits and don't set any writeback bits 779 * clear any dirty bits and don't set any writeback bits
557 */ 780 */
558 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 781 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
559 start, start + ram_size - 1, 782 start, start + ram_size - 1,
560 locked_page, 0, 0, 0); 783 locked_page, unlock, 1,
784 1, 0, 0, 0);
561 disk_num_bytes -= cur_alloc_size; 785 disk_num_bytes -= cur_alloc_size;
562 num_bytes -= cur_alloc_size; 786 num_bytes -= cur_alloc_size;
563 alloc_hint = ins.objectid + ins.offset; 787 alloc_hint = ins.objectid + ins.offset;
564 start += cur_alloc_size; 788 start += cur_alloc_size;
565 } 789 }
566
567 ret = 0;
568out: 790out:
791 ret = 0;
569 btrfs_end_transaction(trans, root); 792 btrfs_end_transaction(trans, root);
570 793
571 return ret; 794 return ret;
795}
572 796
573free_pages_out_fail: 797/*
574 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 798 * work queue call back to started compression on a file and pages
575 start, end, locked_page, 0, 0, 0); 799 */
576free_pages_out: 800static noinline void async_cow_start(struct btrfs_work *work)
577 for (i = 0; i < nr_pages_ret; i++) { 801{
578 WARN_ON(pages[i]->mapping); 802 struct async_cow *async_cow;
579 page_cache_release(pages[i]); 803 int num_added = 0;
804 async_cow = container_of(work, struct async_cow, work);
805
806 compress_file_range(async_cow->inode, async_cow->locked_page,
807 async_cow->start, async_cow->end, async_cow,
808 &num_added);
809 if (num_added == 0)
810 async_cow->inode = NULL;
811}
812
813/*
814 * work queue call back to submit previously compressed pages
815 */
816static noinline void async_cow_submit(struct btrfs_work *work)
817{
818 struct async_cow *async_cow;
819 struct btrfs_root *root;
820 unsigned long nr_pages;
821
822 async_cow = container_of(work, struct async_cow, work);
823
824 root = async_cow->root;
825 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
826 PAGE_CACHE_SHIFT;
827
828 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
829
830 if (atomic_read(&root->fs_info->async_delalloc_pages) <
831 5 * 1042 * 1024 &&
832 waitqueue_active(&root->fs_info->async_submit_wait))
833 wake_up(&root->fs_info->async_submit_wait);
834
835 if (async_cow->inode) {
836 submit_compressed_extents(async_cow->inode, async_cow);
580 } 837 }
581 if (pages) 838}
582 kfree(pages);
583 839
584 goto out; 840static noinline void async_cow_free(struct btrfs_work *work)
841{
842 struct async_cow *async_cow;
843 async_cow = container_of(work, struct async_cow, work);
844 kfree(async_cow);
845}
846
847static int cow_file_range_async(struct inode *inode, struct page *locked_page,
848 u64 start, u64 end, int *page_started,
849 unsigned long *nr_written)
850{
851 struct async_cow *async_cow;
852 struct btrfs_root *root = BTRFS_I(inode)->root;
853 unsigned long nr_pages;
854 u64 cur_end;
855 int limit = 10 * 1024 * 1042;
856
857 if (!btrfs_test_opt(root, COMPRESS)) {
858 return cow_file_range(inode, locked_page, start, end,
859 page_started, nr_written, 1);
860 }
861
862 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
863 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
864 while(start < end) {
865 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
866 async_cow->inode = inode;
867 async_cow->root = root;
868 async_cow->locked_page = locked_page;
869 async_cow->start = start;
870
871 if (btrfs_test_flag(inode, NOCOMPRESS))
872 cur_end = end;
873 else
874 cur_end = min(end, start + 512 * 1024 - 1);
875
876 async_cow->end = cur_end;
877 INIT_LIST_HEAD(&async_cow->extents);
878
879 async_cow->work.func = async_cow_start;
880 async_cow->work.ordered_func = async_cow_submit;
881 async_cow->work.ordered_free = async_cow_free;
882 async_cow->work.flags = 0;
883
884 while(atomic_read(&root->fs_info->async_submit_draining) &&
885 atomic_read(&root->fs_info->async_delalloc_pages)) {
886 wait_event(root->fs_info->async_submit_wait,
887 (atomic_read(&root->fs_info->async_delalloc_pages)
888 == 0));
889 }
890
891 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
892 PAGE_CACHE_SHIFT;
893 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
894
895 btrfs_queue_worker(&root->fs_info->delalloc_workers,
896 &async_cow->work);
897
898 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
899 wait_event(root->fs_info->async_submit_wait,
900 (atomic_read(&root->fs_info->async_delalloc_pages) <
901 limit));
902 }
903
904 while(atomic_read(&root->fs_info->async_submit_draining) &&
905 atomic_read(&root->fs_info->async_delalloc_pages)) {
906 wait_event(root->fs_info->async_submit_wait,
907 (atomic_read(&root->fs_info->async_delalloc_pages) ==
908 0));
909 }
910
911 *nr_written += nr_pages;
912 start = cur_end + 1;
913 }
914 *page_started = 1;
915 return 0;
585} 916}
586 917
587/* 918/*
@@ -592,7 +923,8 @@ free_pages_out:
592 * blocks on disk 923 * blocks on disk
593 */ 924 */
594static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, 925static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
595 u64 start, u64 end, int *page_started, int force) 926 u64 start, u64 end, int *page_started, int force,
927 unsigned long *nr_written)
596{ 928{
597 struct btrfs_root *root = BTRFS_I(inode)->root; 929 struct btrfs_root *root = BTRFS_I(inode)->root;
598 struct btrfs_trans_handle *trans; 930 struct btrfs_trans_handle *trans;
@@ -711,7 +1043,8 @@ out_check:
711 btrfs_release_path(root, path); 1043 btrfs_release_path(root, path);
712 if (cow_start != (u64)-1) { 1044 if (cow_start != (u64)-1) {
713 ret = cow_file_range(inode, locked_page, cow_start, 1045 ret = cow_file_range(inode, locked_page, cow_start,
714 found_key.offset - 1, page_started); 1046 found_key.offset - 1, page_started,
1047 nr_written, 1);
715 BUG_ON(ret); 1048 BUG_ON(ret);
716 cow_start = (u64)-1; 1049 cow_start = (u64)-1;
717 } 1050 }
@@ -748,9 +1081,10 @@ out_check:
748 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1081 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
749 num_bytes, num_bytes, type); 1082 num_bytes, num_bytes, type);
750 BUG_ON(ret); 1083 BUG_ON(ret);
1084
751 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1085 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
752 cur_offset, cur_offset + num_bytes - 1, 1086 cur_offset, cur_offset + num_bytes - 1,
753 locked_page, 0, 0, 0); 1087 locked_page, 1, 1, 1, 0, 0, 0);
754 cur_offset = extent_end; 1088 cur_offset = extent_end;
755 if (cur_offset > end) 1089 if (cur_offset > end)
756 break; 1090 break;
@@ -761,7 +1095,7 @@ out_check:
761 cow_start = cur_offset; 1095 cow_start = cur_offset;
762 if (cow_start != (u64)-1) { 1096 if (cow_start != (u64)-1) {
763 ret = cow_file_range(inode, locked_page, cow_start, end, 1097 ret = cow_file_range(inode, locked_page, cow_start, end,
764 page_started); 1098 page_started, nr_written, 1);
765 BUG_ON(ret); 1099 BUG_ON(ret);
766 } 1100 }
767 1101
@@ -775,7 +1109,8 @@ out_check:
775 * extent_io.c call back to do delayed allocation processing 1109 * extent_io.c call back to do delayed allocation processing
776 */ 1110 */
777static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1111static int run_delalloc_range(struct inode *inode, struct page *locked_page,
778 u64 start, u64 end, int *page_started) 1112 u64 start, u64 end, int *page_started,
1113 unsigned long *nr_written)
779{ 1114{
780 struct btrfs_root *root = BTRFS_I(inode)->root; 1115 struct btrfs_root *root = BTRFS_I(inode)->root;
781 int ret; 1116 int ret;
@@ -783,13 +1118,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
783 if (btrfs_test_opt(root, NODATACOW) || 1118 if (btrfs_test_opt(root, NODATACOW) ||
784 btrfs_test_flag(inode, NODATACOW)) 1119 btrfs_test_flag(inode, NODATACOW))
785 ret = run_delalloc_nocow(inode, locked_page, start, end, 1120 ret = run_delalloc_nocow(inode, locked_page, start, end,
786 page_started, 0); 1121 page_started, 0, nr_written);
787 else if (btrfs_test_flag(inode, PREALLOC)) 1122 else if (btrfs_test_flag(inode, PREALLOC))
788 ret = run_delalloc_nocow(inode, locked_page, start, end, 1123 ret = run_delalloc_nocow(inode, locked_page, start, end,
789 page_started, 1); 1124 page_started, 1, nr_written);
790 else 1125 else
791 ret = cow_file_range(inode, locked_page, start, end, 1126 ret = cow_file_range_async(inode, locked_page, start, end,
792 page_started); 1127 page_started, nr_written);
793 1128
794 return ret; 1129 return ret;
795} 1130}
@@ -861,6 +1196,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
861 u64 map_length; 1196 u64 map_length;
862 int ret; 1197 int ret;
863 1198
1199 if (bio_flags & EXTENT_BIO_COMPRESSED)
1200 return 0;
1201
864 length = bio->bi_size; 1202 length = bio->bi_size;
865 map_tree = &root->fs_info->mapping_tree; 1203 map_tree = &root->fs_info->mapping_tree;
866 map_length = length; 1204 map_length = length;
@@ -925,12 +1263,12 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
925 btrfs_test_flag(inode, NODATASUM); 1263 btrfs_test_flag(inode, NODATASUM);
926 1264
927 if (!(rw & (1 << BIO_RW))) { 1265 if (!(rw & (1 << BIO_RW))) {
928 if (!skip_sum)
929 btrfs_lookup_bio_sums(root, inode, bio);
930 1266
931 if (bio_flags & EXTENT_BIO_COMPRESSED) 1267 if (bio_flags & EXTENT_BIO_COMPRESSED)
932 return btrfs_submit_compressed_read(inode, bio, 1268 return btrfs_submit_compressed_read(inode, bio,
933 mirror_num, bio_flags); 1269 mirror_num, bio_flags);
1270 else if (!skip_sum)
1271 btrfs_lookup_bio_sums(root, inode, bio);
934 goto mapit; 1272 goto mapit;
935 } else if (!skip_sum) { 1273 } else if (!skip_sum) {
936 /* we're doing a write, do the async checksumming */ 1274 /* we're doing a write, do the async checksumming */
@@ -966,6 +1304,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
966 1304
967int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1305int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
968{ 1306{
1307 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
1308 WARN_ON(1);
1309 }
969 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1310 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
970 GFP_NOFS); 1311 GFP_NOFS);
971} 1312}
@@ -2105,6 +2446,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2105 int pending_del_nr = 0; 2446 int pending_del_nr = 0;
2106 int pending_del_slot = 0; 2447 int pending_del_slot = 0;
2107 int extent_type = -1; 2448 int extent_type = -1;
2449 int encoding;
2108 u64 mask = root->sectorsize - 1; 2450 u64 mask = root->sectorsize - 1;
2109 2451
2110 if (root->ref_cows) 2452 if (root->ref_cows)
@@ -2144,6 +2486,7 @@ search_again:
2144 leaf = path->nodes[0]; 2486 leaf = path->nodes[0];
2145 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2487 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2146 found_type = btrfs_key_type(&found_key); 2488 found_type = btrfs_key_type(&found_key);
2489 encoding = 0;
2147 2490
2148 if (found_key.objectid != inode->i_ino) 2491 if (found_key.objectid != inode->i_ino)
2149 break; 2492 break;
@@ -2156,6 +2499,10 @@ search_again:
2156 fi = btrfs_item_ptr(leaf, path->slots[0], 2499 fi = btrfs_item_ptr(leaf, path->slots[0],
2157 struct btrfs_file_extent_item); 2500 struct btrfs_file_extent_item);
2158 extent_type = btrfs_file_extent_type(leaf, fi); 2501 extent_type = btrfs_file_extent_type(leaf, fi);
2502 encoding = btrfs_file_extent_compression(leaf, fi);
2503 encoding |= btrfs_file_extent_encryption(leaf, fi);
2504 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2505
2159 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2506 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2160 item_end += 2507 item_end +=
2161 btrfs_file_extent_num_bytes(leaf, fi); 2508 btrfs_file_extent_num_bytes(leaf, fi);
@@ -2200,7 +2547,7 @@ search_again:
2200 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 2547 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2201 u64 num_dec; 2548 u64 num_dec;
2202 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 2549 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2203 if (!del_item) { 2550 if (!del_item && !encoding) {
2204 u64 orig_num_bytes = 2551 u64 orig_num_bytes =
2205 btrfs_file_extent_num_bytes(leaf, fi); 2552 btrfs_file_extent_num_bytes(leaf, fi);
2206 extent_num_bytes = new_size - 2553 extent_num_bytes = new_size -
@@ -2436,7 +2783,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2436 last_byte = min(extent_map_end(em), block_end); 2783 last_byte = min(extent_map_end(em), block_end);
2437 last_byte = (last_byte + mask) & ~mask; 2784 last_byte = (last_byte + mask) & ~mask;
2438 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 2785 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2786 u64 hint_byte = 0;
2439 hole_size = last_byte - cur_offset; 2787 hole_size = last_byte - cur_offset;
2788 err = btrfs_drop_extents(trans, root, inode,
2789 cur_offset,
2790 cur_offset + hole_size,
2791 cur_offset, &hint_byte);
2792 if (err)
2793 break;
2440 err = btrfs_insert_file_extent(trans, root, 2794 err = btrfs_insert_file_extent(trans, root,
2441 inode->i_ino, cur_offset, 0, 2795 inode->i_ino, cur_offset, 0,
2442 0, hole_size, 0, hole_size, 2796 0, hole_size, 0, hole_size,
@@ -3785,6 +4139,7 @@ int btrfs_writepages(struct address_space *mapping,
3785 struct writeback_control *wbc) 4139 struct writeback_control *wbc)
3786{ 4140{
3787 struct extent_io_tree *tree; 4141 struct extent_io_tree *tree;
4142
3788 tree = &BTRFS_I(mapping->host)->io_tree; 4143 tree = &BTRFS_I(mapping->host)->io_tree;
3789 return extent_writepages(tree, mapping, btrfs_get_extent, wbc); 4144 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
3790} 4145}
@@ -4285,9 +4640,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4285 * ordered extents get created before we return 4640 * ordered extents get created before we return
4286 */ 4641 */
4287 atomic_inc(&root->fs_info->async_submit_draining); 4642 atomic_inc(&root->fs_info->async_submit_draining);
4288 while(atomic_read(&root->fs_info->nr_async_submits)) { 4643 while(atomic_read(&root->fs_info->nr_async_submits) ||
4644 atomic_read(&root->fs_info->async_delalloc_pages)) {
4289 wait_event(root->fs_info->async_submit_wait, 4645 wait_event(root->fs_info->async_submit_wait,
4290 (atomic_read(&root->fs_info->nr_async_submits) == 0)); 4646 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4647 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4291 } 4648 }
4292 atomic_dec(&root->fs_info->async_submit_draining); 4649 atomic_dec(&root->fs_info->async_submit_draining);
4293 return 0; 4650 return 0;
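The last hunk above makes btrfs_start_delalloc_inodes wait until both
nr_async_submits and async_delalloc_pages drain to zero before returning, so
that all ordered extents exist by the time the caller proceeds. Below is a
minimal pthread sketch of that counter-plus-waitqueue drain pattern; the names
are hypothetical user-space stand-ins for the kernel's atomic_t counters,
wait_event() and wake_up(), not the btrfs API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int async_delalloc_pages;             /* outstanding async work */

static void add_pages(int n)                  /* like atomic_add() before queueing */
{
	pthread_mutex_lock(&lock);
	async_delalloc_pages += n;
	pthread_mutex_unlock(&lock);
}

static void sub_pages(int n)                  /* like atomic_sub() in async_cow_submit */
{
	pthread_mutex_lock(&lock);
	async_delalloc_pages -= n;
	if (async_delalloc_pages == 0)
		pthread_cond_broadcast(&drained);    /* like wake_up(async_submit_wait) */
	pthread_mutex_unlock(&lock);
}

static void wait_for_drain(void)              /* like the wait_event() loop above */
{
	pthread_mutex_lock(&lock);
	while (async_delalloc_pages != 0)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	sub_pages(16);                            /* pretend 16 pages were written */
	return NULL;
}

int main(void)
{
	pthread_t t;

	add_pages(16);
	pthread_create(&t, NULL, worker, NULL);
	wait_for_drain();
	pthread_join(&t, NULL);
	printf("all async delalloc pages written\n");
	return 0;
}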