aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2010-03-10 10:52:59 -0500
committer Chris Mason <chris.mason@oracle.com> 2010-03-15 11:00:10 -0400
commit 940100a4a7b78b27e60a3e72340fb9b5397dcdb2 (patch)
tree 8eec3d4a95cbc530ddaa82f0d2848e14519eec2a
parent 51684082b11c304829ea22193d4d96a5b1663b97 (diff)
Btrfs: be more selective in the defrag ioctl
The btrfs defrag ioctl had some bugs around delalloc accounting, and it wasn't properly skipping pages that were not in the mapping. It wasn't properly clearing the page checked flag, which could make the writeback code ignore the page forever while pinning it as dirty.

This commit fixes those problems and makes defrag a little smarter. It skips holes and it doesn't waste time defragging large extents. If a tiny extent comes before a very large extent, it will defrag both of them to make sure the tiny extent ends up next to something big.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ioctl.c150
1 files changed, 140 insertions, 10 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7875a75315d0..3a89cd77f307 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -475,6 +475,73 @@ out_unlock:
475 return error; 475 return error;
476} 476}
477 477
478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 u64 *last_len, u64 *skip, u64 *defrag_end)
480{
481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
482 struct extent_map *em = NULL;
483 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
484 int ret = 1;
485
486 /*
487 * make sure that once we start defragging and extent, we keep on
488 * defragging it
489 */
490 if (start < *defrag_end)
491 return 1;
492
493 *skip = 0;
494
495 /*
496 * hopefully we have this extent in the tree already, try without
497 * the full extent lock
498 */
499 read_lock(&em_tree->lock);
500 em = lookup_extent_mapping(em_tree, start, len);
501 read_unlock(&em_tree->lock);
502
503 if (!em) {
504 /* get the big lock and read metadata off disk */
505 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
506 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
507 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
508
509 if (!em)
510 return 0;
511 }
512
513 /* this will cover holes, and inline extents */
514 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
515 ret = 0;
516
517 /*
518 * we hit a real extent, if it is big don't bother defragging it again
519 */
520 if ((*last_len == 0 || *last_len >= 256 * 1024) &&
521 em->len >= 256 * 1024)
522 ret = 0;
523
524 /*
525 * last_len ends up being a counter of how many bytes we've defragged.
526 * every time we choose not to defrag an extent, we reset *last_len
527 * so that the next tiny extent will force a defrag.
528 *
529 * The end result of this is that tiny extents before a single big
530 * extent will force at least part of that big extent to be defragged.
531 */
532 if (ret) {
533 *last_len += len;
534 *defrag_end = extent_map_end(em);
535 } else {
536 *last_len = 0;
537 *skip = extent_map_end(em);
538 *defrag_end = 0;
539 }
540
541 free_extent_map(em);
542 return ret;
543}
544
478static int btrfs_defrag_file(struct file *file) 545static int btrfs_defrag_file(struct file *file)
479{ 546{
480 struct inode *inode = fdentry(file)->d_inode; 547 struct inode *inode = fdentry(file)->d_inode;
@@ -487,37 +554,86 @@ static int btrfs_defrag_file(struct file *file)
487 unsigned long total_read = 0; 554 unsigned long total_read = 0;
488 u64 page_start; 555 u64 page_start;
489 u64 page_end; 556 u64 page_end;
557 u64 last_len = 0;
558 u64 skip = 0;
559 u64 defrag_end = 0;
490 unsigned long i; 560 unsigned long i;
491 int ret; 561 int ret;
492 562
493 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 563 if (inode->i_size == 0)
494 if (ret) 564 return 0;
495 return -ENOSPC; 565
566 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
567 i = 0;
568 while (i <= last_index) {
569 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
570 PAGE_CACHE_SIZE, &last_len, &skip,
571 &defrag_end)) {
572 unsigned long next;
573 /*
574 * the should_defrag function tells us how much to skip
575 * bump our counter by the suggested amount
576 */
577 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
578 i = max(i + 1, next);
579 continue;
580 }
496 581
497 mutex_lock(&inode->i_mutex);
498 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
499 for (i = 0; i <= last_index; i++) {
500 if (total_read % ra_pages == 0) { 582 if (total_read % ra_pages == 0) {
501 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 583 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
502 min(last_index, i + ra_pages - 1)); 584 min(last_index, i + ra_pages - 1));
503 } 585 }
504 total_read++; 586 total_read++;
587 mutex_lock(&inode->i_mutex);
588
589 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
590 if (ret) {
591 ret = -ENOSPC;
592 break;
593 }
594
595 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
596 if (ret) {
597 btrfs_free_reserved_data_space(root, inode,
598 PAGE_CACHE_SIZE);
599 ret = -ENOSPC;
600 break;
601 }
505again: 602again:
603 if (inode->i_size == 0 ||
604 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
605 ret = 0;
606 goto err_reservations;
607 }
608
506 page = grab_cache_page(inode->i_mapping, i); 609 page = grab_cache_page(inode->i_mapping, i);
507 if (!page) 610 if (!page)
508 goto out_unlock; 611 goto err_reservations;
612
509 if (!PageUptodate(page)) { 613 if (!PageUptodate(page)) {
510 btrfs_readpage(NULL, page); 614 btrfs_readpage(NULL, page);
511 lock_page(page); 615 lock_page(page);
512 if (!PageUptodate(page)) { 616 if (!PageUptodate(page)) {
513 unlock_page(page); 617 unlock_page(page);
514 page_cache_release(page); 618 page_cache_release(page);
515 goto out_unlock; 619 goto err_reservations;
516 } 620 }
517 } 621 }
518 622
623 if (page->mapping != inode->i_mapping) {
624 unlock_page(page);
625 page_cache_release(page);
626 goto again;
627 }
628
519 wait_on_page_writeback(page); 629 wait_on_page_writeback(page);
520 630
631 if (PageDirty(page)) {
632 btrfs_free_reserved_data_space(root, inode,
633 PAGE_CACHE_SIZE);
634 goto loop_unlock;
635 }
636
521 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 637 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
522 page_end = page_start + PAGE_CACHE_SIZE - 1; 638 page_end = page_start + PAGE_CACHE_SIZE - 1;
523 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 639 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -538,18 +654,32 @@ again:
538 * page if it is dirtied again later 654 * page if it is dirtied again later
539 */ 655 */
540 clear_page_dirty_for_io(page); 656 clear_page_dirty_for_io(page);
657 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
658 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
659 EXTENT_DO_ACCOUNTING, GFP_NOFS);
541 660
542 btrfs_set_extent_delalloc(inode, page_start, page_end); 661 btrfs_set_extent_delalloc(inode, page_start, page_end);
662 ClearPageChecked(page);
543 set_page_dirty(page); 663 set_page_dirty(page);
544 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 664 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
665
666loop_unlock:
545 unlock_page(page); 667 unlock_page(page);
546 page_cache_release(page); 668 page_cache_release(page);
669 mutex_unlock(&inode->i_mutex);
670
671 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
547 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 672 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
673 i++;
548 } 674 }
549 675
550out_unlock:
551 mutex_unlock(&inode->i_mutex);
552 return 0; 676 return 0;
677
678err_reservations:
679 mutex_unlock(&inode->i_mutex);
680 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
681 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
682 return ret;
553} 683}
554 684
555static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 685static noinline int btrfs_ioctl_resize(struct btrfs_root *root,