author     Chris Mason <chris.mason@oracle.com>    2011-05-24 15:35:30 -0400
committer  Chris Mason <chris.mason@oracle.com>    2011-05-26 17:52:15 -0400
commit     4cb5300bc839b8a943eb19c9f27f25470e22d0ca (patch)
tree       ac0f2fb481c7aa6af08a624d276fa6d580c94c9b /fs/btrfs/ioctl.c
parent     d6c0cb379c5198487e4ac124728cbb2346d63b1f (diff)

Btrfs: add mount -o auto_defrag
This will detect small random writes into files and queue them up for an
auto defrag process.  It isn't well suited to database workloads yet, but
works for smaller files such as rpm, sqlite or bdb databases.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/ioctl.c')

 -rw-r--r--  fs/btrfs/ioctl.c  448
 1 file changed, 346 insertions(+), 102 deletions(-)
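
For orientation: btrfs_defrag_file() is the backend of the BTRFS_IOC_DEFRAG_RANGE
ioctl whose call site is updated at the bottom of this patch. A minimal userspace
sketch of driving that ioctl follows (not part of the patch); the struct layout
and ioctl number are hand-copied from the fs/btrfs/ioctl.h of this era and should
be treated as assumptions rather than authoritative definitions.

    /*
     * Minimal sketch: issue BTRFS_IOC_DEFRAG_RANGE against one file.
     * Struct layout and ioctl number are assumptions copied by hand;
     * include the real kernel header in production code.
     */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ioctl.h>

    struct btrfs_ioctl_defrag_range_args {
            uint64_t start;          /* start of the defrag operation */
            uint64_t len;            /* bytes to defrag, (uint64_t)-1 for all */
            uint64_t flags;          /* BTRFS_DEFRAG_RANGE_* flags */
            uint32_t extent_thresh;  /* extents under this size are candidates */
            uint32_t compress_type;  /* compression applied while defragging */
            uint32_t unused[4];
    };

    #define BTRFS_IOCTL_MAGIC 0x94
    #define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
                                        struct btrfs_ioctl_defrag_range_args)

    int main(int argc, char **argv)
    {
            struct btrfs_ioctl_defrag_range_args range;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <file on btrfs>\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            memset(&range, 0, sizeof(range));
            range.len = (uint64_t)-1;          /* whole file */
            range.extent_thresh = 256 * 1024;  /* same default the kernel applies */

            if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range) < 0)
                    perror("BTRFS_IOC_DEFRAG_RANGE");
            close(fd);
            return 0;
    }
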
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c4f17e4e2c9c..85e818ce00c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -656,6 +656,106 @@ out_unlock:
         return error;
 }
 
+/*
+ * When we're defragging a range, we don't want to kick it off again
+ * if it is really just waiting for delalloc to send it down.
+ * If we find a nice big extent or delalloc range for the bytes in the
+ * file you want to defrag, we return 0 to let you know to skip this
+ * part of the file
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct extent_map *em = NULL;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        u64 end;
+
+        read_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+        read_unlock(&em_tree->lock);
+
+        if (em) {
+                end = extent_map_end(em);
+                free_extent_map(em);
+                if (end - offset > thresh)
+                        return 0;
+        }
+        /* if we already have a nice delalloc here, just stop */
+        thresh /= 2;
+        end = count_range_bits(io_tree, &offset, offset + thresh,
+                               thresh, EXTENT_DELALLOC, 1);
+        if (end >= thresh)
+                return 0;
+        return 1;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ */
+static int find_new_extents(struct btrfs_root *root,
+                            struct inode *inode, u64 newer_than,
+                            u64 *off, int thresh)
+{
+        struct btrfs_path *path;
+        struct btrfs_key min_key;
+        struct btrfs_key max_key;
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *extent;
+        int type;
+        int ret;
+
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+
+        min_key.objectid = inode->i_ino;
+        min_key.type = BTRFS_EXTENT_DATA_KEY;
+        min_key.offset = *off;
+
+        max_key.objectid = inode->i_ino;
+        max_key.type = (u8)-1;
+        max_key.offset = (u64)-1;
+
+        path->keep_locks = 1;
+
+        while(1) {
+                ret = btrfs_search_forward(root, &min_key, &max_key,
+                                           path, 0, newer_than);
+                if (ret != 0)
+                        goto none;
+                if (min_key.objectid != inode->i_ino)
+                        goto none;
+                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto none;
+
+                leaf = path->nodes[0];
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_file_extent_item);
+
+                type = btrfs_file_extent_type(leaf, extent);
+                if (type == BTRFS_FILE_EXTENT_REG &&
+                    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+                    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+                        *off = min_key.offset;
+                        btrfs_free_path(path);
+                        return 0;
+                }
+
+                if (min_key.offset == (u64)-1)
+                        goto none;
+
+                min_key.offset++;
+                btrfs_release_path(path);
+        }
+none:
+        btrfs_free_path(path);
+        return -ENOENT;
+}
+
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
                                int thresh, u64 *last_len, u64 *skip,
                                u64 *defrag_end)
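
The two early-outs added above are easy to trace with concrete numbers: an
extent must reach more than thresh bytes past the target offset, or at least
thresh / 2 bytes of delalloc must already be queued in the following range,
for the offset to be skipped. A standalone sketch of just that arithmetic
(not part of the patch; plain userspace C, no kernel types):

    /*
     * Trace of check_defrag_in_cache()'s two skip conditions, using the
     * 256K default threshold that btrfs_defrag_file() applies when the
     * caller passes 0.
     */
    #include <stdio.h>

    static int should_skip(unsigned long long extent_len,
                           unsigned long long delalloc_bytes, int thresh)
    {
            if (extent_len > (unsigned long long)thresh)
                    return 1;            /* extent is already big enough */
            thresh /= 2;                 /* 128K of queued delalloc also counts */
            return delalloc_bytes >= (unsigned long long)thresh;
    }

    int main(void)
    {
            printf("300K extent:               skip=%d\n",
                   should_skip(300 * 1024, 0, 256 * 1024));
            printf("64K extent, 128K delalloc: skip=%d\n",
                   should_skip(64 * 1024, 128 * 1024, 256 * 1024));
            printf("64K extent, no delalloc:   skip=%d\n",
                   should_skip(64 * 1024, 0, 256 * 1024));
            return 0;
    }
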
@@ -665,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         int ret = 1;
 
-
-        if (thresh == 0)
-                thresh = 256 * 1024;
-
         /*
          * make sure that once we start defragging and extent, we keep on
          * defragging it
@@ -727,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
         return ret;
 }
 
-static int btrfs_defrag_file(struct file *file,
-                             struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time. This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+                                    struct page **pages,
+                                    unsigned long start_index,
+                                    int num_pages)
 {
-        struct inode *inode = fdentry(file)->d_inode;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        unsigned long file_end;
+        u64 isize = i_size_read(inode);
+        u64 page_start;
+        u64 page_end;
+        int ret;
+        int i;
+        int i_done;
         struct btrfs_ordered_extent *ordered;
-        struct page *page;
+        struct extent_state *cached_state = NULL;
+
+        if (isize == 0)
+                return 0;
+        file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+
+        ret = btrfs_delalloc_reserve_space(inode,
+                                           num_pages << PAGE_CACHE_SHIFT);
+        if (ret)
+                return ret;
+again:
+        ret = 0;
+        i_done = 0;
+
+        /* step one, lock all the pages */
+        for (i = 0; i < num_pages; i++) {
+                struct page *page;
+                page = grab_cache_page(inode->i_mapping,
+                                       start_index + i);
+                if (!page)
+                        break;
+
+                if (!PageUptodate(page)) {
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                ret = -EIO;
+                                break;
+                        }
+                }
+                isize = i_size_read(inode);
+                file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+                if (!isize || page->index > file_end ||
+                    page->mapping != inode->i_mapping) {
+                        /* whoops, we blew past eof, skip this page */
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
+                pages[i] = page;
+                i_done++;
+        }
+        if (!i_done || ret)
+                goto out;
+
+        if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                goto out;
+
+        /*
+         * so now we have a nice long stream of locked
+         * and up to date pages, lets wait on them
+         */
+        for (i = 0; i < i_done; i++)
+                wait_on_page_writeback(pages[i]);
+
+        page_start = page_offset(pages[0]);
+        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+        lock_extent_bits(&BTRFS_I(inode)->io_tree,
+                         page_start, page_end - 1, 0, &cached_state,
+                         GFP_NOFS);
+        ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+        if (ordered &&
+            ordered->file_offset + ordered->len > page_start &&
+            ordered->file_offset < page_end) {
+                btrfs_put_ordered_extent(ordered);
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                                     page_start, page_end - 1,
+                                     &cached_state, GFP_NOFS);
+                for (i = 0; i < i_done; i++) {
+                        unlock_page(pages[i]);
+                        page_cache_release(pages[i]);
+                }
+                btrfs_wait_ordered_range(inode, page_start,
+                                         page_end - page_start);
+                goto again;
+        }
+        if (ordered)
+                btrfs_put_ordered_extent(ordered);
+
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+                         page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+                         EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+                         GFP_NOFS);
+
+        if (i_done != num_pages) {
+                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+                btrfs_delalloc_release_space(inode,
+                                             (num_pages - i_done) << PAGE_CACHE_SHIFT);
+        }
+
+
+        btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+                                  &cached_state);
+
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                             page_start, page_end - 1, &cached_state,
+                             GFP_NOFS);
+
+        for (i = 0; i < i_done; i++) {
+                clear_page_dirty_for_io(pages[i]);
+                ClearPageChecked(pages[i]);
+                set_page_extent_mapped(pages[i]);
+                set_page_dirty(pages[i]);
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        return i_done;
+out:
+        for (i = 0; i < i_done; i++) {
+                unlock_page(pages[i]);
+                page_cache_release(pages[i]);
+        }
+        btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+        return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+                      struct btrfs_ioctl_defrag_range_args *range,
+                      u64 newer_than, unsigned long max_to_defrag)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_super_block *disk_super;
+        struct file_ra_state *ra = NULL;
         unsigned long last_index;
-        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
-        unsigned long total_read = 0;
         u64 features;
-        u64 page_start;
-        u64 page_end;
         u64 last_len = 0;
         u64 skip = 0;
         u64 defrag_end = 0;
+        u64 newer_off = range->start;
+        int newer_left = 0;
         unsigned long i;
         int ret;
+        int defrag_count = 0;
         int compress_type = BTRFS_COMPRESS_ZLIB;
+        int extent_thresh = range->extent_thresh;
+        int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+        u64 new_align = ~((u64)128 * 1024 - 1);
+        struct page **pages = NULL;
+
+        if (extent_thresh == 0)
+                extent_thresh = 256 * 1024;
 
         if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
                 if (range->compress_type > BTRFS_COMPRESS_TYPES)
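
One line in the hunk above deserves a second look: new_align masks the restart
offset down to a 128K boundary before it becomes a page index, which is what
keeps successive auto-defrag passes evenly spaced in the file. A quick
standalone check of that arithmetic (not part of the patch; 4K pages are an
assumption here, the kernel shifts by PAGE_CACHE_SHIFT):

    /*
     * The new_align mask from btrfs_defrag_file(): round the restart
     * offset down to a 128K boundary, then convert to a page index.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long new_align = ~((unsigned long long)128 * 1024 - 1);
            unsigned long long newer_off = 1000 * 1024;  /* extent found at 1000K */
            int page_shift = 12;                         /* assumed 4K pages */

            unsigned long i = (newer_off & new_align) >> page_shift;
            printf("defrag restarts at page %lu (byte %llu)\n",
                   i, (unsigned long long)i << page_shift);
            return 0;   /* prints page 224, byte 917504 (896K) */
    }
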
@@ -759,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file,
         if (inode->i_size == 0)
                 return 0;
 
+        /*
+         * if we were not given a file, allocate a readahead
+         * context
+         */
+        if (!file) {
+                ra = kzalloc(sizeof(*ra), GFP_NOFS);
+                if (!ra)
+                        return -ENOMEM;
+                file_ra_state_init(ra, inode->i_mapping);
+        } else {
+                ra = &file->f_ra;
+        }
+
+        pages = kmalloc(sizeof(struct page *) * newer_cluster,
+                        GFP_NOFS);
+        if (!pages) {
+                ret = -ENOMEM;
+                goto out_ra;
+        }
+
+        /* find the last page to defrag */
         if (range->start + range->len > range->start) {
                 last_index = min_t(u64, inode->i_size - 1,
                         range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -766,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file,
                 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
         }
 
-        i = range->start >> PAGE_CACHE_SHIFT;
-        while (i <= last_index) {
-                if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+        if (newer_than) {
+                ret = find_new_extents(root, inode, newer_than,
+                                       &newer_off, 64 * 1024);
+                if (!ret) {
+                        range->start = newer_off;
+                        /*
+                         * we always align our defrag to help keep
+                         * the extents in the file evenly spaced
+                         */
+                        i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                        newer_left = newer_cluster;
+                } else
+                        goto out_ra;
+        } else {
+                i = range->start >> PAGE_CACHE_SHIFT;
+        }
+        if (!max_to_defrag)
+                max_to_defrag = last_index - 1;
+
+        while (i <= last_index && defrag_count < max_to_defrag) {
+                /*
+                 * make sure we stop running if someone unmounts
+                 * the FS
+                 */
+                if (!(inode->i_sb->s_flags & MS_ACTIVE))
+                        break;
+
+                if (!newer_than &&
+                    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
                                         PAGE_CACHE_SIZE,
-                                        range->extent_thresh,
+                                        extent_thresh,
                                         &last_len, &skip,
                                         &defrag_end)) {
                         unsigned long next;
@@ -782,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file,
                         i = max(i + 1, next);
                         continue;
                 }
-
-                if (total_read % ra_pages == 0) {
-                        btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
-                                       min(last_index, i + ra_pages - 1));
-                }
-                total_read++;
-                mutex_lock(&inode->i_mutex);
                 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                         BTRFS_I(inode)->force_compress = compress_type;
 
-                ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-                if (ret)
-                        goto err_unlock;
-again:
-                if (inode->i_size == 0 ||
-                    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
-                        ret = 0;
-                        goto err_reservations;
-                }
+                btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
 
-                page = grab_cache_page(inode->i_mapping, i);
-                if (!page) {
-                        ret = -ENOMEM;
-                        goto err_reservations;
-                }
-
-                if (!PageUptodate(page)) {
-                        btrfs_readpage(NULL, page);
-                        lock_page(page);
-                        if (!PageUptodate(page)) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                ret = -EIO;
-                                goto err_reservations;
-                        }
-                }
-
-                if (page->mapping != inode->i_mapping) {
-                        unlock_page(page);
-                        page_cache_release(page);
-                        goto again;
-                }
-
-                wait_on_page_writeback(page);
+                ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+                if (ret < 0)
+                        goto out_ra;
 
-                if (PageDirty(page)) {
-                        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-                        goto loop_unlock;
-                }
+                defrag_count += ret;
+                balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+                i += ret;
 
-                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-                page_end = page_start + PAGE_CACHE_SIZE - 1;
-                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                if (newer_than) {
+                        if (newer_off == (u64)-1)
+                                break;
 
-                ordered = btrfs_lookup_ordered_extent(inode, page_start);
-                if (ordered) {
-                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-                        unlock_page(page);
-                        page_cache_release(page);
-                        btrfs_start_ordered_extent(inode, ordered, 1);
-                        btrfs_put_ordered_extent(ordered);
-                        goto again;
+                        newer_off = max(newer_off + 1,
+                                        (u64)i << PAGE_CACHE_SHIFT);
+
+                        ret = find_new_extents(root, inode,
+                                               newer_than, &newer_off,
+                                               64 * 1024);
+                        if (!ret) {
+                                range->start = newer_off;
+                                i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+                                newer_left = newer_cluster;
+                        } else {
+                                break;
+                        }
+                } else {
+                        i++;
                 }
-                set_page_extent_mapped(page);
-
-                /*
-                 * this makes sure page_mkwrite is called on the
-                 * page if it is dirtied again later
-                 */
-                clear_page_dirty_for_io(page);
-                clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
-                                  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
-                                  EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
-                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
-                ClearPageChecked(page);
-                set_page_dirty(page);
-                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
-                unlock_page(page);
-                page_cache_release(page);
-                mutex_unlock(&inode->i_mutex);
-
-                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
-                i++;
         }
 
         if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
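
The control flow above is the heart of the rewrite: instead of locking, COWing
and unlocking one page per iteration under i_mutex, the loop hands
cluster_pages_for_defrag() a whole cluster, advances i by its return value, and
lets balance_dirty_pages_ratelimited_nr() throttle once per cluster. A toy walk
of that advancement (not part of the patch; all values hypothetical,
newer_cluster is 256K worth of pages, 64 with the 4K pages assumed here):

    /* Toy walk of the rewritten defrag loop's cluster advancement. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long i = 0, last_index = 250, defrag_count = 0;
            unsigned long max_to_defrag = 200;
            int newer_cluster = (256 * 1024) >> 12;

            while (i <= last_index && defrag_count < max_to_defrag) {
                    int done = newer_cluster;           /* pretend every page clustered */
                    if (i + done > last_index + 1)
                            done = last_index + 1 - i;  /* short final cluster */
                    defrag_count += done;
                    i += done;                          /* jump a whole cluster */
                    printf("cluster of %d pages done, next index %lu (total %lu)\n",
                           done, i, defrag_count);
            }
            return 0;
    }
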
@@ -899,12 +1138,14 @@ loop_unlock:
                 btrfs_set_super_incompat_flags(disk_super, features);
         }
 
-        return 0;
+        if (!file)
+                kfree(ra);
+        return defrag_count;
 
-err_reservations:
-        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
-        mutex_unlock(&inode->i_mutex);
+out_ra:
+        if (!file)
+                kfree(ra);
+        kfree(pages);
         return ret;
 }
 
@@ -1756,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                         /* the rest are all set to zero by kzalloc */
                         range->len = (u64)-1;
                 }
-                ret = btrfs_defrag_file(file, range);
+                ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+                                        range, 0, 0);
+                if (ret > 0)
+                        ret = 0;
                 kfree(range);
                 break;
         default: