diff options
author | Tristan Ye <tristan.ye@oracle.com> | 2011-05-25 02:23:43 -0400 |
---|---|---|
committer | Tristan Ye <tristan.ye@oracle.com> | 2011-05-25 03:17:12 -0400 |
commit | 53069d4e76954e2e63c1b3c501051c6fbcf7298c (patch) | |
tree | 6b906eb22fef78636c7d4db7120370d7715f5aef | |
parent | ee16cc037e255801892481a2d0b7c1fff2adf1aa (diff) |
Ocfs2/move_extents: move/defrag extents within a certain range.
The basic logic of moving extents for a file is much like the hole-punching
sequence: walk the extents within the range the user specified, calculate
an appropriate length to defrag/move, then let ocfs2_defrag_extent() or
ocfs2_move_extent() do the actual moving.
This function ends up reporting 'OCFS2_MOVE_EXT_FL_COMPLETE' to userspace if
the operation completes successfully.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
-rw-r--r-- | fs/ocfs2/ioctl.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/move_extents.c | 308 | ||||
-rw-r--r-- | fs/ocfs2/move_extents.h | 2 |
3 files changed, 315 insertions, 0 deletions
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fd248ed53df7..59100598b0cb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include "dir.h" | 26 | #include "dir.h" |
27 | #include "buffer_head_io.h" | 27 | #include "buffer_head_io.h" |
28 | #include "suballoc.h" | 28 | #include "suballoc.h" |
29 | #include "move_extents.h" | ||
29 | 30 | ||
30 | #include <linux/ext2_fs.h> | 31 | #include <linux/ext2_fs.h> |
31 | 32 | ||
@@ -951,6 +952,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
951 | return -EFAULT; | 952 | return -EFAULT; |
952 | 953 | ||
953 | return ocfs2_info_handle(inode, &info, 0); | 954 | return ocfs2_info_handle(inode, &info, 0); |
955 | case OCFS2_IOC_MOVE_EXT: | ||
956 | return ocfs2_ioctl_move_extents(filp, (void __user *)arg); | ||
954 | default: | 957 | default: |
955 | return -ENOTTY; | 958 | return -ENOTTY; |
956 | } | 959 | } |
@@ -993,6 +996,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
993 | return -EFAULT; | 996 | return -EFAULT; |
994 | 997 | ||
995 | return ocfs2_info_handle(inode, &info, 1); | 998 | return ocfs2_info_handle(inode, &info, 1); |
999 | case OCFS2_IOC_MOVE_EXT: | ||
1000 | break; | ||
996 | default: | 1001 | default: |
997 | return -ENOIOCTLCMD; | 1002 | return -ENOIOCTLCMD; |
998 | } | 1003 | } |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 1c822e08fea0..800552168d8a 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -827,3 +827,311 @@ static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, | |||
827 | *len_defraged = 0; | 827 | *len_defraged = 0; |
828 | } | 828 | } |
829 | } | 829 | } |
830 | |||
831 | static int __ocfs2_move_extents_range(struct buffer_head *di_bh, | ||
832 | struct ocfs2_move_extents_context *context) | ||
833 | { | ||
834 | int ret = 0, flags, do_defrag, skip = 0; | ||
835 | u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; | ||
836 | u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; | ||
837 | |||
838 | struct inode *inode = context->inode; | ||
839 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
840 | struct ocfs2_move_extents *range = context->range; | ||
841 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
842 | |||
843 | if ((inode->i_size == 0) || (range->me_len == 0)) | ||
844 | return 0; | ||
845 | |||
846 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
847 | return 0; | ||
848 | |||
849 | context->refcount_loc = le64_to_cpu(di->i_refcount_loc); | ||
850 | |||
851 | ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); | ||
852 | ocfs2_init_dealloc_ctxt(&context->dealloc); | ||
853 | |||
854 | /* | ||
855 | * TO-DO XXX: | ||
856 | * | ||
857 | * - xattr extents. | ||
858 | */ | ||
859 | |||
860 | do_defrag = context->auto_defrag; | ||
861 | |||
862 | /* | ||
863 | * extents moving happens in unit of clusters, for the sake | ||
864 | * of simplicity, we may ignore two clusters where 'byte_start' | ||
865 | * and 'byte_start + len' were within. | ||
866 | */ | ||
867 | move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); | ||
868 | len_to_move = (range->me_start + range->me_len) >> | ||
869 | osb->s_clustersize_bits; | ||
870 | if (len_to_move >= move_start) | ||
871 | len_to_move -= move_start; | ||
872 | else | ||
873 | len_to_move = 0; | ||
874 | |||
875 | if (do_defrag) | ||
876 | defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; | ||
877 | else | ||
878 | new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
879 | range->me_goal); | ||
880 | |||
881 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " | ||
882 | "thresh: %u\n", | ||
883 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
884 | (unsigned long long)range->me_start, | ||
885 | (unsigned long long)range->me_len, | ||
886 | move_start, len_to_move, defrag_thresh); | ||
887 | |||
888 | cpos = move_start; | ||
889 | while (len_to_move) { | ||
890 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, | ||
891 | &flags); | ||
892 | if (ret) { | ||
893 | mlog_errno(ret); | ||
894 | goto out; | ||
895 | } | ||
896 | |||
897 | if (alloc_size > len_to_move) | ||
898 | alloc_size = len_to_move; | ||
899 | |||
900 | /* | ||
901 | * XXX: how to deal with a hole: | ||
902 | * | ||
903 | * - skip the hole of course | ||
904 | * - force a new defragmentation | ||
905 | */ | ||
906 | if (!phys_cpos) { | ||
907 | if (do_defrag) | ||
908 | len_defraged = 0; | ||
909 | |||
910 | goto next; | ||
911 | } | ||
912 | |||
913 | if (do_defrag) { | ||
914 | ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, | ||
915 | defrag_thresh, &skip); | ||
916 | /* | ||
917 | * skip large extents | ||
918 | */ | ||
919 | if (skip) { | ||
920 | skip = 0; | ||
921 | goto next; | ||
922 | } | ||
923 | |||
924 | mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " | ||
925 | "alloc_size: %u, len_defraged: %u\n", | ||
926 | cpos, phys_cpos, alloc_size, len_defraged); | ||
927 | |||
928 | ret = ocfs2_defrag_extent(context, cpos, phys_cpos, | ||
929 | alloc_size, flags); | ||
930 | } else { | ||
931 | ret = ocfs2_move_extent(context, cpos, phys_cpos, | ||
932 | &new_phys_cpos, alloc_size, | ||
933 | flags); | ||
934 | |||
935 | new_phys_cpos += alloc_size; | ||
936 | } | ||
937 | |||
938 | if (ret < 0) { | ||
939 | mlog_errno(ret); | ||
940 | goto out; | ||
941 | } | ||
942 | |||
943 | context->clusters_moved += alloc_size; | ||
944 | next: | ||
945 | cpos += alloc_size; | ||
946 | len_to_move -= alloc_size; | ||
947 | } | ||
948 | |||
949 | range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; | ||
950 | |||
951 | out: | ||
952 | range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, | ||
953 | context->clusters_moved); | ||
954 | range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, | ||
955 | context->new_phys_cpos); | ||
956 | |||
957 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
958 | ocfs2_run_deallocs(osb, &context->dealloc); | ||
959 | |||
960 | return ret; | ||
961 | } | ||
962 | |||
963 | static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) | ||
964 | { | ||
965 | int status; | ||
966 | handle_t *handle; | ||
967 | struct inode *inode = context->inode; | ||
968 | struct ocfs2_dinode *di; | ||
969 | struct buffer_head *di_bh = NULL; | ||
970 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
971 | |||
972 | if (!inode) | ||
973 | return -ENOENT; | ||
974 | |||
975 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | ||
976 | return -EROFS; | ||
977 | |||
978 | mutex_lock(&inode->i_mutex); | ||
979 | |||
980 | /* | ||
981 | * This prevents concurrent writes from other nodes | ||
982 | */ | ||
983 | status = ocfs2_rw_lock(inode, 1); | ||
984 | if (status) { | ||
985 | mlog_errno(status); | ||
986 | goto out; | ||
987 | } | ||
988 | |||
989 | status = ocfs2_inode_lock(inode, &di_bh, 1); | ||
990 | if (status) { | ||
991 | mlog_errno(status); | ||
992 | goto out_rw_unlock; | ||
993 | } | ||
994 | |||
995 | /* | ||
996 | * rememer ip_xattr_sem also needs to be held if necessary | ||
997 | */ | ||
998 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
999 | |||
1000 | status = __ocfs2_move_extents_range(di_bh, context); | ||
1001 | |||
1002 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1003 | if (status) { | ||
1004 | mlog_errno(status); | ||
1005 | goto out_inode_unlock; | ||
1006 | } | ||
1007 | |||
1008 | /* | ||
1009 | * We update ctime for these changes | ||
1010 | */ | ||
1011 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1012 | if (IS_ERR(handle)) { | ||
1013 | status = PTR_ERR(handle); | ||
1014 | mlog_errno(status); | ||
1015 | goto out_inode_unlock; | ||
1016 | } | ||
1017 | |||
1018 | status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, | ||
1019 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1020 | if (status) { | ||
1021 | mlog_errno(status); | ||
1022 | goto out_commit; | ||
1023 | } | ||
1024 | |||
1025 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1026 | inode->i_ctime = CURRENT_TIME; | ||
1027 | di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
1028 | di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
1029 | |||
1030 | ocfs2_journal_dirty(handle, di_bh); | ||
1031 | |||
1032 | out_commit: | ||
1033 | ocfs2_commit_trans(osb, handle); | ||
1034 | |||
1035 | out_inode_unlock: | ||
1036 | brelse(di_bh); | ||
1037 | ocfs2_inode_unlock(inode, 1); | ||
1038 | out_rw_unlock: | ||
1039 | ocfs2_rw_unlock(inode, 1); | ||
1040 | out: | ||
1041 | mutex_unlock(&inode->i_mutex); | ||
1042 | |||
1043 | return status; | ||
1044 | } | ||
1045 | |||
1046 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) | ||
1047 | { | ||
1048 | int status; | ||
1049 | |||
1050 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
1051 | struct ocfs2_move_extents range; | ||
1052 | struct ocfs2_move_extents_context *context = NULL; | ||
1053 | |||
1054 | status = mnt_want_write(filp->f_path.mnt); | ||
1055 | if (status) | ||
1056 | return status; | ||
1057 | |||
1058 | if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) | ||
1059 | goto out; | ||
1060 | |||
1061 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | ||
1062 | status = -EPERM; | ||
1063 | goto out; | ||
1064 | } | ||
1065 | |||
1066 | context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); | ||
1067 | if (!context) { | ||
1068 | status = -ENOMEM; | ||
1069 | mlog_errno(status); | ||
1070 | goto out; | ||
1071 | } | ||
1072 | |||
1073 | context->inode = inode; | ||
1074 | context->file = filp; | ||
1075 | |||
1076 | if (argp) { | ||
1077 | if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, | ||
1078 | sizeof(range))) { | ||
1079 | status = -EFAULT; | ||
1080 | goto out; | ||
1081 | } | ||
1082 | } else { | ||
1083 | status = -EINVAL; | ||
1084 | goto out; | ||
1085 | } | ||
1086 | |||
1087 | if (range.me_start > i_size_read(inode)) | ||
1088 | goto out; | ||
1089 | |||
1090 | if (range.me_start + range.me_len > i_size_read(inode)) | ||
1091 | range.me_len = i_size_read(inode) - range.me_start; | ||
1092 | |||
1093 | context->range = ⦥ | ||
1094 | |||
1095 | if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { | ||
1096 | context->auto_defrag = 1; | ||
1097 | if (!range.me_threshold) | ||
1098 | /* | ||
1099 | * ok, the default theshold for the defragmentation | ||
1100 | * is 1M, since our maximum clustersize was 1M also. | ||
1101 | * any thought? | ||
1102 | */ | ||
1103 | range.me_threshold = 1024 * 1024; | ||
1104 | } else { | ||
1105 | /* | ||
1106 | * first best-effort attempt to validate and adjust the goal | ||
1107 | * (physical address in block), while it can't guarantee later | ||
1108 | * operation can succeed all the time since global_bitmap may | ||
1109 | * change a bit over time. | ||
1110 | */ | ||
1111 | |||
1112 | status = ocfs2_validate_and_adjust_move_goal(inode, &range); | ||
1113 | if (status) | ||
1114 | goto out; | ||
1115 | } | ||
1116 | |||
1117 | status = ocfs2_move_extents(context); | ||
1118 | if (status) | ||
1119 | mlog_errno(status); | ||
1120 | out: | ||
1121 | /* | ||
1122 | * movement/defragmentation may end up being partially completed, | ||
1123 | * that's the reason why we need to return userspace the finished | ||
1124 | * length and new_offset even if failure happens somewhere. | ||
1125 | */ | ||
1126 | if (argp) { | ||
1127 | if (copy_to_user((struct ocfs2_move_extents *)argp, &range, | ||
1128 | sizeof(range))) | ||
1129 | status = -EFAULT; | ||
1130 | } | ||
1131 | |||
1132 | kfree(context); | ||
1133 | |||
1134 | mnt_drop_write(filp->f_path.mnt); | ||
1135 | |||
1136 | return status; | ||
1137 | } | ||
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h index 27570f7f6909..4e143e811441 100644 --- a/fs/ocfs2/move_extents.h +++ b/fs/ocfs2/move_extents.h | |||
@@ -17,4 +17,6 @@ | |||
17 | #ifndef OCFS2_MOVE_EXTENTS_H | 17 | #ifndef OCFS2_MOVE_EXTENTS_H |
18 | #define OCFS2_MOVE_EXTENTS_H | 18 | #define OCFS2_MOVE_EXTENTS_H |
19 | 19 | ||
20 | int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp); | ||
21 | |||
20 | #endif /* OCFS2_MOVE_EXTENTS_H */ | 22 | #endif /* OCFS2_MOVE_EXTENTS_H */ |