aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJosef Bacik <josef@redhat.com>2011-01-25 14:57:24 -0500
committerJosef Bacik <josef@redhat.com>2011-03-17 14:21:15 -0400
commitd0215f3e5ebb5803cd6ec067b10c5e00a3ad7cfc (patch)
tree790070af3c6c05b0599e26c7ce93516192a04a94 /fs
parent9f570b8d48b6677b5557d86fb3ca148215e295f2 (diff)
Btrfs: simplify our write path
Our aio_write function is huge and kind of hard to follow at times. So this patch fixes this by breaking the buffered and direct write paths out into separate functions so it's a little clearer what's going on. I've also fixed some wrong typing that we had and added the ability to handle getting an error back from btrfs_set_extent_delalloc. Tested this with xfstests and everything came out fine. Thanks, Signed-off-by: Josef Bacik <josef@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/file.c355
1 files changed, 180 insertions, 175 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4d4975592668..f2a80e570a6c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -45,14 +45,14 @@
45 * and be replaced with calls into generic code. 45 * and be replaced with calls into generic code.
46 */ 46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes, 48 size_t write_bytes,
49 struct page **prepared_pages, 49 struct page **prepared_pages,
50 struct iov_iter *i) 50 struct iov_iter *i)
51{ 51{
52 size_t copied = 0; 52 size_t copied = 0;
53 size_t total_copied = 0;
53 int pg = 0; 54 int pg = 0;
54 int offset = pos & (PAGE_CACHE_SIZE - 1); 55 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
56 56
57 while (write_bytes > 0) { 57 while (write_bytes > 0) {
58 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
@@ -129,13 +129,12 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
129 * this also makes the decision about creating an inline extent vs 129 * this also makes the decision about creating an inline extent vs
130 * doing real data extents, marking pages dirty and delalloc as required. 130 * doing real data extents, marking pages dirty and delalloc as required.
131 */ 131 */
132static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 132static noinline int dirty_and_release_pages(struct btrfs_root *root,
133 struct btrfs_root *root, 133 struct file *file,
134 struct file *file, 134 struct page **pages,
135 struct page **pages, 135 size_t num_pages,
136 size_t num_pages, 136 loff_t pos,
137 loff_t pos, 137 size_t write_bytes)
138 size_t write_bytes)
139{ 138{
140 int err = 0; 139 int err = 0;
141 int i; 140 int i;
@@ -153,7 +152,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
153 end_of_last_block = start_pos + num_bytes - 1; 152 end_of_last_block = start_pos + num_bytes - 1;
154 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 153 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
155 NULL); 154 NULL);
156 BUG_ON(err); 155 if (err)
156 return err;
157 157
158 for (i = 0; i < num_pages; i++) { 158 for (i = 0; i < num_pages; i++) {
159 struct page *p = pages[i]; 159 struct page *p = pages[i];
@@ -896,127 +896,38 @@ fail:
896 896
897} 897}
898 898
899static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 899static noinline ssize_t __btrfs_buffered_write(struct file *file,
900 const struct iovec *iov, 900 struct iov_iter *i,
901 unsigned long nr_segs, loff_t pos) 901 loff_t pos)
902{ 902{
903 struct file *file = iocb->ki_filp;
904 struct inode *inode = fdentry(file)->d_inode; 903 struct inode *inode = fdentry(file)->d_inode;
905 struct btrfs_root *root = BTRFS_I(inode)->root; 904 struct btrfs_root *root = BTRFS_I(inode)->root;
906 struct page **pages = NULL; 905 struct page **pages = NULL;
907 struct iov_iter i;
908 loff_t *ppos = &iocb->ki_pos;
909 loff_t start_pos;
910 ssize_t num_written = 0;
911 ssize_t err = 0;
912 size_t count;
913 size_t ocount;
914 int ret = 0;
915 int nrptrs;
916 unsigned long first_index; 906 unsigned long first_index;
917 unsigned long last_index; 907 unsigned long last_index;
918 int will_write; 908 size_t num_written = 0;
919 int buffered = 0; 909 int nrptrs;
920 int copied = 0; 910 int ret;
921 int dirty_pages = 0;
922
923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
924 (file->f_flags & O_DIRECT));
925
926 start_pos = pos;
927
928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
929
930 mutex_lock(&inode->i_mutex);
931
932 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
933 if (err)
934 goto out;
935 count = ocount;
936
937 current->backing_dev_info = inode->i_mapping->backing_dev_info;
938 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
939 if (err)
940 goto out;
941
942 if (count == 0)
943 goto out;
944
945 err = file_remove_suid(file);
946 if (err)
947 goto out;
948
949 /*
950 * If BTRFS flips readonly due to some impossible error
951 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
952 * although we have opened a file as writable, we have
953 * to stop this write operation to ensure FS consistency.
954 */
955 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
956 err = -EROFS;
957 goto out;
958 }
959
960 file_update_time(file);
961 BTRFS_I(inode)->sequence++;
962
963 if (unlikely(file->f_flags & O_DIRECT)) {
964 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
965 pos, ppos, count,
966 ocount);
967 /*
968 * the generic O_DIRECT will update in-memory i_size after the
969 * DIOs are done. But our endio handlers that update the on
970 * disk i_size never update past the in memory i_size. So we
971 * need one more update here to catch any additions to the
972 * file
973 */
974 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
975 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
976 mark_inode_dirty(inode);
977 }
978
979 if (num_written < 0) {
980 ret = num_written;
981 num_written = 0;
982 goto out;
983 } else if (num_written == count) {
984 /* pick up pos changes done by the generic code */
985 pos = *ppos;
986 goto out;
987 }
988 /*
989 * We are going to do buffered for the rest of the range, so we
990 * need to make sure to invalidate the buffered pages when we're
991 * done.
992 */
993 buffered = 1;
994 pos += num_written;
995 }
996 911
997 iov_iter_init(&i, iov, nr_segs, count, num_written); 912 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
998 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
999 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 913 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1000 (sizeof(struct page *))); 914 (sizeof(struct page *)));
1001 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 915 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1002 if (!pages) { 916 if (!pages)
1003 ret = -ENOMEM; 917 return -ENOMEM;
1004 goto out;
1005 }
1006
1007 /* generic_write_checks can change our pos */
1008 start_pos = pos;
1009 918
1010 first_index = pos >> PAGE_CACHE_SHIFT; 919 first_index = pos >> PAGE_CACHE_SHIFT;
1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 920 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
1012 921
1013 while (iov_iter_count(&i) > 0) { 922 while (iov_iter_count(i) > 0) {
1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 923 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1015 size_t write_bytes = min(iov_iter_count(&i), 924 size_t write_bytes = min(iov_iter_count(i),
1016 nrptrs * (size_t)PAGE_CACHE_SIZE - 925 nrptrs * (size_t)PAGE_CACHE_SIZE -
1017 offset); 926 offset);
1018 size_t num_pages = (write_bytes + offset + 927 size_t num_pages = (write_bytes + offset +
1019 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 928 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
929 size_t dirty_pages;
930 size_t copied;
1020 931
1021 WARN_ON(num_pages > nrptrs); 932 WARN_ON(num_pages > nrptrs);
1022 memset(pages, 0, sizeof(struct page *) * nrptrs); 933 memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1025,15 +936,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1025 * Fault pages before locking them in prepare_pages 936 * Fault pages before locking them in prepare_pages
1026 * to avoid recursive lock 937 * to avoid recursive lock
1027 */ 938 */
1028 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { 939 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1029 ret = -EFAULT; 940 ret = -EFAULT;
1030 goto out; 941 break;
1031 } 942 }
1032 943
1033 ret = btrfs_delalloc_reserve_space(inode, 944 ret = btrfs_delalloc_reserve_space(inode,
1034 num_pages << PAGE_CACHE_SHIFT); 945 num_pages << PAGE_CACHE_SHIFT);
1035 if (ret) 946 if (ret)
1036 goto out; 947 break;
1037 948
1038 ret = prepare_pages(root, file, pages, num_pages, 949 ret = prepare_pages(root, file, pages, num_pages,
1039 pos, first_index, last_index, 950 pos, first_index, last_index,
@@ -1041,11 +952,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1041 if (ret) { 952 if (ret) {
1042 btrfs_delalloc_release_space(inode, 953 btrfs_delalloc_release_space(inode,
1043 num_pages << PAGE_CACHE_SHIFT); 954 num_pages << PAGE_CACHE_SHIFT);
1044 goto out; 955 break;
1045 } 956 }
1046 957
1047 copied = btrfs_copy_from_user(pos, num_pages, 958 copied = btrfs_copy_from_user(pos, num_pages,
1048 write_bytes, pages, &i); 959 write_bytes, pages, i);
1049 960
1050 /* 961 /*
1051 * if we have trouble faulting in the pages, fall 962 * if we have trouble faulting in the pages, fall
@@ -1061,6 +972,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1061 PAGE_CACHE_SIZE - 1) >> 972 PAGE_CACHE_SIZE - 1) >>
1062 PAGE_CACHE_SHIFT; 973 PAGE_CACHE_SHIFT;
1063 974
975 /*
976 * If we had a short copy we need to release the excess delaloc
977 * bytes we reserved. We need to increment outstanding_extents
978 * because btrfs_delalloc_release_space will decrement it, but
979 * we still have an outstanding extent for the chunk we actually
980 * managed to copy.
981 */
1064 if (num_pages > dirty_pages) { 982 if (num_pages > dirty_pages) {
1065 if (copied > 0) 983 if (copied > 0)
1066 atomic_inc( 984 atomic_inc(
@@ -1071,39 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1071 } 989 }
1072 990
1073 if (copied > 0) { 991 if (copied > 0) {
1074 dirty_and_release_pages(NULL, root, file, pages, 992 ret = dirty_and_release_pages(root, file, pages,
1075 dirty_pages, pos, copied); 993 dirty_pages, pos,
994 copied);
995 if (ret) {
996 btrfs_delalloc_release_space(inode,
997 dirty_pages << PAGE_CACHE_SHIFT);
998 btrfs_drop_pages(pages, num_pages);
999 break;
1000 }
1076 } 1001 }
1077 1002
1078 btrfs_drop_pages(pages, num_pages); 1003 btrfs_drop_pages(pages, num_pages);
1079 1004
1080 if (copied > 0) { 1005 cond_resched();
1081 if (will_write) { 1006
1082 filemap_fdatawrite_range(inode->i_mapping, pos, 1007 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1083 pos + copied - 1); 1008 dirty_pages);
1084 } else { 1009 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1085 balance_dirty_pages_ratelimited_nr( 1010 btrfs_btree_balance_dirty(root, 1);
1086 inode->i_mapping, 1011 btrfs_throttle(root);
1087 dirty_pages);
1088 if (dirty_pages <
1089 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1090 btrfs_btree_balance_dirty(root, 1);
1091 btrfs_throttle(root);
1092 }
1093 }
1094 1012
1095 pos += copied; 1013 pos += copied;
1096 num_written += copied; 1014 num_written += copied;
1015 }
1097 1016
1098 cond_resched(); 1017 kfree(pages);
1018
1019 return num_written ? num_written : ret;
1020}
1021
1022static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1023 const struct iovec *iov,
1024 unsigned long nr_segs, loff_t pos,
1025 loff_t *ppos, size_t count, size_t ocount)
1026{
1027 struct file *file = iocb->ki_filp;
1028 struct inode *inode = fdentry(file)->d_inode;
1029 struct iov_iter i;
1030 ssize_t written;
1031 ssize_t written_buffered;
1032 loff_t endbyte;
1033 int err;
1034
1035 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1036 count, ocount);
1037
1038 /*
1039 * the generic O_DIRECT will update in-memory i_size after the
1040 * DIOs are done. But our endio handlers that update the on
1041 * disk i_size never update past the in memory i_size. So we
1042 * need one more update here to catch any additions to the
1043 * file
1044 */
1045 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
1046 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
1047 mark_inode_dirty(inode);
1099 } 1048 }
1049
1050 if (written < 0 || written == count)
1051 return written;
1052
1053 pos += written;
1054 count -= written;
1055 iov_iter_init(&i, iov, nr_segs, count, written);
1056 written_buffered = __btrfs_buffered_write(file, &i, pos);
1057 if (written_buffered < 0) {
1058 err = written_buffered;
1059 goto out;
1060 }
1061 endbyte = pos + written_buffered - 1;
1062 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1063 if (err)
1064 goto out;
1065 written += written_buffered;
1066 *ppos = pos + written_buffered;
1067 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1068 endbyte >> PAGE_CACHE_SHIFT);
1100out: 1069out:
1101 mutex_unlock(&inode->i_mutex); 1070 return written ? written : err;
1102 if (ret) 1071}
1103 err = ret;
1104 1072
1105 kfree(pages); 1073static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1106 *ppos = pos; 1074 const struct iovec *iov,
1075 unsigned long nr_segs, loff_t pos)
1076{
1077 struct file *file = iocb->ki_filp;
1078 struct inode *inode = fdentry(file)->d_inode;
1079 struct btrfs_root *root = BTRFS_I(inode)->root;
1080 loff_t *ppos = &iocb->ki_pos;
1081 ssize_t num_written = 0;
1082 ssize_t err = 0;
1083 size_t count, ocount;
1084
1085 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1086
1087 mutex_lock(&inode->i_mutex);
1088
1089 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1090 if (err) {
1091 mutex_unlock(&inode->i_mutex);
1092 goto out;
1093 }
1094 count = ocount;
1095
1096 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1098 if (err) {
1099 mutex_unlock(&inode->i_mutex);
1100 goto out;
1101 }
1102
1103 if (count == 0) {
1104 mutex_unlock(&inode->i_mutex);
1105 goto out;
1106 }
1107
1108 err = file_remove_suid(file);
1109 if (err) {
1110 mutex_unlock(&inode->i_mutex);
1111 goto out;
1112 }
1113
1114 /*
1115 * If BTRFS flips readonly due to some impossible error
1116 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1117 * although we have opened a file as writable, we have
1118 * to stop this write operation to ensure FS consistency.
1119 */
1120 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1121 mutex_unlock(&inode->i_mutex);
1122 err = -EROFS;
1123 goto out;
1124 }
1125
1126 file_update_time(file);
1127 BTRFS_I(inode)->sequence++;
1128
1129 if (unlikely(file->f_flags & O_DIRECT)) {
1130 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1131 pos, ppos, count, ocount);
1132 } else {
1133 struct iov_iter i;
1134
1135 iov_iter_init(&i, iov, nr_segs, count, num_written);
1136
1137 num_written = __btrfs_buffered_write(file, &i, pos);
1138 if (num_written > 0)
1139 *ppos = pos + num_written;
1140 }
1141
1142 mutex_unlock(&inode->i_mutex);
1107 1143
1108 /* 1144 /*
1109 * we want to make sure fsync finds this change 1145 * we want to make sure fsync finds this change
@@ -1118,43 +1154,12 @@ out:
1118 * one running right now. 1154 * one running right now.
1119 */ 1155 */
1120 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1156 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1121 1157 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1122 if (num_written > 0 && will_write) { 1158 err = generic_write_sync(file, pos, num_written);
1123 struct btrfs_trans_handle *trans; 1159 if (err < 0 && num_written > 0)
1124
1125 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1126 if (err)
1127 num_written = err; 1160 num_written = err;
1128
1129 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1130 trans = btrfs_start_transaction(root, 0);
1131 if (IS_ERR(trans)) {
1132 num_written = PTR_ERR(trans);
1133 goto done;
1134 }
1135 mutex_lock(&inode->i_mutex);
1136 ret = btrfs_log_dentry_safe(trans, root,
1137 file->f_dentry);
1138 mutex_unlock(&inode->i_mutex);
1139 if (ret == 0) {
1140 ret = btrfs_sync_log(trans, root);
1141 if (ret == 0)
1142 btrfs_end_transaction(trans, root);
1143 else
1144 btrfs_commit_transaction(trans, root);
1145 } else if (ret != BTRFS_NO_LOG_SYNC) {
1146 btrfs_commit_transaction(trans, root);
1147 } else {
1148 btrfs_end_transaction(trans, root);
1149 }
1150 }
1151 if (file->f_flags & O_DIRECT && buffered) {
1152 invalidate_mapping_pages(inode->i_mapping,
1153 start_pos >> PAGE_CACHE_SHIFT,
1154 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1155 }
1156 } 1161 }
1157done: 1162out:
1158 current->backing_dev_info = NULL; 1163 current->backing_dev_info = NULL;
1159 return num_written ? num_written : err; 1164 return num_written ? num_written : err;
1160} 1165}