aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fuse/dir.c
diff options
context:
space:
mode:
authorMiklos Szeredi <mszeredi@suse.cz>2008-04-30 03:54:41 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-30 11:29:50 -0400
commit3be5a52b30aa5cf9d795b7634f728f612197b1c4 (patch)
tree5a78251a351e273cf2061a527a381c7ba256fc15 /fs/fuse/dir.c
parentb88473f73e6d7b6af9cfc4ecc349d82c75d9a6af (diff)
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fuse/dir.c')
-rw-r--r--fs/fuse/dir.c84
1 files changed, 81 insertions, 3 deletions
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c4807b3fc8a3..48b9971ecd97 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1107} 1107}
1108 1108
1109/* 1109/*
1110 * Prevent concurrent writepages on inode
1111 *
1112 * This is done by adding a negative bias to the inode write counter
1113 * and waiting for all pending writes to finish.
1114 */
1115void fuse_set_nowrite(struct inode *inode)
1116{
1117 struct fuse_conn *fc = get_fuse_conn(inode);
1118 struct fuse_inode *fi = get_fuse_inode(inode);
1119
1120 BUG_ON(!mutex_is_locked(&inode->i_mutex));
1121
1122 spin_lock(&fc->lock);
1123 BUG_ON(fi->writectr < 0);
1124 fi->writectr += FUSE_NOWRITE;
1125 spin_unlock(&fc->lock);
1126 wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
1127}
1128
1129/*
1130 * Allow writepages on inode
1131 *
1132 * Remove the bias from the writecounter and send any queued
1133 * writepages.
1134 */
1135static void __fuse_release_nowrite(struct inode *inode)
1136{
1137 struct fuse_inode *fi = get_fuse_inode(inode);
1138
1139 BUG_ON(fi->writectr != FUSE_NOWRITE);
1140 fi->writectr = 0;
1141 fuse_flush_writepages(inode);
1142}
1143
1144void fuse_release_nowrite(struct inode *inode)
1145{
1146 struct fuse_conn *fc = get_fuse_conn(inode);
1147
1148 spin_lock(&fc->lock);
1149 __fuse_release_nowrite(inode);
1150 spin_unlock(&fc->lock);
1151}
1152
1153/*
1110 * Set attributes, and at the same time refresh them. 1154 * Set attributes, and at the same time refresh them.
1111 * 1155 *
1112 * Truncation is slightly complicated, because the 'truncate' request 1156 * Truncation is slightly complicated, because the 'truncate' request
@@ -1122,6 +1166,8 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1122 struct fuse_req *req; 1166 struct fuse_req *req;
1123 struct fuse_setattr_in inarg; 1167 struct fuse_setattr_in inarg;
1124 struct fuse_attr_out outarg; 1168 struct fuse_attr_out outarg;
1169 bool is_truncate = false;
1170 loff_t oldsize;
1125 int err; 1171 int err;
1126 1172
1127 if (!fuse_allow_task(fc, current)) 1173 if (!fuse_allow_task(fc, current))
@@ -1145,12 +1191,16 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1145 send_sig(SIGXFSZ, current, 0); 1191 send_sig(SIGXFSZ, current, 0);
1146 return -EFBIG; 1192 return -EFBIG;
1147 } 1193 }
1194 is_truncate = true;
1148 } 1195 }
1149 1196
1150 req = fuse_get_req(fc); 1197 req = fuse_get_req(fc);
1151 if (IS_ERR(req)) 1198 if (IS_ERR(req))
1152 return PTR_ERR(req); 1199 return PTR_ERR(req);
1153 1200
1201 if (is_truncate)
1202 fuse_set_nowrite(inode);
1203
1154 memset(&inarg, 0, sizeof(inarg)); 1204 memset(&inarg, 0, sizeof(inarg));
1155 memset(&outarg, 0, sizeof(outarg)); 1205 memset(&outarg, 0, sizeof(outarg));
1156 iattr_to_fattr(attr, &inarg); 1206 iattr_to_fattr(attr, &inarg);
@@ -1181,16 +1231,44 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1181 if (err) { 1231 if (err) {
1182 if (err == -EINTR) 1232 if (err == -EINTR)
1183 fuse_invalidate_attr(inode); 1233 fuse_invalidate_attr(inode);
1184 return err; 1234 goto error;
1185 } 1235 }
1186 1236
1187 if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { 1237 if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
1188 make_bad_inode(inode); 1238 make_bad_inode(inode);
1189 return -EIO; 1239 err = -EIO;
1240 goto error;
1241 }
1242
1243 spin_lock(&fc->lock);
1244 fuse_change_attributes_common(inode, &outarg.attr,
1245 attr_timeout(&outarg));
1246 oldsize = inode->i_size;
1247 i_size_write(inode, outarg.attr.size);
1248
1249 if (is_truncate) {
1250 /* NOTE: this may release/reacquire fc->lock */
1251 __fuse_release_nowrite(inode);
1252 }
1253 spin_unlock(&fc->lock);
1254
1255 /*
1256 * Only call invalidate_inode_pages2() after removing
1257 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1258 */
1259 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1260 if (outarg.attr.size < oldsize)
1261 fuse_truncate(inode->i_mapping, outarg.attr.size);
1262 invalidate_inode_pages2(inode->i_mapping);
1190 } 1263 }
1191 1264
1192 fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), 0);
1193 return 0; 1265 return 0;
1266
1267error:
1268 if (is_truncate)
1269 fuse_release_nowrite(inode);
1270
1271 return err;
1194} 1272}
1195 1273
1196static int fuse_setattr(struct dentry *entry, struct iattr *attr) 1274static int fuse_setattr(struct dentry *entry, struct iattr *attr)