aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fuse/file.c
diff options
context:
space:
mode:
authorMiklos Szeredi <mszeredi@suse.cz>2008-04-30 03:54:41 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-30 11:29:50 -0400
commit3be5a52b30aa5cf9d795b7634f728f612197b1c4 (patch)
tree5a78251a351e273cf2061a527a381c7ba256fc15 /fs/fuse/file.c
parentb88473f73e6d7b6af9cfc4ecc349d82c75d9a6af (diff)
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. 
It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safety measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fuse/file.c')
-rw-r--r--fs/fuse/file.c321
1 files changed, 305 insertions, 16 deletions
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 676b0bc8a86d..68051f3bdf91 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -210,6 +210,49 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
210 return (u64) v0 + ((u64) v1 << 32); 210 return (u64) v0 + ((u64) v1 << 32);
211} 211}
212 212
213/*
214 * Check if page is under writeback
215 *
216 * This is currently done by walking the list of writepage requests
217 * for the inode, which can be pretty inefficient.
218 */
219static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
220{
221 struct fuse_conn *fc = get_fuse_conn(inode);
222 struct fuse_inode *fi = get_fuse_inode(inode);
223 struct fuse_req *req;
224 bool found = false;
225
226 spin_lock(&fc->lock);
227 list_for_each_entry(req, &fi->writepages, writepages_entry) {
228 pgoff_t curr_index;
229
230 BUG_ON(req->inode != inode);
231 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
232 if (curr_index == index) {
233 found = true;
234 break;
235 }
236 }
237 spin_unlock(&fc->lock);
238
239 return found;
240}
241
242/*
243 * Wait for page writeback to be completed.
244 *
245 * Since fuse doesn't rely on the VM writeback tracking, this has to
246 * use some other means.
247 */
248static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
249{
250 struct fuse_inode *fi = get_fuse_inode(inode);
251
252 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
253 return 0;
254}
255
213static int fuse_flush(struct file *file, fl_owner_t id) 256static int fuse_flush(struct file *file, fl_owner_t id)
214{ 257{
215 struct inode *inode = file->f_path.dentry->d_inode; 258 struct inode *inode = file->f_path.dentry->d_inode;
@@ -245,6 +288,21 @@ static int fuse_flush(struct file *file, fl_owner_t id)
245 return err; 288 return err;
246} 289}
247 290
291/*
292 * Wait for all pending writepages on the inode to finish.
293 *
294 * This is currently done by blocking further writes with FUSE_NOWRITE
295 * and waiting for all sent writes to complete.
296 *
297 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
298 * could conflict with truncation.
299 */
300static void fuse_sync_writes(struct inode *inode)
301{
302 fuse_set_nowrite(inode);
303 fuse_release_nowrite(inode);
304}
305
248int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 306int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
249 int isdir) 307 int isdir)
250{ 308{
@@ -261,6 +319,17 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
261 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) 319 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
262 return 0; 320 return 0;
263 321
322 /*
323 * Start writeback against all dirty pages of the inode, then
324 * wait for all outstanding writes, before sending the FSYNC
325 * request.
326 */
327 err = write_inode_now(inode, 0);
328 if (err)
329 return err;
330
331 fuse_sync_writes(inode);
332
264 req = fuse_get_req(fc); 333 req = fuse_get_req(fc);
265 if (IS_ERR(req)) 334 if (IS_ERR(req))
266 return PTR_ERR(req); 335 return PTR_ERR(req);
@@ -340,6 +409,13 @@ static int fuse_readpage(struct file *file, struct page *page)
340 if (is_bad_inode(inode)) 409 if (is_bad_inode(inode))
341 goto out; 410 goto out;
342 411
412 /*
413 * Page writeback can extend beyond the lifetime of the
414 * page-cache page, so make sure we read a properly synced
415 * page.
416 */
417 fuse_wait_on_page_writeback(inode, page->index);
418
343 req = fuse_get_req(fc); 419 req = fuse_get_req(fc);
344 err = PTR_ERR(req); 420 err = PTR_ERR(req);
345 if (IS_ERR(req)) 421 if (IS_ERR(req))
@@ -411,6 +487,8 @@ static int fuse_readpages_fill(void *_data, struct page *page)
411 struct inode *inode = data->inode; 487 struct inode *inode = data->inode;
412 struct fuse_conn *fc = get_fuse_conn(inode); 488 struct fuse_conn *fc = get_fuse_conn(inode);
413 489
490 fuse_wait_on_page_writeback(inode, page->index);
491
414 if (req->num_pages && 492 if (req->num_pages &&
415 (req->num_pages == FUSE_MAX_PAGES_PER_REQ || 493 (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
416 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || 494 (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
@@ -477,11 +555,10 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
477} 555}
478 556
479static void fuse_write_fill(struct fuse_req *req, struct file *file, 557static void fuse_write_fill(struct fuse_req *req, struct file *file,
480 struct inode *inode, loff_t pos, size_t count, 558 struct fuse_file *ff, struct inode *inode,
481 int writepage) 559 loff_t pos, size_t count, int writepage)
482{ 560{
483 struct fuse_conn *fc = get_fuse_conn(inode); 561 struct fuse_conn *fc = get_fuse_conn(inode);
484 struct fuse_file *ff = file->private_data;
485 struct fuse_write_in *inarg = &req->misc.write.in; 562 struct fuse_write_in *inarg = &req->misc.write.in;
486 struct fuse_write_out *outarg = &req->misc.write.out; 563 struct fuse_write_out *outarg = &req->misc.write.out;
487 564
@@ -490,7 +567,7 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
490 inarg->offset = pos; 567 inarg->offset = pos;
491 inarg->size = count; 568 inarg->size = count;
492 inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0; 569 inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
493 inarg->flags = file->f_flags; 570 inarg->flags = file ? file->f_flags : 0;
494 req->in.h.opcode = FUSE_WRITE; 571 req->in.h.opcode = FUSE_WRITE;
495 req->in.h.nodeid = get_node_id(inode); 572 req->in.h.nodeid = get_node_id(inode);
496 req->in.argpages = 1; 573 req->in.argpages = 1;
@@ -511,7 +588,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
511 fl_owner_t owner) 588 fl_owner_t owner)
512{ 589{
513 struct fuse_conn *fc = get_fuse_conn(inode); 590 struct fuse_conn *fc = get_fuse_conn(inode);
514 fuse_write_fill(req, file, inode, pos, count, 0); 591 fuse_write_fill(req, file, file->private_data, inode, pos, count, 0);
515 if (owner != NULL) { 592 if (owner != NULL) {
516 struct fuse_write_in *inarg = &req->misc.write.in; 593 struct fuse_write_in *inarg = &req->misc.write.in;
517 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 594 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
@@ -546,6 +623,12 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
546 if (is_bad_inode(inode)) 623 if (is_bad_inode(inode))
547 return -EIO; 624 return -EIO;
548 625
626 /*
627 * Make sure writepages on the same page are not mixed up with
628 * plain writes.
629 */
630 fuse_wait_on_page_writeback(inode, page->index);
631
549 req = fuse_get_req(fc); 632 req = fuse_get_req(fc);
550 if (IS_ERR(req)) 633 if (IS_ERR(req))
551 return PTR_ERR(req); 634 return PTR_ERR(req);
@@ -716,21 +799,225 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
716 return res; 799 return res;
717} 800}
718 801
719static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 802static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
720{ 803{
721 if ((vma->vm_flags & VM_SHARED)) { 804 __free_page(req->pages[0]);
722 if ((vma->vm_flags & VM_WRITE)) 805 fuse_file_put(req->ff);
723 return -ENODEV; 806 fuse_put_request(fc, req);
724 else 807}
725 vma->vm_flags &= ~VM_MAYWRITE; 808
809static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
810{
811 struct inode *inode = req->inode;
812 struct fuse_inode *fi = get_fuse_inode(inode);
813 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
814
815 list_del(&req->writepages_entry);
816 dec_bdi_stat(bdi, BDI_WRITEBACK);
817 dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
818 bdi_writeout_inc(bdi);
819 wake_up(&fi->page_waitq);
820}
821
822/* Called under fc->lock, may release and reacquire it */
823static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
824{
825 struct fuse_inode *fi = get_fuse_inode(req->inode);
826 loff_t size = i_size_read(req->inode);
827 struct fuse_write_in *inarg = &req->misc.write.in;
828
829 if (!fc->connected)
830 goto out_free;
831
832 if (inarg->offset + PAGE_CACHE_SIZE <= size) {
833 inarg->size = PAGE_CACHE_SIZE;
834 } else if (inarg->offset < size) {
835 inarg->size = size & (PAGE_CACHE_SIZE - 1);
836 } else {
837 /* Got truncated off completely */
838 goto out_free;
726 } 839 }
727 return generic_file_mmap(file, vma); 840
841 req->in.args[1].size = inarg->size;
842 fi->writectr++;
843 request_send_background_locked(fc, req);
844 return;
845
846 out_free:
847 fuse_writepage_finish(fc, req);
848 spin_unlock(&fc->lock);
849 fuse_writepage_free(fc, req);
850 spin_lock(&fc->lock);
728} 851}
729 852
730static int fuse_set_page_dirty(struct page *page) 853/*
854 * If fi->writectr is positive (no truncate or fsync going on) send
855 * all queued writepage requests.
856 *
857 * Called with fc->lock
858 */
859void fuse_flush_writepages(struct inode *inode)
731{ 860{
732 printk("fuse_set_page_dirty: should not happen\n"); 861 struct fuse_conn *fc = get_fuse_conn(inode);
733 dump_stack(); 862 struct fuse_inode *fi = get_fuse_inode(inode);
863 struct fuse_req *req;
864
865 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
866 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
867 list_del_init(&req->list);
868 fuse_send_writepage(fc, req);
869 }
870}
871
872static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
873{
874 struct inode *inode = req->inode;
875 struct fuse_inode *fi = get_fuse_inode(inode);
876
877 mapping_set_error(inode->i_mapping, req->out.h.error);
878 spin_lock(&fc->lock);
879 fi->writectr--;
880 fuse_writepage_finish(fc, req);
881 spin_unlock(&fc->lock);
882 fuse_writepage_free(fc, req);
883}
884
885static int fuse_writepage_locked(struct page *page)
886{
887 struct address_space *mapping = page->mapping;
888 struct inode *inode = mapping->host;
889 struct fuse_conn *fc = get_fuse_conn(inode);
890 struct fuse_inode *fi = get_fuse_inode(inode);
891 struct fuse_req *req;
892 struct fuse_file *ff;
893 struct page *tmp_page;
894
895 set_page_writeback(page);
896
897 req = fuse_request_alloc_nofs();
898 if (!req)
899 goto err;
900
901 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
902 if (!tmp_page)
903 goto err_free;
904
905 spin_lock(&fc->lock);
906 BUG_ON(list_empty(&fi->write_files));
907 ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
908 req->ff = fuse_file_get(ff);
909 spin_unlock(&fc->lock);
910
911 fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
912
913 copy_highpage(tmp_page, page);
914 req->num_pages = 1;
915 req->pages[0] = tmp_page;
916 req->page_offset = 0;
917 req->end = fuse_writepage_end;
918 req->inode = inode;
919
920 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
921 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
922 end_page_writeback(page);
923
924 spin_lock(&fc->lock);
925 list_add(&req->writepages_entry, &fi->writepages);
926 list_add_tail(&req->list, &fi->queued_writes);
927 fuse_flush_writepages(inode);
928 spin_unlock(&fc->lock);
929
930 return 0;
931
932err_free:
933 fuse_request_free(req);
934err:
935 end_page_writeback(page);
936 return -ENOMEM;
937}
938
939static int fuse_writepage(struct page *page, struct writeback_control *wbc)
940{
941 int err;
942
943 err = fuse_writepage_locked(page);
944 unlock_page(page);
945
946 return err;
947}
948
949static int fuse_launder_page(struct page *page)
950{
951 int err = 0;
952 if (clear_page_dirty_for_io(page)) {
953 struct inode *inode = page->mapping->host;
954 err = fuse_writepage_locked(page);
955 if (!err)
956 fuse_wait_on_page_writeback(inode, page->index);
957 }
958 return err;
959}
960
961/*
962 * Write back dirty pages now, because there may not be any suitable
963 * open files later
964 */
965static void fuse_vma_close(struct vm_area_struct *vma)
966{
967 filemap_write_and_wait(vma->vm_file->f_mapping);
968}
969
970/*
971 * Wait for writeback against this page to complete before allowing it
972 * to be marked dirty again, and hence written back again, possibly
973 * before the previous writepage completed.
974 *
975 * Block here, instead of in ->writepage(), so that the userspace fs
976 * can only block processes actually operating on the filesystem.
977 *
978 * Otherwise unprivileged userspace fs would be able to block
979 * unrelated:
980 *
981 * - page migration
982 * - sync(2)
983 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
984 */
985static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
986{
987 /*
988 * Don't use page->mapping as it may become NULL from a
989 * concurrent truncate.
990 */
991 struct inode *inode = vma->vm_file->f_mapping->host;
992
993 fuse_wait_on_page_writeback(inode, page->index);
994 return 0;
995}
996
997static struct vm_operations_struct fuse_file_vm_ops = {
998 .close = fuse_vma_close,
999 .fault = filemap_fault,
1000 .page_mkwrite = fuse_page_mkwrite,
1001};
1002
1003static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1004{
1005 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1006 struct inode *inode = file->f_dentry->d_inode;
1007 struct fuse_conn *fc = get_fuse_conn(inode);
1008 struct fuse_inode *fi = get_fuse_inode(inode);
1009 struct fuse_file *ff = file->private_data;
1010 /*
1011 * file may be written through mmap, so chain it onto the
1012 * inode's write_files list
1013 */
1014 spin_lock(&fc->lock);
1015 if (list_empty(&ff->write_entry))
1016 list_add(&ff->write_entry, &fi->write_files);
1017 spin_unlock(&fc->lock);
1018 }
1019 file_accessed(file);
1020 vma->vm_ops = &fuse_file_vm_ops;
734 return 0; 1021 return 0;
735} 1022}
736 1023
@@ -940,10 +1227,12 @@ static const struct file_operations fuse_direct_io_file_operations = {
940 1227
941static const struct address_space_operations fuse_file_aops = { 1228static const struct address_space_operations fuse_file_aops = {
942 .readpage = fuse_readpage, 1229 .readpage = fuse_readpage,
1230 .writepage = fuse_writepage,
1231 .launder_page = fuse_launder_page,
943 .write_begin = fuse_write_begin, 1232 .write_begin = fuse_write_begin,
944 .write_end = fuse_write_end, 1233 .write_end = fuse_write_end,
945 .readpages = fuse_readpages, 1234 .readpages = fuse_readpages,
946 .set_page_dirty = fuse_set_page_dirty, 1235 .set_page_dirty = __set_page_dirty_nobuffers,
947 .bmap = fuse_bmap, 1236 .bmap = fuse_bmap,
948}; 1237};
949 1238