aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fuse/fuse_i.h
diff options
context:
space:
mode:
authorMiklos Szeredi <mszeredi@suse.cz>2008-04-30 03:54:41 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-30 11:29:50 -0400
commit3be5a52b30aa5cf9d795b7634f728f612197b1c4 (patch)
tree5a78251a351e273cf2061a527a381c7ba256fc15 /fs/fuse/fuse_i.h
parentb88473f73e6d7b6af9cfc4ecc349d82c75d9a6af (diff)
fuse: support writable mmap
Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fuse/fuse_i.h')
-rw-r--r--fs/fuse/fuse_i.h37
1 files changed, 37 insertions, 0 deletions
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c0481e48d161..4b094fbc9c7f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -15,6 +15,7 @@
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/rwsem.h>
18 19
19/** Max number of pages that can be used in a single read request */ 20/** Max number of pages that can be used in a single read request */
20#define FUSE_MAX_PAGES_PER_REQ 32 21#define FUSE_MAX_PAGES_PER_REQ 32
@@ -25,6 +26,9 @@
25/** Congestion starts at 75% of maximum */ 26/** Congestion starts at 75% of maximum */
26#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) 27#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
27 28
29/** Bias for fi->writectr, meaning new writepages must not be sent */
30#define FUSE_NOWRITE INT_MIN
31
28/** It could be as large as PATH_MAX, but would that have any uses? */ 32/** It could be as large as PATH_MAX, but would that have any uses? */
29#define FUSE_NAME_MAX 1024 33#define FUSE_NAME_MAX 1024
30 34
@@ -73,6 +77,19 @@ struct fuse_inode {
73 77
74 /** Files usable in writepage. Protected by fc->lock */ 78 /** Files usable in writepage. Protected by fc->lock */
75 struct list_head write_files; 79 struct list_head write_files;
80
81 /** Writepages pending on truncate or fsync */
82 struct list_head queued_writes;
83
84 /** Number of sent writes, a negative bias (FUSE_NOWRITE)
85 * means more writes are blocked */
86 int writectr;
87
88 /** Waitq for writepage completion */
89 wait_queue_head_t page_waitq;
90
91 /** List of writepage requestst (pending or sent) */
92 struct list_head writepages;
76}; 93};
77 94
78/** FUSE specific file data */ 95/** FUSE specific file data */
@@ -242,6 +259,12 @@ struct fuse_req {
242 /** File used in the request (or NULL) */ 259 /** File used in the request (or NULL) */
243 struct fuse_file *ff; 260 struct fuse_file *ff;
244 261
262 /** Inode used in the request or NULL */
263 struct inode *inode;
264
265 /** Link on fi->writepages */
266 struct list_head writepages_entry;
267
245 /** Request completion callback */ 268 /** Request completion callback */
246 void (*end)(struct fuse_conn *, struct fuse_req *); 269 void (*end)(struct fuse_conn *, struct fuse_req *);
247 270
@@ -504,6 +527,11 @@ void fuse_init_symlink(struct inode *inode);
504void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, 527void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
505 u64 attr_valid, u64 attr_version); 528 u64 attr_valid, u64 attr_version);
506 529
530void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
531 u64 attr_valid);
532
533void fuse_truncate(struct address_space *mapping, loff_t offset);
534
507/** 535/**
508 * Initialize the client device 536 * Initialize the client device
509 */ 537 */
@@ -522,6 +550,8 @@ void fuse_ctl_cleanup(void);
522 */ 550 */
523struct fuse_req *fuse_request_alloc(void); 551struct fuse_req *fuse_request_alloc(void);
524 552
553struct fuse_req *fuse_request_alloc_nofs(void);
554
525/** 555/**
526 * Free a request 556 * Free a request
527 */ 557 */
@@ -558,6 +588,8 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
558 */ 588 */
559void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 589void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
560 590
591void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
592
561/* Abort all requests */ 593/* Abort all requests */
562void fuse_abort_conn(struct fuse_conn *fc); 594void fuse_abort_conn(struct fuse_conn *fc);
563 595
@@ -600,3 +632,8 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
600 632
601int fuse_update_attributes(struct inode *inode, struct kstat *stat, 633int fuse_update_attributes(struct inode *inode, struct kstat *stat,
602 struct file *file, bool *refreshed); 634 struct file *file, bool *refreshed);
635
636void fuse_flush_writepages(struct inode *inode);
637
638void fuse_set_nowrite(struct inode *inode);
639void fuse_release_nowrite(struct inode *inode);