summaryrefslogtreecommitdiffstats
path: root/fs/fuse
diff options
context:
space:
mode:
authorKirill Smelkov <kirr@nexedi.com>2019-03-27 07:14:15 -0400
committerMiklos Szeredi <mszeredi@redhat.com>2019-04-24 11:05:06 -0400
commitad2ba64dd489805e7ddf5fecf166cae1e09fc5c0 (patch)
treee5ccd42c77f4ff72d3869cb6d93b4d83f88fd6f9 /fs/fuse
parentf2294482ff65dd9c9c3c6ae1447f908c6aa60f52 (diff)
fuse: allow filesystems to have precise control over data cache
On networked filesystems file data can be changed externally. FUSE provides notification messages for a filesystem to inform the kernel that metadata or a data region of a file needs to be invalidated in the local page cache. That provides the basis for filesystem implementations to invalidate the kernel cache explicitly based on observed filesystem-specific events. FUSE also has an "automatic" invalidation mode(*) in which the kernel automatically invalidates the data cache of a file if it sees an mtime change. It also automatically invalidates the whole data cache of a file if it sees the file size being changed. The automatic mode has a corresponding capability - FUSE_AUTO_INVAL_DATA. However, probably for historical reasons, that capability controls only whether an mtime change should result in automatic invalidation or not. A change in file size always results in invalidating the whole data cache of a file regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+). The filesystem I write[1] represents data arrays stored in a networked database as local files suitable for mmap. It is a read-only filesystem - changes to data are committed externally via database interfaces and the filesystem only glues data into contiguous file streams suitable for mmap and traditional array processing. The files are big - starting from hundreds of gigabytes and more. The files change regularly, and frequently by data being appended to their end. The size of the files thus changes frequently. If a file was accessed locally and some part of its data got into the page cache, we want that data to stay cached unless there is memory pressure, or unless the corresponding part of the file was actually changed. However, the current FUSE behaviour - when it sees a file size change - is to invalidate the whole file. The data cache of the file is thus completely lost even on a small size change, even though the filesystem server is careful to accurately translate database changes into FUSE invalidation messages to the kernel.
Let's fix it: if a filesystem, through the new FUSE_EXPLICIT_INVAL_DATA capability, indicates to the kernel that it is fully responsible for data cache invalidation, then the kernel won't invalidate the file's data cache on size change and will only truncate that cache to the new size in case the size decreased. (*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag", eed2179efe "fuse: invalidate inode mapping if mtime changes" (+) in writeback mode the kernel does not invalidate the data cache on file size change, but neither does it allow the filesystem to set the size due to an external event (see 8373200b12 "fuse: Trust kernel i_size only") [1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20 Signed-off-by: Kirill Smelkov <kirr@nexedi.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c7
2 files changed, 8 insertions, 2 deletions
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e6195bc8f836..24dbca777775 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -694,6 +694,9 @@ struct fuse_conn {
694 /** Use enhanced/automatic page cache invalidation. */ 694 /** Use enhanced/automatic page cache invalidation. */
695 unsigned auto_inval_data:1; 695 unsigned auto_inval_data:1;
696 696
697 /** Filesystem is fully responsible for page cache invalidation. */
698 unsigned explicit_inval_data:1;
699
697 /** Does the filesystem support readdirplus? */ 700 /** Does the filesystem support readdirplus? */
698 unsigned do_readdirplus:1; 701 unsigned do_readdirplus:1;
699 702
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 36981ea7eac0..c67b39e88fd5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -237,7 +237,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
237 237
238 if (oldsize != attr->size) { 238 if (oldsize != attr->size) {
239 truncate_pagecache(inode, attr->size); 239 truncate_pagecache(inode, attr->size);
240 inval = true; 240 if (!fc->explicit_inval_data)
241 inval = true;
241 } else if (fc->auto_inval_data) { 242 } else if (fc->auto_inval_data) {
242 struct timespec64 new_mtime = { 243 struct timespec64 new_mtime = {
243 .tv_sec = attr->mtime, 244 .tv_sec = attr->mtime,
@@ -912,6 +913,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
912 fc->dont_mask = 1; 913 fc->dont_mask = 1;
913 if (arg->flags & FUSE_AUTO_INVAL_DATA) 914 if (arg->flags & FUSE_AUTO_INVAL_DATA)
914 fc->auto_inval_data = 1; 915 fc->auto_inval_data = 1;
916 else if (arg->flags & FUSE_EXPLICIT_INVAL_DATA)
917 fc->explicit_inval_data = 1;
915 if (arg->flags & FUSE_DO_READDIRPLUS) { 918 if (arg->flags & FUSE_DO_READDIRPLUS) {
916 fc->do_readdirplus = 1; 919 fc->do_readdirplus = 1;
917 if (arg->flags & FUSE_READDIRPLUS_AUTO) 920 if (arg->flags & FUSE_READDIRPLUS_AUTO)
@@ -973,7 +976,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
973 FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | 976 FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
974 FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | 977 FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
975 FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | 978 FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
976 FUSE_NO_OPENDIR_SUPPORT; 979 FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
977 req->in.h.opcode = FUSE_INIT; 980 req->in.h.opcode = FUSE_INIT;
978 req->in.numargs = 1; 981 req->in.numargs = 1;
979 req->in.args[0].size = sizeof(*arg); 982 req->in.args[0].size = sizeof(*arg);