diff options
author | Andrew Morton <akpm@osdl.org> | 2006-03-31 05:30:42 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-31 15:18:54 -0500 |
commit | f79e2abb9bd452d97295f34376dedbec9686b986 (patch) | |
tree | 56b9998caa11983556e842fb9a8143d86d765fa3 /fs | |
parent | d6dfd1310d3562698fd7c3c086f6c239f96394ac (diff) |
[PATCH] sys_sync_file_range()
Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT
fadvise() additions, do it in a new sys_sync_file_range() syscall instead.
Reasons:
- It's more flexible. Things which would require two or three syscalls with
fadvise() can be done in a single syscall.
- Using fadvise() in this manner is something not covered by POSIX.
The patch wires up the syscall for x86.
The sycall is implemented in the new fs/sync.c. The intention is that we can
move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later.
Documentation for the syscall is in fs/sync.c.
A test app (sync_file_range.c) is in
http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz.
The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can
say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for
NFS_DATA_SYNC which is hopefully the more common."
Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if
the queue is congested. This is trivial to fix: add a new flag bit, set
wbc->nonblocking. But I'm not sure that we want to expose implementation
details down to that level.
Note: it's notable that we can sync an fd which wasn't opened for writing.
Same with fsync() and fdatasync()).
Note: the code takes some care to handle attempts to sync file contents
outside the 16TB offset on 32-bit machines. It makes such attempts appear to
succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such
requests fail...
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/sync.c | 164 |
2 files changed, 165 insertions, 1 deletions
diff --git a/fs/Makefile b/fs/Makefile index f3a4f7077175..83bf478e786b 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ | |||
10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ | 10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ |
11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ | 11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ |
12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ | 12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ |
13 | ioprio.o pnode.o drop_caches.o splice.o | 13 | ioprio.o pnode.o drop_caches.o splice.o sync.o |
14 | 14 | ||
15 | obj-$(CONFIG_INOTIFY) += inotify.o | 15 | obj-$(CONFIG_INOTIFY) += inotify.o |
16 | obj-$(CONFIG_EPOLL) += eventpoll.o | 16 | obj-$(CONFIG_EPOLL) += eventpoll.o |
diff --git a/fs/sync.c b/fs/sync.c new file mode 100644 index 000000000000..8616006d2094 --- /dev/null +++ b/fs/sync.c | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * High-level sync()-related operations | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/file.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/writeback.h> | ||
10 | #include <linux/syscalls.h> | ||
11 | #include <linux/linkage.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | |||
14 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ | ||
15 | SYNC_FILE_RANGE_WAIT_AFTER) | ||
16 | |||
17 | /* | ||
18 | * sys_sync_file_range() permits finely controlled syncing over a segment of | ||
19 | * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is | ||
20 | * zero then sys_sync_file_range() will operate from offset out to EOF. | ||
21 | * | ||
22 | * The flag bits are: | ||
23 | * | ||
24 | * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range | ||
25 | * before performing the write. | ||
26 | * | ||
27 | * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the | ||
28 | * range which are not presently under writeback. | ||
29 | * | ||
30 | * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range | ||
31 | * after performing the write. | ||
32 | * | ||
33 | * Useful combinations of the flag bits are: | ||
34 | * | ||
35 | * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages | ||
36 | * in the range which were dirty on entry to sys_sync_file_range() are placed | ||
37 | * under writeout. This is a start-write-for-data-integrity operation. | ||
38 | * | ||
39 | * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which | ||
40 | * are not presently under writeout. This is an asynchronous flush-to-disk | ||
41 | * operation. Not suitable for data integrity operations. | ||
42 | * | ||
43 | * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for | ||
44 | * completion of writeout of all pages in the range. This will be used after an | ||
45 | * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait | ||
46 | * for that operation to complete and to return the result. | ||
47 | * | ||
48 | * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: | ||
49 | * a traditional sync() operation. This is a write-for-data-integrity operation | ||
50 | * which will ensure that all pages in the range which were dirty on entry to | ||
51 | * sys_sync_file_range() are committed to disk. | ||
52 | * | ||
53 | * | ||
54 | * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any | ||
55 | * I/O errors or ENOSPC conditions and will return those to the caller, after | ||
56 | * clearing the EIO and ENOSPC flags in the address_space. | ||
57 | * | ||
58 | * It should be noted that none of these operations write out the file's | ||
59 | * metadata. So unless the application is strictly performing overwrites of | ||
60 | * already-instantiated disk blocks, there are no guarantees here that the data | ||
61 | * will be available after a crash. | ||
62 | */ | ||
63 | asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, | ||
64 | int flags) | ||
65 | { | ||
66 | int ret; | ||
67 | struct file *file; | ||
68 | loff_t endbyte; /* inclusive */ | ||
69 | int fput_needed; | ||
70 | umode_t i_mode; | ||
71 | |||
72 | ret = -EINVAL; | ||
73 | if (flags & ~VALID_FLAGS) | ||
74 | goto out; | ||
75 | |||
76 | endbyte = offset + nbytes; | ||
77 | |||
78 | if ((s64)offset < 0) | ||
79 | goto out; | ||
80 | if ((s64)endbyte < 0) | ||
81 | goto out; | ||
82 | if (endbyte < offset) | ||
83 | goto out; | ||
84 | |||
85 | if (sizeof(pgoff_t) == 4) { | ||
86 | if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | ||
87 | /* | ||
88 | * The range starts outside a 32 bit machine's | ||
89 | * pagecache addressing capabilities. Let it "succeed" | ||
90 | */ | ||
91 | ret = 0; | ||
92 | goto out; | ||
93 | } | ||
94 | if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | ||
95 | /* | ||
96 | * Out to EOF | ||
97 | */ | ||
98 | nbytes = 0; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | if (nbytes == 0) | ||
103 | endbyte = -1; | ||
104 | else | ||
105 | endbyte--; /* inclusive */ | ||
106 | |||
107 | ret = -EBADF; | ||
108 | file = fget_light(fd, &fput_needed); | ||
109 | if (!file) | ||
110 | goto out; | ||
111 | |||
112 | i_mode = file->f_dentry->d_inode->i_mode; | ||
113 | ret = -ESPIPE; | ||
114 | if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && | ||
115 | !S_ISLNK(i_mode)) | ||
116 | goto out_put; | ||
117 | |||
118 | ret = do_sync_file_range(file, offset, endbyte, flags); | ||
119 | out_put: | ||
120 | fput_light(file, fput_needed); | ||
121 | out: | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * `endbyte' is inclusive | ||
127 | */ | ||
128 | int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | ||
129 | int flags) | ||
130 | { | ||
131 | int ret; | ||
132 | struct address_space *mapping; | ||
133 | |||
134 | mapping = file->f_mapping; | ||
135 | if (!mapping) { | ||
136 | ret = -EINVAL; | ||
137 | goto out; | ||
138 | } | ||
139 | |||
140 | ret = 0; | ||
141 | if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { | ||
142 | ret = wait_on_page_writeback_range(mapping, | ||
143 | offset >> PAGE_CACHE_SHIFT, | ||
144 | endbyte >> PAGE_CACHE_SHIFT); | ||
145 | if (ret < 0) | ||
146 | goto out; | ||
147 | } | ||
148 | |||
149 | if (flags & SYNC_FILE_RANGE_WRITE) { | ||
150 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | ||
151 | WB_SYNC_NONE); | ||
152 | if (ret < 0) | ||
153 | goto out; | ||
154 | } | ||
155 | |||
156 | if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { | ||
157 | ret = wait_on_page_writeback_range(mapping, | ||
158 | offset >> PAGE_CACHE_SHIFT, | ||
159 | endbyte >> PAGE_CACHE_SHIFT); | ||
160 | } | ||
161 | out: | ||
162 | return ret; | ||
163 | } | ||
164 | EXPORT_SYMBOL_GPL(do_sync_file_range); | ||