diff options
author | Andrew Morton <akpm@osdl.org> | 2006-03-31 05:30:42 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-31 15:18:54 -0500 |
commit | f79e2abb9bd452d97295f34376dedbec9686b986 (patch) | |
tree | 56b9998caa11983556e842fb9a8143d86d765fa3 | |
parent | d6dfd1310d3562698fd7c3c086f6c239f96394ac (diff) |
[PATCH] sys_sync_file_range()
Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT
fadvise() additions, do it in a new sys_sync_file_range() syscall instead.
Reasons:
- It's more flexible. Things which would require two or three syscalls with
fadvise() can be done in a single syscall.
- Using fadvise() in this manner is something not covered by POSIX.
The patch wires up the syscall for x86.
The syscall is implemented in the new fs/sync.c. The intention is that we can
move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later.
Documentation for the syscall is in fs/sync.c.
A test app (sync_file_range.c) is in
http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz.
The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can
say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for
NFS_DATA_SYNC which is hopefully the more common."
Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if
the queue is congested. This is trivial to fix: add a new flag bit, set
wbc->nonblocking. But I'm not sure that we want to expose implementation
details down to that level.
Note: it's notable that we can sync an fd which wasn't opened for writing.
Same with fsync() and fdatasync().
Note: the code takes some care to handle attempts to sync file contents
outside the 16TB offset on 32-bit machines. It makes such attempts appear to
succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such
requests fail...
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | arch/i386/kernel/syscall_table.S | 1 | ||||
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/sync.c | 164 | ||||
-rw-r--r-- | include/asm-i386/unistd.h | 3 | ||||
-rw-r--r-- | include/linux/fadvise.h | 6 | ||||
-rw-r--r-- | include/linux/fs.h | 7 | ||||
-rw-r--r-- | include/linux/syscalls.h | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 20 |
8 files changed, 177 insertions, 28 deletions
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index ce3ef4fa0551..4f58b9c0efe3 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S | |||
@@ -313,3 +313,4 @@ ENTRY(sys_call_table) | |||
313 | .long sys_set_robust_list | 313 | .long sys_set_robust_list |
314 | .long sys_get_robust_list | 314 | .long sys_get_robust_list |
315 | .long sys_splice | 315 | .long sys_splice |
316 | .long sys_sync_file_range | ||
diff --git a/fs/Makefile b/fs/Makefile index f3a4f7077175..83bf478e786b 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ | |||
10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ | 10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ |
11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ | 11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ |
12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ | 12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ |
13 | ioprio.o pnode.o drop_caches.o splice.o | 13 | ioprio.o pnode.o drop_caches.o splice.o sync.o |
14 | 14 | ||
15 | obj-$(CONFIG_INOTIFY) += inotify.o | 15 | obj-$(CONFIG_INOTIFY) += inotify.o |
16 | obj-$(CONFIG_EPOLL) += eventpoll.o | 16 | obj-$(CONFIG_EPOLL) += eventpoll.o |
diff --git a/fs/sync.c b/fs/sync.c new file mode 100644 index 000000000000..8616006d2094 --- /dev/null +++ b/fs/sync.c | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * High-level sync()-related operations | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/file.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/writeback.h> | ||
10 | #include <linux/syscalls.h> | ||
11 | #include <linux/linkage.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | |||
14 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ | ||
15 | SYNC_FILE_RANGE_WAIT_AFTER) | ||
16 | |||
17 | /* | ||
18 | * sys_sync_file_range() permits finely controlled syncing over a segment of | ||
19 | * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is | ||
20 | * zero then sys_sync_file_range() will operate from offset out to EOF. | ||
21 | * | ||
22 | * The flag bits are: | ||
23 | * | ||
24 | * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range | ||
25 | * before performing the write. | ||
26 | * | ||
27 | * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the | ||
28 | * range which are not presently under writeback. | ||
29 | * | ||
30 | * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range | ||
31 | * after performing the write. | ||
32 | * | ||
33 | * Useful combinations of the flag bits are: | ||
34 | * | ||
35 | * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages | ||
36 | * in the range which were dirty on entry to sys_sync_file_range() are placed | ||
37 | * under writeout. This is a start-write-for-data-integrity operation. | ||
38 | * | ||
39 | * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which | ||
40 | * are not presently under writeout. This is an asynchronous flush-to-disk | ||
41 | * operation. Not suitable for data integrity operations. | ||
42 | * | ||
43 | * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for | ||
44 | * completion of writeout of all pages in the range. This will be used after an | ||
45 | * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait | ||
46 | * for that operation to complete and to return the result. | ||
47 | * | ||
48 | * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: | ||
49 | * a traditional sync() operation. This is a write-for-data-integrity operation | ||
50 | * which will ensure that all pages in the range which were dirty on entry to | ||
51 | * sys_sync_file_range() are committed to disk. | ||
52 | * | ||
53 | * | ||
54 | * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any | ||
55 | * I/O errors or ENOSPC conditions and will return those to the caller, after | ||
56 | * clearing the EIO and ENOSPC flags in the address_space. | ||
57 | * | ||
58 | * It should be noted that none of these operations write out the file's | ||
59 | * metadata. So unless the application is strictly performing overwrites of | ||
60 | * already-instantiated disk blocks, there are no guarantees here that the data | ||
61 | * will be available after a crash. | ||
62 | */ | ||
63 | asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, | ||
64 | int flags) | ||
65 | { | ||
66 | int ret; | ||
67 | struct file *file; | ||
68 | loff_t endbyte; /* inclusive */ | ||
69 | int fput_needed; | ||
70 | umode_t i_mode; | ||
71 | |||
72 | ret = -EINVAL; | ||
73 | if (flags & ~VALID_FLAGS) | ||
74 | goto out; | ||
75 | |||
76 | endbyte = offset + nbytes; | ||
77 | |||
78 | if ((s64)offset < 0) | ||
79 | goto out; | ||
80 | if ((s64)endbyte < 0) | ||
81 | goto out; | ||
82 | if (endbyte < offset) | ||
83 | goto out; | ||
84 | |||
85 | if (sizeof(pgoff_t) == 4) { | ||
86 | if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | ||
87 | /* | ||
88 | * The range starts outside a 32 bit machine's | ||
89 | * pagecache addressing capabilities. Let it "succeed" | ||
90 | */ | ||
91 | ret = 0; | ||
92 | goto out; | ||
93 | } | ||
94 | if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | ||
95 | /* | ||
96 | * Out to EOF | ||
97 | */ | ||
98 | nbytes = 0; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | if (nbytes == 0) | ||
103 | endbyte = -1; | ||
104 | else | ||
105 | endbyte--; /* inclusive */ | ||
106 | |||
107 | ret = -EBADF; | ||
108 | file = fget_light(fd, &fput_needed); | ||
109 | if (!file) | ||
110 | goto out; | ||
111 | |||
112 | i_mode = file->f_dentry->d_inode->i_mode; | ||
113 | ret = -ESPIPE; | ||
114 | if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && | ||
115 | !S_ISLNK(i_mode)) | ||
116 | goto out_put; | ||
117 | |||
118 | ret = do_sync_file_range(file, offset, endbyte, flags); | ||
119 | out_put: | ||
120 | fput_light(file, fput_needed); | ||
121 | out: | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * `endbyte' is inclusive | ||
127 | */ | ||
128 | int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | ||
129 | int flags) | ||
130 | { | ||
131 | int ret; | ||
132 | struct address_space *mapping; | ||
133 | |||
134 | mapping = file->f_mapping; | ||
135 | if (!mapping) { | ||
136 | ret = -EINVAL; | ||
137 | goto out; | ||
138 | } | ||
139 | |||
140 | ret = 0; | ||
141 | if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { | ||
142 | ret = wait_on_page_writeback_range(mapping, | ||
143 | offset >> PAGE_CACHE_SHIFT, | ||
144 | endbyte >> PAGE_CACHE_SHIFT); | ||
145 | if (ret < 0) | ||
146 | goto out; | ||
147 | } | ||
148 | |||
149 | if (flags & SYNC_FILE_RANGE_WRITE) { | ||
150 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | ||
151 | WB_SYNC_NONE); | ||
152 | if (ret < 0) | ||
153 | goto out; | ||
154 | } | ||
155 | |||
156 | if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { | ||
157 | ret = wait_on_page_writeback_range(mapping, | ||
158 | offset >> PAGE_CACHE_SHIFT, | ||
159 | endbyte >> PAGE_CACHE_SHIFT); | ||
160 | } | ||
161 | out: | ||
162 | return ret; | ||
163 | } | ||
164 | EXPORT_SYMBOL_GPL(do_sync_file_range); | ||
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 789e9bdd0a40..2e7f3e257fdd 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h | |||
@@ -319,8 +319,9 @@ | |||
319 | #define __NR_set_robust_list 311 | 319 | #define __NR_set_robust_list 311 |
320 | #define __NR_get_robust_list 312 | 320 | #define __NR_get_robust_list 312 |
321 | #define __NR_sys_splice 313 | 321 | #define __NR_sys_splice 313 |
322 | #define __NR_sys_sync_file_range 314 | ||
322 | 323 | ||
323 | #define NR_syscalls 314 | 324 | #define NR_syscalls 315 |
324 | 325 | ||
325 | /* | 326 | /* |
326 | * user-visible error numbers are in the range -1 - -128: see | 327 | * user-visible error numbers are in the range -1 - -128: see |
diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h index b2913bba35d8..e8e747139b9a 100644 --- a/include/linux/fadvise.h +++ b/include/linux/fadvise.h | |||
@@ -18,10 +18,4 @@ | |||
18 | #define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */ | 18 | #define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */ |
19 | #endif | 19 | #endif |
20 | 20 | ||
21 | /* | ||
22 | * Linux-specific fadvise() extensions: | ||
23 | */ | ||
24 | #define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */ | ||
25 | #define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */ | ||
26 | |||
27 | #endif /* FADVISE_H_INCLUDED */ | 21 | #endif /* FADVISE_H_INCLUDED */ |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 20a7afd4590c..4ed7e602d703 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -757,6 +757,13 @@ extern void send_sigio(struct fown_struct *fown, int fd, int band); | |||
757 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); | 757 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); |
758 | extern int fcntl_getlease(struct file *filp); | 758 | extern int fcntl_getlease(struct file *filp); |
759 | 759 | ||
760 | /* fs/sync.c */ | ||
761 | #define SYNC_FILE_RANGE_WAIT_BEFORE 1 | ||
762 | #define SYNC_FILE_RANGE_WRITE 2 | ||
763 | #define SYNC_FILE_RANGE_WAIT_AFTER 4 | ||
764 | extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | ||
765 | int flags); | ||
766 | |||
760 | /* fs/locks.c */ | 767 | /* fs/locks.c */ |
761 | extern void locks_init_lock(struct file_lock *); | 768 | extern void locks_init_lock(struct file_lock *); |
762 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); | 769 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e78ffc7d5b56..5717147596b6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h | |||
@@ -571,5 +571,7 @@ asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, | |||
571 | asmlinkage long sys_unshare(unsigned long unshare_flags); | 571 | asmlinkage long sys_unshare(unsigned long unshare_flags); |
572 | asmlinkage long sys_splice(int fdin, int fdout, size_t len, | 572 | asmlinkage long sys_splice(int fdin, int fdout, size_t len, |
573 | unsigned int flags); | 573 | unsigned int flags); |
574 | asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, | ||
575 | int flags); | ||
574 | 576 | ||
575 | #endif | 577 | #endif |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 907c39257ca0..0a03357a1f8e 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -35,17 +35,6 @@ | |||
35 | * | 35 | * |
36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. | 36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. |
37 | * | 37 | * |
38 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently | ||
39 | * dirty pages at the disk. | ||
40 | * | ||
41 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push | ||
42 | * all of the currently dirty pages at the disk, wait until they have been | ||
43 | * written. | ||
44 | * | ||
45 | * It should be noted that none of these operations write out the file's | ||
46 | * metadata. So unless the application is strictly performing overwrites of | ||
47 | * already-instantiated disk blocks, there are no guarantees here that the data | ||
48 | * will be available after a crash. | ||
49 | */ | 38 | */ |
50 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 39 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) |
51 | { | 40 | { |
@@ -129,15 +118,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
129 | invalidate_mapping_pages(mapping, start_index, | 118 | invalidate_mapping_pages(mapping, start_index, |
130 | end_index); | 119 | end_index); |
131 | break; | 120 | break; |
132 | case LINUX_FADV_ASYNC_WRITE: | ||
133 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | ||
134 | WB_SYNC_NONE); | ||
135 | break; | ||
136 | case LINUX_FADV_WRITE_WAIT: | ||
137 | ret = wait_on_page_writeback_range(mapping, | ||
138 | offset >> PAGE_CACHE_SHIFT, | ||
139 | endbyte >> PAGE_CACHE_SHIFT); | ||
140 | break; | ||
141 | default: | 121 | default: |
142 | ret = -EINVAL; | 122 | ret = -EINVAL; |
143 | } | 123 | } |