aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Morton <akpm@osdl.org>2006-03-24 06:18:04 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-24 10:33:25 -0500
commitebcf28e1c7a295f3321249dd235ad2e45938fdd9 (patch)
treefdd2e131e627af55d3741a7fafad0edaa61410c1
parent469eb4d03878b676418f853011ebfb54ccf83a5e (diff)
[PATCH] fadvise(): write commands
Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk <mtk-manpages@gmx.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/fadvise.h6
-rw-r--r--include/linux/fs.h5
-rw-r--r--mm/fadvise.c46
-rw-r--r--mm/filemap.c10
4 files changed, 57 insertions, 10 deletions
diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
index e8e747139b9a..b2913bba35d8 100644
--- a/include/linux/fadvise.h
+++ b/include/linux/fadvise.h
@@ -18,4 +18,10 @@
18#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */ 18#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
19#endif 19#endif
20 20
21/*
22 * Linux-specific fadvise() extensions:
23 */
24#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */
25#define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */
26
21#endif /* FADVISE_H_INCLUDED */ 27#endif /* FADVISE_H_INCLUDED */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65e6df247ea5..0ad70c1e5e55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1473,6 +1473,11 @@ extern int filemap_fdatawait(struct address_space *);
1473extern int filemap_write_and_wait(struct address_space *mapping); 1473extern int filemap_write_and_wait(struct address_space *mapping);
1474extern int filemap_write_and_wait_range(struct address_space *mapping, 1474extern int filemap_write_and_wait_range(struct address_space *mapping,
1475 loff_t lstart, loff_t lend); 1475 loff_t lstart, loff_t lend);
1476extern int wait_on_page_writeback_range(struct address_space *mapping,
1477 pgoff_t start, pgoff_t end);
1478extern int __filemap_fdatawrite_range(struct address_space *mapping,
1479 loff_t start, loff_t end, int sync_mode);
1480
1476extern void sync_supers(void); 1481extern void sync_supers(void);
1477extern void sync_filesystems(int wait); 1482extern void sync_filesystems(int wait);
1478extern void emergency_sync(void); 1483extern void emergency_sync(void);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19 20
20#include <asm/unistd.h> 21#include <asm/unistd.h>
@@ -22,13 +23,36 @@
22/* 23/*
23 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 *
27 * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
28 * offsets `offset' and `offset+len' inclusive. Any pages which are currently
29 * under writeout are skipped, whether or not they are dirty.
30 *
31 * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
32 * offsets `offset' and `offset+len'.
33 *
34 * By combining these two operations the application may do several things:
35 *
36 * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
37 *
38 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
39 * dirty pages at the disk.
40 *
41 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
42 * all of the currently dirty pages at the disk, wait until they have been
43 * written.
44 *
45 * It should be noted that none of these operations write out the file's
46 * metadata. So unless the application is strictly performing overwrites of
47 * already-instantiated disk blocks, there are no guarantees here that the data
48 * will be available after a crash.
25 */ 49 */
26asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 50asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27{ 51{
28 struct file *file = fget(fd); 52 struct file *file = fget(fd);
29 struct address_space *mapping; 53 struct address_space *mapping;
30 struct backing_dev_info *bdi; 54 struct backing_dev_info *bdi;
31 loff_t endbyte; 55 loff_t endbyte; /* inclusive */
32 pgoff_t start_index; 56 pgoff_t start_index;
33 pgoff_t end_index; 57 pgoff_t end_index;
34 unsigned long nrpages; 58 unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
56 endbyte = offset + len; 80 endbyte = offset + len;
57 if (!len || endbyte < len) 81 if (!len || endbyte < len)
58 endbyte = -1; 82 endbyte = -1;
83 else
84 endbyte--; /* inclusive */
59 85
60 bdi = mapping->backing_dev_info; 86 bdi = mapping->backing_dev_info;
61 87
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
78 104
79 /* First and last PARTIAL page! */ 105 /* First and last PARTIAL page! */
80 start_index = offset >> PAGE_CACHE_SHIFT; 106 start_index = offset >> PAGE_CACHE_SHIFT;
81 end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; 107 end_index = endbyte >> PAGE_CACHE_SHIFT;
82 108
83 /* Careful about overflow on the "+1" */ 109 /* Careful about overflow on the "+1" */
84 nrpages = end_index - start_index + 1; 110 nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
96 filemap_flush(mapping); 122 filemap_flush(mapping);
97 123
98 /* First and last FULL page! */ 124 /* First and last FULL page! */
99 start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 125 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
100 end_index = (endbyte >> PAGE_CACHE_SHIFT); 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
101 127
102 if (end_index > start_index) 128 if (end_index >= start_index)
103 invalidate_mapping_pages(mapping, start_index, end_index-1); 129 invalidate_mapping_pages(mapping, start_index,
130 end_index);
131 break;
132 case LINUX_FADV_ASYNC_WRITE:
133 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
134 WB_SYNC_NONE);
135 break;
136 case LINUX_FADV_WRITE_WAIT:
137 ret = wait_on_page_writeback_range(mapping,
138 offset >> PAGE_CACHE_SHIFT,
139 endbyte >> PAGE_CACHE_SHIFT);
104 break; 140 break;
105 default: 141 default:
106 ret = -EINVAL; 142 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index c1b1708cc95d..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -183,8 +183,8 @@ static int sync_page(void *word)
183 * these two operations is that if a dirty page/buffer is encountered, it must 183 * these two operations is that if a dirty page/buffer is encountered, it must
184 * be waited upon, and not just skipped over. 184 * be waited upon, and not just skipped over.
185 */ 185 */
186static int __filemap_fdatawrite_range(struct address_space *mapping, 186int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
187 loff_t start, loff_t end, int sync_mode) 187 loff_t end, int sync_mode)
188{ 188{
189 int ret; 189 int ret;
190 struct writeback_control wbc = { 190 struct writeback_control wbc = {
@@ -213,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
213} 213}
214EXPORT_SYMBOL(filemap_fdatawrite); 214EXPORT_SYMBOL(filemap_fdatawrite);
215 215
216static int filemap_fdatawrite_range(struct address_space *mapping, 216static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
217 loff_t start, loff_t end) 217 loff_t end)
218{ 218{
219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220} 220}
@@ -233,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
233 * Wait for writeback to complete against pages indexed by start->end 233 * Wait for writeback to complete against pages indexed by start->end
234 * inclusive 234 * inclusive
235 */ 235 */
236static int wait_on_page_writeback_range(struct address_space *mapping, 236int wait_on_page_writeback_range(struct address_space *mapping,
237 pgoff_t start, pgoff_t end) 237 pgoff_t start, pgoff_t end)
238{ 238{
239 struct pagevec pvec; 239 struct pagevec pvec;