author    Jan Kara <jack@suse.cz>          2014-10-01 21:49:18 -0400
committer Theodore Ts'o <tytso@mit.edu>    2014-10-01 21:49:18 -0400
commit    90a8020278c1598fafd071736a0846b38510309c (patch)
tree      2ab461b549a2b5f6b933895b1e61eb98627bba94
parent    f6e63f90809946d410c42045577cb159fedabf8c (diff)
vfs: fix data corruption when blocksize < pagesize for mmaped data
->page_mkwrite() is used by filesystems to allocate blocks under a page
which is becoming writeably mmapped in some process' address space. This
allows a filesystem to return a page fault if there is not enough space
available, the user exceeds their quota, or a similar problem happens,
rather than silently discarding data later when writepage is called.

However, VFS fails to call ->page_mkwrite() in all the cases where
filesystems need it when blocksize < pagesize. For example, when
blocksize = 1024 and pagesize = 4096, the following sequence is
problematic:

  ftruncate(fd, 0);
  pwrite(fd, buf, 1024, 0);
  map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0);
  map[0] = 'a';         ----> page_mkwrite() for index 0 is called
  ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
  mremap(map, 1024, 10000, 0);
  map[4095] = 'a';      ----> no page_mkwrite() called

At the moment ->page_mkwrite() is called, the filesystem can allocate
only one block for the page, because i_size == 1024. Otherwise it would
create blocks beyond i_size, which is generally undesirable. But later,
at ->writepage() time, we also need to store data at offset 4095, and we
have no block allocated for it.

This patch introduces a helper function filesystems can use to have
->page_mkwrite() called at all the necessary moments.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
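For reference, the problematic sequence above expands into the following
minimal userspace reproducer. This is a sketch: the file name, the
MREMAP_MAYMOVE flag, and the error handling are illustrative additions,
and the lost data only manifests when blocksize < pagesize (e.g. a
filesystem with 1024-byte blocks on a machine with 4096-byte pages):

	#define _GNU_SOURCE	/* for mremap() */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[1024];
		char *map;
		int fd;

		memset(buf, 'x', sizeof(buf));
		fd = open("testfile", O_RDWR | O_CREAT, 0644);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Start from an empty file and write one 1024-byte block. */
		if (ftruncate(fd, 0) < 0 || pwrite(fd, buf, 1024, 0) != 1024) {
			perror("ftruncate/pwrite");
			return 1;
		}
		map = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (map == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		map[0] = 'a';	/* page_mkwrite() is called for index 0 */

		/* Extend the file past the end of the first page... */
		if (ftruncate(fd, 10000) < 0) {
			perror("ftruncate");
			return 1;
		}
		map = mremap(map, 1024, 10000, MREMAP_MAYMOVE);
		if (map == MAP_FAILED) {
			perror("mremap");
			return 1;
		}
		/*
		 * ...and dirty the tail of the first page. Before this patch,
		 * no page_mkwrite() is called here, so no block is allocated
		 * under offset 4095 and the byte can be lost at writepage time.
		 */
		map[4095] = 'a';

		munmap(map, 10000);
		close(fd);
		return 0;
	}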
 fs/buffer.c        |  3 +++
 include/linux/mm.h |  1 +
 mm/truncate.c      | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 0 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 9a6029e0dd71..6dc1475dcb2d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
 	int i_size_changed = 0;
 
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
 	 * makes the holding time of page lock longer. Second, it forces lock
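With this hunk, generic_write_end() samples i_size before the write and,
once the page lock is dropped, write-protects the page straddling the old
size. A filesystem that implements its own .write_end instead of using
generic_write_end() would follow the same pattern. A hypothetical,
simplified sketch (the myfs_ name is illustrative, and real code must
also mark the inode dirty outside the page lock as generic_write_end()
does):

	static int myfs_write_end(struct file *file,
				  struct address_space *mapping,
				  loff_t pos, unsigned len, unsigned copied,
				  struct page *page, void *fsdata)
	{
		struct inode *inode = mapping->host;
		loff_t old_size = inode->i_size;  /* sample before extending */

		copied = block_write_end(file, mapping, pos, len, copied,
					 page, fsdata);
		if (pos + copied > inode->i_size)
			i_size_write(inode, pos + copied);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Write-protect the page straddling the old i_size so that a
		 * later mmap store triggers ->page_mkwrite() and allocates
		 * the missing blocks.
		 */
		if (old_size < pos)
			pagecache_isize_extended(inode, old_size, pos);
		return copied;
	}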
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8981cc882ed2..5005464fe012 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
 int generic_error_remove_page(struct address_space *mapping, struct page *page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 96d167372d89..261eaf6e5a19 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,6 +20,7 @@
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
 #include <linux/cleancache.h>
+#include <linux/rmap.h>
 #include "internal.h"
 
 static void clear_exceptional_entry(struct address_space *mapping,
@@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache);
  */
 void truncate_setsize(struct inode *inode, loff_t newsize)
 {
+	loff_t oldsize = inode->i_size;
+
 	i_size_write(inode, newsize);
+	if (newsize > oldsize)
+		pagecache_isize_extended(inode, oldsize, newsize);
 	truncate_pagecache(inode, newsize);
 }
 EXPORT_SYMBOL(truncate_setsize);
 
 /**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode:	inode for which i_size was extended
+ * @from:	original inode size
+ * @to:		new inode size
+ *
+ * Handle extension of inode size either caused by extending truncate or by
+ * write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the nearest write access to
+ * the page. This way filesystem can be sure that page_mkwrite() is called on
+ * the page before user writes to the page via mmap after the i_size has been
+ * changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_mutex - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+	int bsize = 1 << inode->i_blkbits;
+	loff_t rounded_from;
+	struct page *page;
+	pgoff_t index;
+
+	WARN_ON(!mutex_is_locked(&inode->i_mutex));
+	WARN_ON(to > inode->i_size);
+
+	if (from >= to || bsize == PAGE_CACHE_SIZE)
+		return;
+	/* Page straddling @from will not have any hole block created? */
+	rounded_from = round_up(from, bsize);
+	if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+		return;
+
+	index = from >> PAGE_CACHE_SHIFT;
+	page = find_lock_page(inode->i_mapping, index);
+	/* Page not cached? Nothing to do */
+	if (!page)
+		return;
+	/*
+	 * See clear_page_dirty_for_io() for details why set_page_dirty()
+	 * is needed.
+	 */
+	if (page_mkclean(page))
+		set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
  * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
  * @inode:	inode
  * @lstart:	offset of beginning of hole
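As a worked illustration of the guard logic in pagecache_isize_extended(),
the numbers below follow the commit message's example (blocksize 1024,
pagesize 4096, old i_size 1024, new i_size 10000). The ROUND_UP macro is a
userspace stand-in for the kernel's round_up() on power-of-two sizes and
is an assumption of this sketch:

	#include <stdio.h>

	/* Userspace stand-in for the kernel's round_up(). */
	#define ROUND_UP(x, y)	(((x) + (y) - 1) & ~(long long)((y) - 1))

	int main(void)
	{
		long long from = 1024, to = 10000;	/* old, new i_size */
		long long bsize = 1024, psize = 4096;	/* block, page size */
		long long rounded_from = ROUND_UP(from, bsize);

		/*
		 * rounded_from = 1024, and 1024 & 4095 != 0: the old i_size
		 * ends mid-page, so the tail of that page may have no blocks
		 * allocated yet and the page must be write-protected again.
		 */
		if (to > rounded_from && (rounded_from & (psize - 1)))
			printf("write-protect page index %lld\n",
			       from >> 12);	/* PAGE_CACHE_SHIFT == 12 */
		return 0;
	}

For from = 4096 (a block- and page-aligned old size), rounded_from & 4095
would be 0 and the function returns early: no partially mapped block can
exist under the page, so no write-protection is needed.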