Diffstat (limited to 'fs/buffer.c')
-rw-r--r--    fs/buffer.c    117
1 file changed, 62 insertions(+), 55 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -54,23 +55,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
-static int sync_buffer(void *word)
+static int sleep_on_buffer(void *word)
 {
-        struct block_device *bd;
-        struct buffer_head *bh
-                = container_of(word, struct buffer_head, b_state);
-
-        smp_mb();
-        bd = bh->b_bdev;
-        if (bd)
-                blk_run_address_space(bd->bd_inode->i_mapping);
         io_schedule();
         return 0;
 }
 
 void __lock_buffer(struct buffer_head *bh)
 {
-        wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+        wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
                                                         TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_buffer);
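For context, sleep_on_buffer() is only reached through the lock_buffer()/wait_on_buffer() wrappers in include/linux/buffer_head.h, which fall back to __lock_buffer()/__wait_on_buffer() when the buffer is contended. A minimal sketch of the usual caller pattern, modelled on sync_dirty_buffer() (the helper name below is made up, not part of this diff):

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /* Sketch: write one buffer synchronously; the task sleeps in
     * sleep_on_buffer() whenever BH_Lock is held by someone else. */
    static int write_buffer_and_wait(struct buffer_head *bh)
    {
            lock_buffer(bh);                /* may call __lock_buffer() */
            get_bh(bh);                     /* end_buffer_write_sync() drops a ref */
            bh->b_end_io = end_buffer_write_sync;
            submit_bh(WRITE_SYNC, bh);
            wait_on_buffer(bh);             /* may call __wait_on_buffer() */
            return buffer_uptodate(bh) ? 0 : -EIO;
    }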
@@ -90,7 +83,7 @@ EXPORT_SYMBOL(unlock_buffer);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
-        wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+        wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
@@ -277,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
         invalidate_bh_lrus();
         lru_add_drain_all();    /* make sure all lru add caches are flushed */
         invalidate_mapping_pages(mapping, 0, -1);
+        /* 99% of the time, we don't need to flush the cleancache on the bdev.
+         * But, for the strange corners, lets be cautious
+         */
+        cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -749,10 +746,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
         struct buffer_head *bh;
         struct list_head tmp;
-        struct address_space *mapping, *prev_mapping = NULL;
+        struct address_space *mapping;
         int err = 0, err2;
+        struct blk_plug plug;
 
         INIT_LIST_HEAD(&tmp);
+        blk_start_plug(&plug);
 
         spin_lock(lock);
         while (!list_empty(list)) {
@@ -775,7 +774,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                  * still in flight on potentially older
                                  * contents.
                                  */
-                                write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+                                write_dirty_buffer(bh, WRITE_SYNC);
 
                                 /*
                                  * Kick off IO for the previous mapping. Note
@@ -783,16 +782,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                  * wait_on_buffer() will do that for us
                                  * through sync_buffer().
                                  */
-                                if (prev_mapping && prev_mapping != mapping)
-                                        blk_run_address_space(prev_mapping);
-                                prev_mapping = mapping;
-
                                 brelse(bh);
                                 spin_lock(lock);
                         }
                 }
         }
 
+        spin_unlock(lock);
+        blk_finish_plug(&plug);
+        spin_lock(lock);
+
         while (!list_empty(&tmp)) {
                 bh = BH_ENTRY(tmp.prev);
                 get_bh(bh);
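The prev_mapping/blk_run_address_space() bookkeeping removed above is replaced by the on-stack plug started and finished in the surrounding hunks. A minimal sketch of that plugging pattern on its own (the helper name is made up; write_dirty_buffer() is the real API this function uses):

    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>

    /* Sketch: batch a burst of buffer writes behind an on-stack plug instead
     * of explicitly unplugging each address_space afterwards. */
    static void write_buffers_plugged(struct buffer_head **bhs, int nr)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);          /* queue requests without dispatching them */
            for (i = 0; i < nr; i++)
                    write_dirty_buffer(bhs[i], WRITE_SYNC);
            blk_finish_plug(&plug);         /* push the batched requests to the device */
    }

Note that fsync_buffers_list() flushes the plug outside the spinlock, so the batched writes are issued before the wait loop that follows.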
@@ -1144,7 +1143,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
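That lock list is caller-facing documentation only; mark_buffer_dirty() acquires and releases the locks internally. A sketch of the typical metadata-update pattern around it (the myfs_ name is hypothetical):

    #include <linux/string.h>
    #include <linux/buffer_head.h>

    /* Sketch: fill a metadata buffer and hand it to writeback;
     * mark_buffer_dirty() takes the locks named in the comment above. */
    static void myfs_update_block(struct buffer_head *bh, const void *src, size_t len)
    {
            lock_buffer(bh);
            memcpy(bh->b_data, src, len);
            set_buffer_uptodate(bh);
            unlock_buffer(bh);
            mark_buffer_dirty(bh);          /* redirties the page and the owning inode */
    }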
@@ -1614,14 +1613,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed. The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
                         get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1627,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
         const unsigned blocksize = 1 << inode->i_blkbits;
         int nr_underway = 0;
         int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-                        WRITE_SYNC_PLUG : WRITE);
+                        WRITE_SYNC : WRITE);
 
         BUG_ON(!PageLocked(page));
 
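For context on how write_op is consumed, a sketch (not part of this diff) of roughly what __block_write_full_page() does further down: every buffer on the page that was set up for async write is submitted with that flag, so WB_SYNC_ALL writeback now issues WRITE_SYNC requests and relies on block-layer plugging for batching.

    /* Sketch: submit each async-write buffer on a page with the chosen rw
     * flag. The helper name is made up; the real function open-codes this. */
    static int submit_page_buffers(struct buffer_head *head, int write_op)
    {
            struct buffer_head *bh = head;
            int nr_underway = 0;

            do {
                    struct buffer_head *next = bh->b_this_page;

                    if (buffer_async_write(bh)) {
                            submit_bh(write_op, bh);        /* WRITE or WRITE_SYNC */
                            nr_underway++;
                    }
                    bh = next;
            } while (bh != head);

            return nr_underway;
    }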
@@ -2343,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-                   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                         get_block_t get_block)
 {
         struct page *page = vmf->page;
         struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
         unsigned long end;
         loff_t size;
-        int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+        int ret;
 
         lock_page(page);
         size = i_size_read(inode);
         if ((page->mapping != inode->i_mapping) ||
             (page_offset(page) > size)) {
-                /* page got truncated out from underneath us */
-                unlock_page(page);
-                goto out;
+                /* We overload EFAULT to mean page got truncated */
+                ret = -EFAULT;
+                goto out_unlock;
         }
 
         /* page is wholly or partially inside EOF */
@@ -2373,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
         if (!ret)
                 ret = block_commit_write(page, 0, end);
 
-        if (unlikely(ret)) {
-                unlock_page(page);
-                if (ret == -ENOMEM)
-                        ret = VM_FAULT_OOM;
-                else /* -ENOSPC, -EIO, etc */
-                        ret = VM_FAULT_SIGBUS;
-        } else
-                ret = VM_FAULT_LOCKED;
-
-out:
+        if (unlikely(ret < 0))
+                goto out_unlock;
+        /*
+         * Freezing in progress? We check after the page is marked dirty and
+         * with page lock held so if the test here fails, we are sure freezing
+         * code will wait during syncing until the page fault is done - at that
+         * point page will be dirty and unlocked so freezing code will write it
+         * and writeprotect it again.
+         */
+        set_page_dirty(page);
+        if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+                ret = -EAGAIN;
+                goto out_unlock;
+        }
+        return 0;
+out_unlock:
+        unlock_page(page);
         return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+                   get_block_t get_block)
+{
+        int ret;
+        struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+        /*
+         * This check is racy but catches the common case. The check in
+         * __block_page_mkwrite() is reliable.
+         */
+        vfs_check_frozen(sb, SB_FREEZE_WRITE);
+        ret = __block_page_mkwrite(vma, vmf, get_block);
+        return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
@@ -3138,17 +3156,6 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-void block_sync_page(struct page *page)
-{
-        struct address_space *mapping;
-
-        smp_mb();
-        mapping = page_mapping(page);
-        if (mapping)
-                blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
 /*
  * There are no bdflush tunables left. But distributions are
  * still running obsolete flush daemons, so we terminate them here.
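block_sync_page() was the stock implementation of the ->sync_page address_space operation, which goes away with the same plugging rework; filesystems simply drop that initializer from their operations table. A sketch with hypothetical myfs_ callbacks:

    /* Sketch: an address_space_operations table after the ->sync_page removal.
     * All myfs_* callbacks are placeholders for a filesystem's own code. */
    static const struct address_space_operations myfs_aops = {
            .readpage       = myfs_readpage,
            .writepage      = myfs_writepage,
            .write_begin    = myfs_write_begin,
            .write_end      = myfs_write_end,
            /* .sync_page   = block_sync_page,      <- removed with this series */
    };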