From 585d3bc06f4ca57f975a5a1f698f65a45ea66225 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 25 Feb 2009 10:44:19 +0100
Subject: fs: move bdev code out of buffer.c

Move some block device related code out from buffer.c and put it in
block_dev.c. I'm trying to move non-buffer_head code out of buffer.c.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c | 145 ------------------------------------------------------------
 1 file changed, 145 deletions(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index 891e1c78e4f1..a2fd743d97cb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -165,151 +165,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-/*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping.  Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device.   Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
-	struct super_block *sb = get_super(bdev);
-	if (sb) {
-		int res = fsync_super(sb);
-		drop_super(sb);
-		return res;
-	}
-	return sync_blockdev(bdev);
-}
-
-/**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
- * @bdev:	blockdevice to lock
- *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-struct super_block *freeze_bdev(struct block_device *bdev)
-{
-	struct super_block *sb;
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
-		sb = get_super(bdev);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-	bdev->bd_fsfreeze_count++;
-
-	down(&bdev->bd_mount_sem);
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		__fsync_super(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				up(&bdev->bd_mount_sem);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
-		}
-	}
-
-	sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev  -- unlock filesystem
- * @bdev:	blockdevice to unlock
- * @sb:		associated superblock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
-{
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
-		}
-		drop_super(sb);
-	}
-
-	up(&bdev->bd_mount_sem);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
 /*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
-- 
cgit v1.2.2


From a64c8610bd3b753c6aff58f51c04cdf0ae478c18 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 27 Mar 2009 22:14:10 -0400
Subject: block_write_full_page: Use synchronous writes for WBC_SYNC_ALL
 writebacks

When doing synchronous writes because wbc->sync_mode is set to
WBC_SYNC_ALL, send the write request using WRITE_SYNC, so that we
don't unduly block system calls such as fsync().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/buffer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index 891e1c78e4f1..e7ebd95e0c68 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1714,6 +1714,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -1805,7 +1806,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
@@ -1859,7 +1860,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
+			submit_bh(write_op, bh);
 			nr_underway++;
 		}
 		bh = next;
-- 
cgit v1.2.2


From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Apr 2009 07:07:16 -0400
Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225

fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had
been left behind

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97cb..b71e52925c83 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3281,7 +3281,6 @@ EXPORT_SYMBOL(cont_write_begin);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
-EXPORT_SYMBOL(fsync_bdev);
 EXPORT_SYMBOL(generic_block_bmap);
 EXPORT_SYMBOL(generic_cont_expand_simple);
 EXPORT_SYMBOL(init_buffer);
-- 
cgit v1.2.2


From e3a7cca1ef4c1af9b0acef9bd66eff6582a737b5 Mon Sep 17 00:00:00 2001
From: Edward Shishkin <edward.shishkin@gmail.com>
Date: Tue, 31 Mar 2009 15:19:39 -0700
Subject: vfs: add/use account_page_dirtied()

Add a helper function account_page_dirtied().  Use that from two
callsites.  reiser4 adds a function which adds a third callsite.

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97cb..73abe6d8218c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,14 +621,7 @@ static void __set_page_dirty(struct page *page,
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_dirty_inc(current);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-- 
cgit v1.2.2


From c2ec175c39f62949438354f603f4aa170846aabb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 31 Mar 2009 15:23:21 -0700
Subject: mm: page_mkwrite change prototype to match fault

Change the page_mkwrite prototype to take a struct vm_fault, and return
VM_FAULT_xxx flags.  There should be no functional change.

This makes it possible to return much more detailed error information to
the VM (and also can provide more information eg.  virtual_address to the
driver, which might be important in some special cases).

This is required for a subsequent fix, and will also make it easier to
merge page_mkwrite() with fault() in the future.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Artem Bityutskiy <dedekind@infradead.org>
Cc: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index 73abe6d8218c..6d51a3da362c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2313,9 +2313,10 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
  * unlock the page.
  */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
@@ -2340,6 +2341,9 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 		ret = block_commit_write(page, 0, end);
 
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
+
 	unlock_page(page);
 	return ret;
 }
-- 
cgit v1.2.2


From 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 31 Mar 2009 15:23:23 -0700
Subject: fs: fix page_mkwrite error cases in core code and btrfs

page_mkwrite is called with neither the page lock nor the ptl held.  This
means a page can be concurrently truncated or invalidated out from
underneath it.  Callers are supposed to prevent truncate races themselves,
however previously the only thing they could do in case they hit one was to
raise a SIGBUS.  A SIGBUS is wrong for the case that the page has been
invalidated or truncated within i_size (eg.  hole punched).  Callers may
also have to perform memory allocations in this path, where again, SIGBUS
would be wrong.

The previous patch ("mm: page_mkwrite change prototype to match fault")
made it possible to properly specify errors.  Convert the generic buffer.c
code and btrfs to return sane error values (in the case of page removed
from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit
without doing anything, and the fault will be retried properly).

This fixes core code, and converts btrfs as a template/example.  All other
filesystems defining their own page_mkwrite should be fixed in a similar
manner.

Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6d51a3da362c..0c14f8d52ee5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2320,7 +2320,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 
 	lock_page(page);
 	size = i_size_read(inode);
@@ -2340,10 +2340,14 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-out_unlock:
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
+	if (unlikely(ret)) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+	}
 
+out_unlock:
 	unlock_page(page);
 	return ret;
 }
-- 
cgit v1.2.2


From 327c0e968645f2601a43f5ea7c19c7b3a5fa0a34 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:23:31 -0700
Subject: vmscan: fix it to take care of nodemask

try_to_free_pages() is used for the direct reclaim of up to
SWAP_CLUSTER_MAX pages when watermarks are low.  The caller to
alloc_pages_nodemask() can specify a nodemask of nodes that are allowed to
be used but this is not passed to try_to_free_pages().  This can lead to
unnecessary reclaim of pages that are unusable by the caller and, in the
worst case, lead to allocation failure as progress was not being made where
it is needed.

This patch passes the nodemask used for alloc_pages_nodemask() to
try_to_free_pages().

Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index 0c14f8d52ee5..c77b848c3d43 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -290,7 +290,7 @@ static void free_more_memory(void)
 						&zone);
 		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS);
+						GFP_NOFS, NULL);
 	}
 }
 
-- 
cgit v1.2.2


From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 31 Mar 2009 15:23:46 -0700
Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen
 filesystems

Now that the filesystem freeze operation has been elevated to the VFS, and
is just an ioctl away, some sort of safety net for unintentionally frozen
root filesystems may be in order.

The timeout thaw originally proposed did not get merged, but perhaps
something like this would be useful in emergencies.

For example, freeze /path/to/mountpoint may freeze your root filesystem if
you forgot that you had that unmounted.

I chose 'j' as the last remaining character other than 'h' which is sort
of reserved for help (because help is generated on any unknown character).

I've tested this on a non-root fs with multiple (nested) freezers, as well
as on a system rendered unresponsive due to a frozen root fs.

[randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: Takashi Sato <t-sato@yk.jp.nec.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index c77b848c3d43..f5f8b15a6e40 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -547,6 +547,39 @@ repeat:
 	return err;
 }
 
+void do_thaw_all(unsigned long unused)
+{
+	struct super_block *sb;
+	char b[BDEVNAME_SIZE];
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+			printk(KERN_WARNING "Emergency Thaw on %s\n",
+			       bdevname(sb->s_bdev, b));
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+	pdflush_operation(do_thaw_all, 0);
+}
+
 /**
  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  * @mapping: the mapping which wants those buffers written
-- 
cgit v1.2.2


From 97f76d3d197f201ac8a8a3ced5b8fef81568e50e Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Thu, 2 Apr 2009 16:56:46 -0700
Subject: vfs: check bh->b_blocknr only if BH_Mapped is set

Check bh->b_blocknr only if BH_Mapped is set.

akpm: I doubt if b_blocknr is ever uninitialised here, but it could
conceivably cause a problem if we're doing a lookup for block zero.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/buffer.c')

diff --git a/fs/buffer.c b/fs/buffer.c
index f5f8b15a6e40..2963858f0f31 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -199,13 +199,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	head = page_buffers(page);
 	bh = head;
 	do {
-		if (bh->b_blocknr == block) {
+		if (!buffer_mapped(bh))
+			all_mapped = 0;
+		else if (bh->b_blocknr == block) {
 			ret = bh;
 			get_bh(bh);
 			goto out_unlock;
 		}
-		if (!buffer_mapped(bh))
-			all_mapped = 0;
 		bh = bh->b_this_page;
 	} while (bh != head);
 
-- 
cgit v1.2.2