From a893a84e8799270fbec5c3708d001650aab47138 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jun 2009 21:25:32 +0200
Subject: mm_for_maps: simplify, use ptrace_may_access()

It would be nice to kill __ptrace_may_access(). It requires task_lock(),
but this lock is only needed to read mm->flags in the middle.

Convert mm_for_maps() to use ptrace_may_access(), this also simplifies
the code a little bit.

Also, we do not need to take ->mmap_sem in advance. In fact I think
mm_for_maps() should not play with ->mmap_sem at all, the caller should
take this lock.

With or without this patch, without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..917f338a6739 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 	struct mm_struct *mm = get_task_mm(task);
 	if (!mm)
 		return NULL;
+	if (mm != current->mm) {
+		/*
+		 * task->mm can be changed before security check,
+		 * in that case we must notice the change after.
+		 */
+		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
+		    mm != task->mm) {
+			mmput(mm);
+			return NULL;
+		}
+	}
 	down_read(&mm->mmap_sem);
-	task_lock(task);
-	if (task->mm != mm)
-		goto out;
-	if (task->mm != current->mm &&
-	    __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
-		goto out;
-	task_unlock(task);
 	return mm;
-out:
-	task_unlock(task);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return NULL;
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-- 
cgit v1.2.2


From 024eab4d5bf7e3168a2b71038b3e04e6b1f376ed Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 17 Jul 2009 09:01:04 -0400
Subject: ext4: Fix memory leak fix when mounting an ext4 filesystem

The allocation of the ext4_group_info array was moved to a new
function ext4_mb_add_group_info() in commit 5f21b0e6 so that online
resize would use a common (and correct) codepath.  Unfortunately, the
call to the new ext4_mb_add_group_info() function was added without
removing the code which originally allocated the array.  This caused a
memory leak each time an ext4 filesystem was mounted.

The fix is simple; remove the code that did the original allocation,
since it is no longer needed.

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..cf9972090ad9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2571,13 +2571,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
-	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int num_meta_group_infos;
 	int num_meta_group_infos_max;
 	int array_size;
-	struct ext4_group_info **meta_group_info;
 	struct ext4_group_desc *desc;
 
 	/* This is the number of blocks used by GDT */
@@ -2622,22 +2620,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		goto err_freesgi;
 	}
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
-	metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
-	for (i = 0; i < num_meta_group_infos; i++) {
-		if ((i + 1) == num_meta_group_infos)
-			metalen = sizeof(*meta_group_info) *
-				(ngroups -
-					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
-		meta_group_info = kmalloc(metalen, GFP_KERNEL);
-		if (meta_group_info == NULL) {
-			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
-			       "buddy group\n");
-			goto err_freemeta;
-		}
-		sbi->s_group_info[i] = meta_group_info;
-	}
-
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
@@ -2655,7 +2637,6 @@ err_freebuddy:
 	while (i-- > 0)
 		kfree(ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
-err_freemeta:
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
 	iput(sbi->s_buddy_cache);
-- 
cgit v1.2.2


From 43b38520296d0c1730c78c878e1d6d1a49122091 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Mon, 27 Jul 2009 21:38:17 -0400
Subject: ext4: Fix typo in ext4/Kconfig

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..152304624644 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
 
 	  To enable backwards compatibility so that systems that are
 	  still expecting to mount ext4 filesystems using ext4dev,
-	  chose Y here.   This feature will go away by 2.6.31, so
+	  choose Y here.   This feature will go away by 2.6.31, so
 	  please arrange to get your userspace programs fixed!
 
 config EXT4_FS_XATTR
-- 
cgit v1.2.2


From 78f1ddbb498283c2445c11b0dfa666424c301803 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 27 Jul 2009 23:09:47 -0400
Subject: ext4: Avoid null pointer dereference when decoding EROFS w/o a
 journal

We need to check to make sure a journal is present before checking the
journal flags in ext4_decode_error().

Signed-off-by: Eric Sesterhenn <eric.sesterhenn@lsexperts.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..fe3f376b7df2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -344,7 +344,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 		errstr = "Out of memory";
 		break;
 	case -EROFS:
-		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 			errstr = "Journal has aborted";
 		else
 			errstr = "Readonly filesystem";
-- 
cgit v1.2.2


From f6f50e28f0cb8d7bcdfaacc83129f005dede11b1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 17 Jul 2009 10:40:01 -0400
Subject: jbd2: Fail to load a journal if it is too short

Due to on disk corruption, it can happen that journal is too short. Fail
to load it in such case so that we don't oops somewhere later.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
 
 	journal->j_first = first;
 	journal->j_last = last;
-- 
cgit v1.2.2


From 6487a9d3b5476ffd1bbbe97e58cf26dbeb7a5d4a Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 17 Jul 2009 10:54:08 -0400
Subject: ext4: More buffer head reference leaks

After the patch I posted last week regarding buffer head ref leaks in
no-journal mode, I looked at all the code that uses buffer heads and
searched for more potential leaks.

The patch below fixes the issues I found; these can occur even when a
journal is present.

The change to inode.c fixes a double release if
ext4_journal_get_create_access() fails.

The changes to namei.c are more complicated.  add_dirent_to_buf() will
release the input buffer head EXCEPT when it returns -ENOSPC.  There are
some callers of this routine that don't always do the brelse() in the event
that -ENOSPC is returned.  Unfortunately, to put this fix into ext4_add_entry()
required capturing the return value of make_indexed_dir() and
add_dirent_to_buf().

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c |  3 ++-
 fs/ext4/namei.c | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..deb14a728791 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -762,8 +762,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
+			/* Don't brelse(bh) here; it's done in
+			 * ext4_journal_forget() below */
 			unlock_buffer(bh);
-			brelse(bh);
 			goto failed;
 		}
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 114abe5d2c1d..fea14dbd3c22 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			return retval;
 
 		if (blocks == 1 && !dx_fallback &&
-		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
-			return make_indexed_dir(handle, dentry, inode, bh);
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+			retval = make_indexed_dir(handle, dentry, inode, bh);
+			if (retval == -ENOSPC)
+				brelse(bh);
+			return retval;
+		}
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	if (retval == -ENOSPC)
+		brelse(bh);
+	return retval;
 }
 
 /*
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!de)
 		goto cleanup;
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-	bh = NULL;
+	if (err != -ENOSPC)
+		bh = NULL;
 	goto cleanup;
 
 journal_error:
-- 
cgit v1.2.2


From 5a4a798937f92413cb7dbdb6bd554186024092e9 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 5 Jul 2009 22:33:08 -0400
Subject: ext4: Remove unnecessary semicolons in mballoc.c

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cf9972090ad9..ba49e0c596f2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2532,7 +2532,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
-	meta_group_info[i]->bb_free_root.rb_node = NULL;;
+	meta_group_info[i]->bb_free_root.rb_node = NULL;
 
 #ifdef DOUBLE_CHECK
 	{
-- 
cgit v1.2.2


From 1c718505171b06dbb60eafcb3fddba877dae5f7b Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sun, 5 Jul 2009 23:04:36 -0400
Subject: ext4: Fix compile warnings with MB_DEBUG

When MB_DEBUG is enabled, we get some compile warnings because
ext4_group_t is unsigned int.  This patch fixes them.

Signed-off-by Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ba49e0c596f2..68cde598b3bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1851,7 +1851,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	struct inode *inode = sbi->s_buddy_cache;
 	struct page *page = NULL, *bitmap_page = NULL;
 
-	mb_debug("init group %lu\n", group);
+	mb_debug("init group %u\n", group);
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
@@ -4120,14 +4120,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
 						     NULL, &start);
 			spin_unlock(&pa->pa_lock);
-			printk(KERN_ERR "PA:%lu:%d:%u \n", i,
-							start, pa->pa_len);
+			printk(KERN_ERR "PA:%u:%d:%u \n", i,
+			       start, pa->pa_len);
 		}
 		ext4_unlock_group(sb, i);
 
 		if (grp->bb_free == 0)
 			continue;
-		printk(KERN_ERR "%lu: %d/%d \n",
+		printk(KERN_ERR "%u: %d/%d \n",
 		       i, grp->bb_free, grp->bb_fragments);
 	}
 	printk(KERN_ERR "\n");
-- 
cgit v1.2.2


From 7a6d3c8b3049d07123628f2bf57127bba2cc878f Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 1 Jul 2009 17:28:41 -0700
Subject: fuse: make the number of max background requests and congestion
 threshold tunable

The practical values for these limits depend on the design of the
filesystem server so let userspace set them at initialization time.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c    | 10 +++++-----
 fs/fuse/fuse_i.h | 12 ++++++------
 fs/fuse/inode.c  | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f58ecbc416c8..b152761c1bf6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 
 static void flush_bg_queue(struct fuse_conn *fc)
 {
-	while (fc->active_background < FUSE_MAX_BACKGROUND &&
+	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
 		struct fuse_req *req;
 
@@ -280,11 +280,11 @@ __releases(&fc->lock)
 	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
-		if (fc->num_background == FUSE_MAX_BACKGROUND) {
+		if (fc->num_background == fc->max_background) {
 			fc->blocked = 0;
 			wake_up_all(&fc->blocked_waitq);
 		}
-		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+		if (fc->num_background == fc->congestion_threshold &&
 		    fc->connected && fc->bdi_initialized) {
 			clear_bdi_congested(&fc->bdi, READ);
 			clear_bdi_congested(&fc->bdi, WRITE);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 {
 	req->background = 1;
 	fc->num_background++;
-	if (fc->num_background == FUSE_MAX_BACKGROUND)
+	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
-	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+	if (fc->num_background == fc->congestion_threshold &&
 	    fc->bdi_initialized) {
 		set_bdi_congested(&fc->bdi, READ);
 		set_bdi_congested(&fc->bdi, WRITE);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..6bcfab04396f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
 
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
 
@@ -349,6 +343,12 @@ struct fuse_conn {
 	/** rbtree of fuse_files waiting for poll events indexed by ph */
 	struct rb_root polled_files;
 
+	/** Maximum number of outstanding background requests */
+	unsigned max_background;
+
+	/** Number of background requests at which congestion starts */
+	unsigned congestion_threshold;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..9aa6f46d0c32 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -32,6 +32,12 @@ DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_DEFAULT_BLKSIZE 512
 
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
 struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
@@ -517,6 +523,8 @@ void fuse_conn_init(struct fuse_conn *fc)
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	atomic_set(&fc->num_waiting, 0);
+	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
@@ -736,6 +744,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	else {
 		unsigned long ra_pages;
 
+		if (arg->minor >= 13) {
+			if (arg->max_background)
+				fc->max_background = arg->max_background;
+			if (arg->congestion_threshold)
+				fc->congestion_threshold = arg->congestion_threshold;
+		}
 		if (arg->minor >= 6) {
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
-- 
cgit v1.2.2


From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 10 Jul 2009 09:51:34 +0000
Subject: genetlink: make netns aware

This makes generic netlink network namespace aware. No
generic netlink families except for the controller family
are made namespace aware, they need to be checked one by
one and then set the family->netnsok member to true.

A new function genlmsg_multicast_netns() is introduced to
allow sending a multicast message in a given namespace,
for example when it applies to an object that lives in
that namespace, a new function genlmsg_multicast_allns()
to send a message to all network namespaces (for objects
that do not have an associated netns).

The function genlmsg_multicast() is changed to multicast
the message in just init_net, which is currently correct
for all generic netlink families since they only work in
init_net right now. Some will later want to work in all
net namespaces because they do not care about the netns
at all -- those will have to be converted to use one of
the new functions genlmsg_multicast_allns() or
genlmsg_multicast_netns() whenever they are made netns
aware in some way.

After this patch families can easily decide whether or
not they should be available in all net namespaces. Many
genl families us it for objects not related to networking
and should therefore be available in all namespaces, but
that will have to be done on a per family basis.

Note that this doesn't touch on the checkpoint/restart
problem where network namespaces could be used, genl
families and multicast groups are numbered globally and
I see no easy way of changing that, especially since it
must be possible to multicast to all network namespaces
for those families that do not care about netns.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/dlm/netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
 		return rv;
 	}
 
-	return genlmsg_unicast(skb, listener_nlpid);
+	return genlmsg_unicast(&init_net, skb, listener_nlpid);
 }
 
 static int user_cmd(struct sk_buff *skb, struct genl_info *info)
-- 
cgit v1.2.2


From 713c0ecdb888e9ef6f085e828555455c5916b07f Mon Sep 17 00:00:00 2001
From: Sten Spans <Sten_Spans@genua.de>
Date: Thu, 16 Jul 2009 09:41:39 +0200
Subject: security: fix security_file_lock cmd argument

Pass posix-translated lock operations to security_file_lock
when invoked via sys_flock.

Signed-off-by: Sten Spans <Sten_Spans@genua.de>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..52366e877d76 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	if (can_sleep)
 		lock->fl_flags |= FL_SLEEP;
 
-	error = security_file_lock(filp, cmd);
+	error = security_file_lock(filp, lock->fl_type);
 	if (error)
 		goto out_free;
 
-- 
cgit v1.2.2


From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Pull up the might_sleep() check into cond_resched()

might_sleep() is called late-ish in cond_resched(), after the
need_resched()/preempt enabled/system running tests are
checked.

It's better to check the sleeps while atomic earlier and not
depend on some environment datas that reduce the chances to
detect a problem.

Also define cond_resched_*() helpers as macros, so that the
FILE/LINE reported in the sleeping while atomic warning
displays the real origin and not sched.h

Changes in v2:

 - Call __might_sleep() directly instead of might_sleep() which
   may call cond_resched()

 - Turn cond_resched() into a macro so that the file:line
   couple reported refers to the caller of cond_resched() and
   not __cond_resched() itself.

Changes in v3:

 - Also propagate this __might_sleep() pull up to
   cond_resched_lock() and cond_resched_softirq()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/dcache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 #include <linux/swap.h>
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
+#include <linux/hardirq.h>
 #include "internal.h"
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
-- 
cgit v1.2.2


From def01bc53d03881acfc393bd10a5c7575187e008 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Convert the only user of cond_resched_bkl to use
 cond_resched()

fs/locks.c:flock_lock_file() is the only user of
cond_resched_bkl()

This helper doesn't do anything more than cond_resched(). The
latter naming is enough to explain that we are rescheduling if
needed.

The bkl suffix suggests another semantics but it's actually a
synonym of cond_resched().

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-7-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..2eb81975c99c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found)
-		cond_resched_bkl();
+		cond_resched();
 
 find_conflict:
 	for_each_lock(inode, before) {
-- 
cgit v1.2.2


From 785b4b3a5a992eeb8b495a89b6f690b383010aba Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Mon, 27 Jul 2009 21:44:40 -0400
Subject: ext4: fix build warning when EXT4FS_DEBUG is on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling with EXT4FS_DEBUG on, gcc will complain with following warnings:

linux-2.6/fs/ext4/ialloc.c: In function ‘ext4_count_free_inodes’:
linux-2.6/fs/ext4/ialloc.c:1192: warning: format ‘%lu’ expects type
‘long unsigned int’, but argument 2 has type ‘ext4_group_t’

So add a type cast to suppress it.

Signed-off-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 
 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-			i, ext4_free_inodes_count(sb, gdp), x);
+			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-- 
cgit v1.2.2


From dd8ac1da4190139de70da18823ff8f5992a649ae Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: nfs: Keep index within mnt_errtbl[]

Ensure that index i remains within array mnt_errtbl[] and mnt3_errtbl[].

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..8b9affc8bab2 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -258,7 +258,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
 		return -EIO;
 	status = ntohl(*p);
 
-	for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) {
+	for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
 		if (mnt_errtbl[i].status == status) {
 			res->errno = mnt_errtbl[i].errno;
 			return 0;
@@ -309,7 +309,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
 		return -EIO;
 	status = ntohl(*p);
 
-	for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) {
+	for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
 		if (mnt3_errtbl[i].status == status) {
 			res->errno = mnt3_errtbl[i].errno;
 			return 0;
-- 
cgit v1.2.2


From a78cb57a106fceeba26da2907db9d8886700e1dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Don't loop forever on state recovery failure...

If the server is broken, then retrying forever won't fix it. We
should just give up after a while, and return an error to the user.
We set the number of retries to 10 for now...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..d95f7f9e60c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -61,6 +61,8 @@
 #define NFS4_POLL_RETRY_MIN	(HZ/10)
 #define NFS4_POLL_RETRY_MAX	(15*HZ)
 
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +428,19 @@ out:
 static int nfs4_recover_session(struct nfs4_session *session)
 {
 	struct nfs_client *clp = session->clp;
+	unsigned int loop;
 	int ret;
 
-	for (;;) {
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
 		ret = nfs4_wait_clnt_recover(clp);
 		if (ret != 0)
-				return ret;
+			break;
 		if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
 			break;
 		nfs4_schedule_state_manager(clp);
+		ret = -EIO;
 	}
-	return 0;
+	return ret;
 }
 
 static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
+	unsigned int loop;
 	int ret;
 
-	for (;;) {
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
 		ret = nfs4_wait_clnt_recover(clp);
 		if (ret != 0)
-			return ret;
+			break;
 		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
 		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
 			break;
 		nfs4_schedule_state_recovery(clp);
+		ret = -EIO;
 	}
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.2


From 62ab460cf5e450e1d207a98a9c6cf2e4a6a78fd1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Add 'server capability' flags for NFSv4 recommended attributes

If the NFSv4 server doesn't support a POSIX attribute, the generic NFS code
needs to know that, so that it don't keep trying to poll for it.

However, by the same count, if the NFSv4 server does support that
attribute, then we should ensure that the inode metadata is appropriately
labelled as being untrusted. For instance, if we don't know the correct
value of the file's uid, we should certainly not be caching ACLs or ACCESS
results.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   |  7 +++--
 fs/nfs/inode.c    | 92 ++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/nfs/nfs4proc.c | 22 +++++++++++++
 3 files changed, 105 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..8f34fd8028fc 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->options = data->options;
+	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1274,7 +1277,7 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
 	server->options = data->options;
 
 	/* Get a client record */
@@ -1400,7 +1403,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 
 	/* Initialise the client representation from the parent server */
 	nfs_server_copy_userdata(server, parent_server);
-	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
 
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..fe5a8b45d867 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -286,6 +286,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		/* We can't support update_atime(), since the server will reset it */
 		inode->i_flags |= S_NOATIME|S_NOCMTIME;
 		inode->i_mode = fattr->mode;
+		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+				&& nfs_server_capable(inode, NFS_CAP_MODE))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
@@ -330,20 +335,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		nfsi->attr_gencount = fattr->gencount;
 		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 			inode->i_atime = fattr->atime;
+		else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			inode->i_mtime = fattr->mtime;
+		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA;
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
 			inode->i_ctime = fattr->ctime;
+		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			nfsi->change_attr = fattr->change_attr;
+		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA;
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		else
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_REVAL_PAGECACHE;
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			inode->i_nlink = fattr->nlink;
+		else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
 			inode->i_gid = fattr->gid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
 		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1176,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	loff_t cur_isize, new_isize;
 	unsigned long invalid = 0;
 	unsigned long now = jiffies;
+	unsigned long save_cache_validity;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1203,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
 
-	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
-	    nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
-		    | NFS_INO_INVALID_ATIME
-		    | NFS_INO_REVAL_PAGECACHE);
+	save_cache_validity = nfsi->cache_validity;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+			| NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_FORCED
+			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1222,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			nfsi->change_attr = fattr->change_attr;
 		}
-	}
+	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+		invalid |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		/* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1235,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		}
-	}
+	} else if (server->caps & NFS_CAP_MTIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		/* If ctime has changed we should definitely clear access+acl caches */
 		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1254,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			}
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 		}
-	}
+	} else if (server->caps & NFS_CAP_CTIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
 
 	/* Check if our cached file size is stale */
 	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1274,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			dprintk("NFS: isize change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
 		}
-	}
+	} else
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
 
 
 	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+	else if (server->caps & NFS_CAP_ATIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
 		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_mode = fattr->mode;
 		}
-	}
+	} else if (server->caps & NFS_CAP_MODE)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
 		if (inode->i_uid != fattr->uid) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_uid = fattr->uid;
 		}
-	}
+	} else if (server->caps & NFS_CAP_OWNER)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
 		if (inode->i_gid != fattr->gid) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_gid = fattr->gid;
 		}
-	}
+	} else if (server->caps & NFS_CAP_OWNER_GROUP)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
 		if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1326,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				invalid |= NFS_INO_INVALID_DATA;
 			inode->i_nlink = fattr->nlink;
 		}
-	}
+	} else if (server->caps & NFS_CAP_NLINK)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 		/*
@@ -1293,9 +1358,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
 	if (!nfs_have_delegation(inode, FMODE_READ) ||
-			(nfsi->cache_validity & NFS_INO_REVAL_FORCED))
+			(save_cache_validity & NFS_INO_REVAL_FORCED))
 		nfsi->cache_validity |= invalid;
-	nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
 
 	return 0;
  out_changed:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d95f7f9e60c4..be6544aef41f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2003,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (status == 0) {
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+				NFS_CAP_CTIME|NFS_CAP_MTIME);
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
 			server->caps |= NFS_CAP_ACLS;
 		if (res.has_links != 0)
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
+		if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+			server->caps |= NFS_CAP_FILEID;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+			server->caps |= NFS_CAP_MODE;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+			server->caps |= NFS_CAP_NLINK;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+			server->caps |= NFS_CAP_OWNER;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+			server->caps |= NFS_CAP_OWNER_GROUP;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+			server->caps |= NFS_CAP_ATIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+			server->caps |= NFS_CAP_CTIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+			server->caps |= NFS_CAP_MTIME;
+
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
-- 
cgit v1.2.2


From 80e52aced138bb41b045a8595a87510f27d8d8c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Don't do idmapper upcalls for asynchronous RPC calls

We don't want to cause rpciod to hang...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 86 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..e65cc2e650c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3075,7 +3075,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	return ret;
 }
 
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *uid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3088,7 +3089,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ_BUF(4);
 		READ32(len);
 		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
 			else
@@ -3103,7 +3106,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	return ret;
 }
 
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *gid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3116,7 +3120,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ_BUF(4);
 		READ32(len);
 		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
 			else
@@ -3466,7 +3472,8 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
 {
 	__be32 *savep;
 	uint32_t attrlen,
@@ -3538,12 +3545,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+	status = decode_attr_owner(xdr, bitmap, server->nfs_client,
+			&fattr->uid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+	status = decode_attr_group(xdr, bitmap, server->nfs_client,
+			&fattr->gid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
@@ -4370,7 +4379,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
 	status = decode_open_downgrade(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4397,7 +4407,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	status = decode_access(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4424,7 +4435,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server
+			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4448,7 +4460,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if ((status = decode_putrootfh(&xdr)) != 0)
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server);
+		status = decode_getfattr(&xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4473,7 +4486,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server);
+	decode_getfattr(&xdr, &res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4503,11 +4517,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server);
+	decode_getfattr(&xdr, res->old_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4540,11 +4556,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
 	 * Note order: OP_LINK leaves the directory as the current
 	 *             filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4573,11 +4591,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server);
+	decode_getfattr(&xdr, res->dir_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4609,7 +4629,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4716,7 +4737,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * 	an ESTALE error. Shouldn't be a problem,
 	 * 	though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4748,11 +4770,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
 		goto out;
 	if (decode_getfh(&xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->f_attr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if (decode_restorefh(&xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server);
+	decode_getfattr(&xdr, res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4800,7 +4824,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	status = decode_open(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server);
+	decode_getfattr(&xdr, res->f_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4827,7 +4852,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_setattr(&xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5001,7 +5027,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
 	status = decode_write(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
 out:
@@ -5030,7 +5057,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
 	status = decode_commit(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5194,7 +5222,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5222,7 +5251,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
 		goto out;
 	xdr_enter_page(&xdr, PAGE_SIZE);
 	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
-				 res->fs_locations->server);
+				 res->fs_locations->server,
+				 !RPC_IS_ASYNC(req->rq_task));
 out:
 	return status;
 }
-- 
cgit v1.2.2


From c140aa91357c415c91269884518fa1d6fdebc20d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Clean up the nfs.callback_tcpport option

Tighten up the validity checking in param_set_port: check for NULL pointers.
Ensure that the option shows up on 'modinfo' output.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
 unsigned short nfs_callback_tcpport6;
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
 
-static int param_set_port(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, struct kernel_param *kp)
 {
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+	unsigned long num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+	ret = strict_strtoul(val, 0, &num);
+	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
 		return -EINVAL;
-	*((int *)kp->arg) = num;
+	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
 
-module_param_call(callback_tcpport, param_set_port, param_get_int,
-		 &nfs_callback_set_tcpport, 0644);
+static int param_get_portnr(char *buffer, struct kernel_param *kp)
+{
+	return param_get_uint(buffer, kp);
+}
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
 
 /*
  * This is the NFSv4 callback kernel thread.
-- 
cgit v1.2.2


From f3f4f4ed26b116f621596f74d42d2b736171e968 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:29 -0400
Subject: NFS: Fix up new minorversion= option

The new minorversion= mount option (commit 3fd5be9e) was merged at
the same time as the recent sloppy parser fixes (commit a5a16bae),
so minorversion= still uses the old value parsing logic.

If the minorversion= option specifies a bogus value, it should fail
with "bad value" not "bad option."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..83a31070bc1a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -158,7 +158,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountvers, "mountvers=%s" },
 	{ Opt_nfsvers, "nfsvers=%s" },
 	{ Opt_nfsvers, "vers=%s" },
-	{ Opt_minorversion, "minorversion=%u" },
+	{ Opt_minorversion, "minorversion=%s" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -1001,7 +1001,6 @@ static int nfs_parse_mount_options(char *raw,
 	while ((p = strsep(&raw, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		unsigned long option;
-		int int_option;
 		int token;
 
 		if (!*p)
@@ -1273,11 +1272,16 @@ static int nfs_parse_mount_options(char *raw,
 			}
 			break;
 		case Opt_minorversion:
-			if (match_int(args, &int_option))
-				return 0;
-			if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
-				return 0;
-			mnt->minorversion = int_option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			if (option > NFS4_MAX_MINOR_VERSION)
+				goto out_invalid_value;
+			mnt->minorversion = option;
 			break;
 
 		/*
-- 
cgit v1.2.2


From 0b524123c93893391ec9e6c9b04998a45235f9c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:30 -0400
Subject: NFS: Add ability to send MOUNTPROC_UMNT to the kernel's mountd client

After certain failure modes of an NFS mount, an NFS client should send
a MOUNTPROC_UMNT request to remove the just-added mount entry from the
server's mount table.  While no-one should rely on the accuracy of the
server's mount table, sending a UMNT is simply being a good internet
neighbor.

Since NFS mount processing is handled in the kernel now, we will need
a function in the kernel's mountd client that can post a MOUNTRPC_UMNT
request, in order to handle these failure modes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h   |  1 +
 fs/nfs/mount_clnt.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..8d2b71d57d29 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -102,6 +102,7 @@ struct nfs_mount_request {
 };
 
 extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
 
 /* client.c */
 extern struct rpc_program nfs_program;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..72dd8b6ccafa 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
 	goto out;
 }
 
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+	static const struct rpc_timeout nfs_umnt_timeout = {
+		.to_initval = 1 * HZ,
+		.to_maxval = 3 * HZ,
+		.to_retries = 2,
+	};
+	struct rpc_create_args args = {
+		.protocol	= IPPROTO_UDP,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.timeout	= &nfs_umnt_timeout,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_NOPING,
+	};
+	struct mountres	result;
+	struct rpc_message msg	= {
+		.rpc_argp	= info->dirpath,
+		.rpc_resp	= &result,
+	};
+	struct rpc_clnt *clnt;
+	int status;
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	clnt = rpc_create(&args);
+	if (unlikely(IS_ERR(clnt)))
+		goto out_clnt_err;
+
+	dprintk("NFS: sending UMNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"), info->dirpath);
+
+	if (info->version == NFS_MNT3_VERSION)
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+	else
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+	status = rpc_call_sync(clnt, &msg, 0);
+	rpc_shutdown_client(clnt);
+
+	if (unlikely(status < 0))
+		goto out_call_err;
+
+	return;
+
+out_clnt_err:
+	dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+			PTR_ERR(clnt));
+	return;
+
+out_call_err:
+	dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
 /*
  * XDR encode/decode functions for MOUNT
  */
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
 		.p_statidx	= MOUNTPROC_MNT,
 		.p_name		= "MOUNT",
 	},
+	[MOUNTPROC_UMNT] = {
+		.p_proc		= MOUNTPROC_UMNT,
+		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC_UMNT,
+		.p_name		= "UMOUNT",
+	},
 };
 
 static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
 		.p_statidx	= MOUNTPROC3_MNT,
 		.p_name		= "MOUNT",
 	},
+	[MOUNTPROC3_UMNT] = {
+		.p_proc		= MOUNTPROC3_UMNT,
+		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC3_UMNT,
+		.p_name		= "UMOUNT",
+	},
 };
 
 
-- 
cgit v1.2.2


From 059f90b323c0f5d34656ab7e0548d7d033c2a51a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:31 -0400
Subject: NFS: Fix auth flavor len accounting

Previous logic in the NFS mount parsing code path assumed
auth_flavor_len was set to zero for simple authentication flavors
(like AUTH_UNIX), and 1 for compound flavors (like AUTH_GSS).

At some earlier point (maybe even before the option parsers were
merged?) specific checks for auth_flavor_len being zero were removed
from the functions that validate the mount option that sets the mount
point's authentication flavor.

Since we are populating an array for authentication flavors, the
auth_flavor_len should always be set to the number of flavors.  Let's
eliminate some cleverness here, and prepare for new logic that needs
to know the number of flavors in the auth_flavors[] array.

(auth_flavors[] is an array because at some point we want to allow a
list of acceptable authentication flavors to be specified via the sec=
mount option.  For now it remains a single element array).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 83a31070bc1a..a33e608713e9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -904,8 +904,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 
 /*
  * Parse the value of the 'sec=' option.
- *
- * The flavor_len setting is for v4 mounts.
  */
 static int nfs_parse_security_flavors(char *value,
 				      struct nfs_parsed_mount_data *mnt)
@@ -916,53 +914,43 @@ static int nfs_parse_security_flavors(char *value,
 
 	switch (match_token(value, nfs_secflavor_tokens, args)) {
 	case Opt_sec_none:
-		mnt->auth_flavor_len = 0;
 		mnt->auth_flavors[0] = RPC_AUTH_NULL;
 		break;
 	case Opt_sec_sys:
-		mnt->auth_flavor_len = 0;
 		mnt->auth_flavors[0] = RPC_AUTH_UNIX;
 		break;
 	case Opt_sec_krb5:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
 		break;
 	case Opt_sec_krb5i:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
 		break;
 	case Opt_sec_krb5p:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
 		break;
 	case Opt_sec_lkey:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
 		break;
 	case Opt_sec_lkeyi:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
 		break;
 	case Opt_sec_lkeyp:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
 		break;
 	case Opt_sec_spkm:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
 		break;
 	case Opt_sec_spkmi:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
 		break;
 	case Opt_sec_spkmp:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
 		break;
 	default:
 		return 0;
 	}
 
+	mnt->auth_flavor_len = 1;
 	return 1;
 }
 
@@ -1680,6 +1668,7 @@ static int nfs_validate_mount_data(void *options,
 	args->nfs_server.port	= 0;	/* autobind unless user sets port */
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
+	args->auth_flavor_len	= 1;
 
 	switch (data->version) {
 	case 1:
@@ -2343,7 +2332,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
 	args->nfs_server.port	= NFS_PORT; /* 2049 unless user set port= */
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 0;
+	args->auth_flavor_len	= 1;
 	args->minorversion	= 0;
 
 	switch (data->version) {
-- 
cgit v1.2.2


From ec88f28d1eb77346f19ca324ceec76e645cdd9da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:32 -0400
Subject: NFS: Use the authentication flavor list returned by mountd

Commit a14017db added support in the kernel's NFS mount client to
decode the authentication flavor list returned by mountd.

The NFS client can now use this list to determine whether the
authentication flavor requested by the user is actually supported
by the server.

Note we don't actually negotiate the security flavor if none was
specified by the user.  Instead, we try to use AUTH_SYS, and fail if
the server does not support it.  This prevents us from negotiating
an inappropriate security flavor (some servers list AUTH_NULL first).

If the server does not support AUTH_SYS, the user must provide an
appropriate security flavor by specifying the "sec=" mount option.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a33e608713e9..8526008eba72 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1436,6 +1436,42 @@ out_security_failure:
 	return 0;
 }
 
+/*
+ * Match the requested auth flavors with the list returned by
+ * the server.  Returns zero and sets the mount's authentication
+ * flavor on success; returns -EACCES if server does not support
+ * the requested flavor.
+ */
+static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
+			     struct nfs_mount_request *request)
+{
+	unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
+
+	/*
+	 * We avoid sophisticated negotiating here, as there are
+	 * plenty of cases where we can get it wrong, providing
+	 * either too little or too much security.
+	 *
+	 * RFC 2623, section 2.7 suggests we SHOULD prefer the
+	 * flavor listed first.  However, some servers list
+	 * AUTH_NULL first.  Our caller plants AUTH_SYS, the
+	 * preferred default, in args->auth_flavors[0] if user
+	 * didn't specify sec= mount option.
+	 */
+	for (i = 0; i < args->auth_flavor_len; i++)
+		for (j = 0; j < server_authlist_len; j++)
+			if (args->auth_flavors[i] == request->auth_flavs[j]) {
+				dfprintk(MOUNT, "NFS: using auth flavor %d\n",
+					request->auth_flavs[j]);
+				args->auth_flavors[0] = request->auth_flavs[j];
+				return 0;
+			}
+
+	dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
+	nfs_umount(request);
+	return -EACCES;
+}
+
 /*
  * Use the remote server's MOUNT service to request the NFS file handle
  * corresponding to the provided path.
@@ -1443,7 +1479,8 @@ out_security_failure:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 			 struct nfs_fh *root_fh)
 {
-	unsigned int auth_flavor_len = 0;
+	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
+	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
 	struct nfs_mount_request request = {
 		.sap		= (struct sockaddr *)
 						&args->mount_server.address,
@@ -1451,7 +1488,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		.protocol	= args->mount_server.protocol,
 		.fh		= root_fh,
 		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT,
-		.auth_flav_len	= &auth_flavor_len,
+		.auth_flav_len	= &server_authlist_len,
+		.auth_flavs	= server_authlist,
 	};
 	int status;
 
@@ -1488,12 +1526,18 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	 * to a file handle.
 	 */
 	status = nfs_mount(&request);
-	if (status == 0)
-		return 0;
+	if (status != 0) {
+		dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+				request.hostname, status);
+		return status;
+	}
 
-	dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
-			request.hostname, status);
-	return status;
+	/*
+	 * MNTv1 (NFSv2) does not support auth flavor negotiation.
+	 */
+	if (args->mount_server.version != NFS_MNT3_VERSION)
+		return 0;
+	return nfs_walk_authlist(args, &request);
 }
 
 static int nfs_parse_simple_hostname(const char *dev_name,
-- 
cgit v1.2.2


From a02d692611348f11ee1bc37431a883c3ff2de23e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:34 -0400
Subject: SUNRPC: Provide functions for managing universal addresses

Introduce a set of functions in the kernel's RPC implementation for
converting between a socket address and either a standard
presentation address string or an RPC universal address.

The universal address functions will be used to encode and decode
RPCB_FOO and NFSv4 SETCLIENTID arguments.  The other functions are
part of a previous promise to deliver shared functions that can be
used by upper-layer protocols to display and manipulate IP
addresses.

The kernel's current address printf formatters were designed
specifically for kernel to user-space APIs that require a particular
string format for socket addresses, thus are somewhat limited for the
purposes of sunrpc.ko.  The formatter for IPv6 addresses, %pI6, does
not support short-handing or scope IDs.  Also, these printf formatters
are unique per address family, so a separate formatter string is
required for printing AF_INET and AF_INET6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8d2b71d57d29..ff68397f9b19 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -370,8 +370,6 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
-#define IPV6_SCOPE_DELIMITER	'%'
-
 /*
  * Set the port number in an address.  Be agnostic about the address
  * family.
-- 
cgit v1.2.2


From 53a0b9c4c99ab0085a06421f71592722e5b3fd5f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:36 -0400
Subject: NFS: Replace nfs_parse_ip_address() with rpc_pton()

Clean up: Use the common routine now provided in sunrpc.ko for parsing mount
addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      |   1 -
 fs/nfs/nfs4namespace.c |   6 +-
 fs/nfs/super.c         | 148 +++++++------------------------------------------
 3 files changed, 22 insertions(+), 133 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ff68397f9b19..cf1da3e22004 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -214,7 +214,6 @@ void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(void *word);
 
 /* super.c */
-void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
 extern struct file_system_type nfs_xdev_fs_type;
 #ifdef CONFIG_NFS_V4
 extern struct file_system_type nfs4_xdev_fs_type;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..43c86b7556e1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,9 +121,9 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
 			continue;
-		nfs_parse_ip_address(buf->data, buf->len,
-				mountdata->addr, &mountdata->addrlen);
-		if (mountdata->addr->sa_family == AF_UNSPEC)
+		mountdata->addrlen = rpc_pton(buf->data, buf->len,
+				mountdata->addr, mountdata->addrlen);
+		if (mountdata->addrlen == 0)
 			continue;
 		nfs_set_port(mountdata->addr, NFS_PORT);
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8526008eba72..1eeba8e53802 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -742,129 +742,10 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 	}
 	}
 
+	dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
 	return 0;
 }
 
-static void nfs_parse_ipv4_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	u8 *addr = (u8 *)&sin->sin_addr.s_addr;
-
-	if (str_len <= INET_ADDRSTRLEN) {
-		dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
-				(int)str_len, string);
-
-		sin->sin_family = AF_INET;
-		*addr_len = sizeof(*sin);
-		if (in4_pton(string, str_len, addr, '\0', NULL))
-			return;
-	}
-
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
-				   const char *delim,
-				   struct sockaddr_in6 *sin6)
-{
-	char *p;
-	size_t len;
-
-	if ((string + str_len) == delim)
-		return 1;
-
-	if (*delim != IPV6_SCOPE_DELIMITER)
-		return 0;
-
-	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
-		return 0;
-
-	len = (string + str_len) - delim - 1;
-	p = kstrndup(delim + 1, len, GFP_KERNEL);
-	if (p) {
-		unsigned long scope_id = 0;
-		struct net_device *dev;
-
-		dev = dev_get_by_name(&init_net, p);
-		if (dev != NULL) {
-			scope_id = dev->ifindex;
-			dev_put(dev);
-		} else {
-			if (strict_strtoul(p, 10, &scope_id) == 0) {
-				kfree(p);
-				return 0;
-			}
-		}
-
-		kfree(p);
-
-		sin6->sin6_scope_id = scope_id;
-		dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
-		return 1;
-	}
-
-	return 0;
-}
-
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-	u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
-	const char *delim;
-
-	if (str_len <= INET6_ADDRSTRLEN) {
-		dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
-				(int)str_len, string);
-
-		sin6->sin6_family = AF_INET6;
-		*addr_len = sizeof(*sin6);
-		if (in6_pton(string, str_len, addr,
-					IPV6_SCOPE_DELIMITER, &delim) != 0) {
-			if (nfs_parse_ipv6_scope_id(string, str_len,
-							delim, sin6) != 0)
-				return;
-		}
-	}
-
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-#else
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-#endif
-
-/*
- * Construct a sockaddr based on the contents of a string that contains
- * an IP address in presentation format.
- *
- * If there is a problem constructing the new sockaddr, set the address
- * family to AF_UNSPEC.
- */
-void nfs_parse_ip_address(char *string, size_t str_len,
-				 struct sockaddr *sap, size_t *addr_len)
-{
-	unsigned int i, colons;
-
-	colons = 0;
-	for (i = 0; i < str_len; i++)
-		if (string[i] == ':')
-			colons++;
-
-	if (colons >= 2)
-		nfs_parse_ipv6_address(string, str_len, sap, addr_len);
-	else
-		nfs_parse_ipv4_address(string, str_len, sap, addr_len);
-}
-
 /*
  * Sanity check the NFS transport protocol.
  *
@@ -1344,11 +1225,14 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			nfs_parse_ip_address(string, strlen(string),
-					     (struct sockaddr *)
-						&mnt->nfs_server.address,
-					     &mnt->nfs_server.addrlen);
+			mnt->nfs_server.addrlen =
+				rpc_pton(string, strlen(string),
+					(struct sockaddr *)
+					&mnt->nfs_server.address,
+					sizeof(mnt->nfs_server.address));
 			kfree(string);
+			if (mnt->nfs_server.addrlen == 0)
+				goto out_invalid_address;
 			break;
 		case Opt_clientaddr:
 			string = match_strdup(args);
@@ -1368,11 +1252,14 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			nfs_parse_ip_address(string, strlen(string),
-					     (struct sockaddr *)
-						&mnt->mount_server.address,
-					     &mnt->mount_server.addrlen);
+			mnt->mount_server.addrlen =
+				rpc_pton(string, strlen(string),
+					(struct sockaddr *)
+					&mnt->mount_server.address,
+					sizeof(mnt->mount_server.address));
 			kfree(string);
+			if (mnt->mount_server.addrlen == 0)
+				goto out_invalid_address;
 			break;
 		case Opt_lookupcache:
 			string = match_strdup(args);
@@ -1424,8 +1311,11 @@ static int nfs_parse_mount_options(char *raw,
 
 	return 1;
 
+out_invalid_address:
+	printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
+	return 0;
 out_invalid_value:
-	printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
+	printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
 	return 0;
 out_nomem:
 	printk(KERN_INFO "NFS: not enough memory to parse option\n");
-- 
cgit v1.2.2


From ec6ee61250acfccbc5578dd4014735fb2cbe53b5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:37 -0400
Subject: NFS: Replace nfs_set_port() with rpc_set_port()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      | 19 -------------------
 fs/nfs/nfs4namespace.c |  2 +-
 fs/nfs/super.c         |  6 +++---
 3 files changed, 4 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf1da3e22004..c2f171a3d70e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -368,22 +368,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 	return ((unsigned long)len + (unsigned long)base +
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
-
-/*
- * Set the port number in an address.  Be agnostic about the address
- * family.
- */
-static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
-	struct sockaddr_in *ap = (struct sockaddr_in *)sap;
-	struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
-
-	switch (sap->sa_family) {
-	case AF_INET:
-		ap->sin_port = htons(port);
-		break;
-	case AF_INET6:
-		ap6->sin6_port = htons(port);
-		break;
-	}
-}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 43c86b7556e1..ef22ee89aa77 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -125,7 +125,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->addr, mountdata->addrlen);
 		if (mountdata->addrlen == 0)
 			continue;
-		nfs_set_port(mountdata->addr, NFS_PORT);
+		rpc_set_port(mountdata->addr, NFS_PORT);
 
 		memcpy(page2, buf->data, buf->len);
 		page2[buf->len] = '\0';
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1eeba8e53802..9c85cdb353aa 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1409,7 +1409,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	/*
 	 * autobind will be used if mount_server.port == 0
 	 */
-	nfs_set_port(request.sap, args->mount_server.port);
+	rpc_set_port(request.sap, args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1703,7 +1703,7 @@ static int nfs_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			goto out_no_address;
 
-		nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
 				args->nfs_server.port);
 
 		nfs_set_mount_transport_protocol(args);
@@ -2336,7 +2336,7 @@ static int nfs4_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			return -EINVAL;
 
-		nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
 				args->nfs_server.port);
 
 		nfs_validate_transport_protocol(args);
-- 
cgit v1.2.2


From b97a56747ea3f6c1a27dd0719bf1424959f1ebae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:38 -0400
Subject: lockd: Replace nlm_clear_port()

Clean up: Use shared rpc_set_port() function instead of nlm_clear_port().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..7cb076ac6b45 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 	return hash & (NLM_HOST_NRHASH - 1);
 }
 
-static void nlm_clear_port(struct sockaddr *sap)
-{
-	switch (sap->sa_family) {
-	case AF_INET:
-		((struct sockaddr_in *)sap)->sin_port = 0;
-		break;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sap)->sin6_port = 0;
-		break;
-	}
-}
-
 /*
  * Common host lookup routine for server & client
  */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
-	nlm_clear_port(nlm_addr(host));
+	rpc_set_port(nlm_addr(host), 0);
 	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
 	host->h_version    = ni->version;
 	host->h_proto      = ni->protocol;
-- 
cgit v1.2.2


From c15128c5e428af1e8ada1476484a74c970665edd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:39 -0400
Subject: lockd: Replace nsm_display_address() with rpc_ntop()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 44 +++++---------------------------------------
 1 file changed, 5 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..30c933188dd7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
 	return (struct sockaddr *)&nsm->sm_addr;
 }
 
-static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
-	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
-	else if (sin6->sin6_scope_id != 0)
-		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
-				sin6->sin6_scope_id);
-	else
-		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nsm_display_address(const struct sockaddr *sap,
-				char *buf, const size_t len)
-{
-	switch (sap->sa_family) {
-	case AF_INET:
-		nsm_display_ipv4_address(sap, buf, len);
-		break;
-	case AF_INET6:
-		nsm_display_ipv6_address(sap, buf, len);
-		break;
-	default:
-		snprintf(buf, len, "unsupported address family");
-		break;
-	}
-}
-
 static struct rpc_clnt *nsm_create(void)
 {
 	struct sockaddr_in sin = {
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
 	memcpy(nsm_addr(new), sap, salen);
 	new->sm_addrlen = salen;
 	nsm_init_private(new);
-	nsm_display_address((const struct sockaddr *)&new->sm_addr,
-				new->sm_addrbuf, sizeof(new->sm_addrbuf));
+
+	if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+					sizeof(new->sm_addrbuf)) == 0)
+		(void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+				"unsupported address family");
 	memcpy(new->sm_name, hostname, hostname_len);
 	new->sm_name[hostname_len] = '\0';
 
-- 
cgit v1.2.2


From 4116092b92f859e5e9a90c99d740933e651ee8c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:40 -0400
Subject: NFSD: Support IPv6 addresses in write_failover_ip()

In write_failover_ip(), replace the sscanf() with a call to the common
sunrpc.ko presentation address parser.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfsctl.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..7e906c5b7671 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
 #include <linux/nfsd/xdr.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
+#include <linux/sunrpc/clnt.h>
 
 #include <asm/uaccess.h>
 #include <net/ipv6.h>
@@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  *
  * Input:
  *			buf:	'\n'-terminated C string containing a
- *				presentation format IPv4 address
+ *				presentation format IP address
  *			size:	length of C string in @buf
  * Output:
  *	On success:	returns zero if all specified locks were released;
  *			returns one if one or more locks were not released
  *	On error:	return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in
  */
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
-	struct sockaddr_in sin = {
-		.sin_family	= AF_INET,
-	};
-	int b1, b2, b3, b4;
-	char c;
+	struct sockaddr_storage address;
+	struct sockaddr *sap = (struct sockaddr *)&address;
+	size_t salen = sizeof(address);
 	char *fo_path;
 
 	/* sanity check */
@@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	/* get ipv4 address */
-	if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
-		return -EINVAL;
-	if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+	if (rpc_pton(fo_path, size, sap, salen) == 0)
 		return -EINVAL;
-	sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
 
-	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
+	return nlmsvc_unlock_all_by_ip(sap);
 }
 
 /**
-- 
cgit v1.2.2


From b693ba4a338da15db1db4b5ebaa36e4ab9781c82 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:15 -0400
Subject: SUNRPC: Constify rpc_pipe_ops...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..fae0d3e52b44 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 
 static unsigned int fnvhash32(const void *, size_t);
 
-static struct rpc_pipe_ops idmap_upcall_ops = {
+static const struct rpc_pipe_ops idmap_upcall_ops = {
 	.upcall		= idmap_pipe_upcall,
 	.downcall	= idmap_pipe_downcall,
 	.destroy_msg	= idmap_pipe_destroy_msg,
-- 
cgit v1.2.2


From 7d217caca5d704e48aa5e59aba0b3ad4c7af4fd2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:24 -0400
Subject: SUNRPC: Replace rpc_client->cl_dentry and cl_mnt, with a cl_path

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index fae0d3e52b44..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
 	if (idmap == NULL)
 		return -ENOMEM;
 
-	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
-					 idmap, &idmap_upcall_ops, 0);
+	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+			"idmap", idmap, &idmap_upcall_ops, 0);
 	if (IS_ERR(idmap->idmap_dentry)) {
 		error = PTR_ERR(idmap->idmap_dentry);
 		kfree(idmap);
-- 
cgit v1.2.2


From 2da8ca26c6bfad685bfddf39728eac1c83906aa9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:26 -0400
Subject: NFSD: Clean up the idmapper warning...

What part of 'internal use' is so hard to understand?

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4idmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..e9012ad36ac0 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -175,10 +175,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
 }
 
 static void
-warn_no_idmapd(struct cache_detail *detail)
+warn_no_idmapd(struct cache_detail *detail, int has_died)
 {
 	printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
-			detail->last_close? "died" : "not been started");
+			has_died ? "died" : "not been started");
 }
 
 
-- 
cgit v1.2.2


From bc74b4f5e63a09fb78e245794a0de1e5a2716bbe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:29 -0400
Subject: SUNRPC: Allow the cache_detail to specify alternative upcall
 mechanisms

For events that are rare, such as referral DNS lookups, it makes limited
sense to have a daemon constantly listening for upcalls on a channel. An
alternative in those cases might simply be to run the app that fills the
cache using call_usermodehelper_exec() and friends.

The following patch allows the cache_detail to specify alternative upcall
mechanisms for these particular cases.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/export.c    | 14 ++++++++++++--
 fs/nfsd/nfs4idmap.c | 16 ++++++++++++++--
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..d9462643155c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
+}
+
 static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
 static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
 static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
 	.hash_table	= expkey_table,
 	.name		= "nfsd.fh",
 	.cache_put	= expkey_put,
-	.cache_request	= expkey_request,
+	.cache_upcall	= expkey_upcall,
 	.cache_parse	= expkey_parse,
 	.cache_show	= expkey_show,
 	.match		= expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+}
+
 static struct svc_export *svc_export_update(struct svc_export *new,
 					    struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
 	.hash_table	= export_table,
 	.name		= "nfsd.export",
 	.cache_put	= svc_export_put,
-	.cache_request	= svc_export_request,
+	.cache_upcall	= svc_export_upcall,
 	.cache_parse	= svc_export_parse,
 	.cache_show	= svc_export_show,
 	.match		= svc_export_match,
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e9012ad36ac0..cdfa86fa1471 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -145,6 +145,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 	(*bpp)[-1] = '\n';
 }
 
+static int
+idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+	return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
+}
+
 static int
 idtoname_match(struct cache_head *ca, struct cache_head *cb)
 {
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = {
 	.hash_table	= idtoname_table,
 	.name		= "nfs4.idtoname",
 	.cache_put	= ent_put,
-	.cache_request	= idtoname_request,
+	.cache_upcall	= idtoname_upcall,
 	.cache_parse	= idtoname_parse,
 	.cache_show	= idtoname_show,
 	.warn_no_listener = warn_no_idmapd,
@@ -324,6 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 	(*bpp)[-1] = '\n';
 }
 
+static int
+nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+	return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
+}
+
 static int
 nametoid_match(struct cache_head *ca, struct cache_head *cb)
 {
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = {
 	.hash_table	= nametoid_table,
 	.name		= "nfs4.nametoid",
 	.cache_put	= ent_put,
-	.cache_request	= nametoid_request,
+	.cache_upcall	= nametoid_upcall,
 	.cache_parse	= nametoid_parse,
 	.cache_show	= nametoid_show,
 	.warn_no_listener = warn_no_idmapd,
-- 
cgit v1.2.2


From c333e073b7bf76dc819a6b4ce6bef88ee5fa5e50 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Mon, 10 Aug 2009 22:47:22 -0400
Subject: ext4: remove redundant test on unsigned

unsigned i_block cannot be less than 0.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index deb14a728791..9a4c929b16dc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -341,9 +341,7 @@ static int ext4_block_to_path(struct inode *inode,
 	int n = 0;
 	int final = 0;
 
-	if (i_block < 0) {
-		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
+	if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
-- 
cgit v1.2.2


From b1f485f20eb9b02cc7d2009556287f3939d480cc Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Mon, 10 Aug 2009 22:51:53 -0400
Subject: jbd2: round commit timer up to avoid uncommitted transaction

fix jiffie rounding in jbd commit timer setup code.  Rounding down
could cause the timer to be fired before the corresponding transaction
has expired.  That transaction can stay not committed forever if no
new transaction is created or expicit sync/umount happens.

Signed-off-by: Alex Zhuravlev (Tomas) <alex.zhuravlev@sun.com>
Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..4516a87ca55c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
-- 
cgit v1.2.2


From 91cc219ad963731191247c5f2db4118be2bc341a Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Mon, 10 Aug 2009 23:05:28 -0400
Subject: ext4: fix journal ref count in move_extent_par_page

move_extent_par_page calls a_ops->write_begin() to increase journal
handler's reference count. However, if either mext_replace_branches()
or ext4_get_block fails, the increased reference count isn't
decreased. This will cause a later attempt to umount of the fs to hang
forever. The patch addresses the issue by calling ext4_journal_stop()
if page is not NULL (which means a_ops->write_end() isn't invoked).

Signed-off-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..5821e0bee917 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -871,6 +871,7 @@ out:
 		if (PageLocked(page))
 			unlock_page(page);
 		page_cache_release(page);
+		ext4_journal_stop(handle);
 	}
 out2:
 	ext4_journal_stop(handle);
-- 
cgit v1.2.2


From 6ba495e9259cd9a0b40ebd6c315143535c92542f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Sep 2009 13:38:55 -0400
Subject: ext4: Add configurable run-time mballoc debugging

Allow mballoc debugging to be enabled at run-time instead of just at
compile time.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Kconfig   |  9 +++++++
 fs/ext4/mballoc.c | 81 ++++++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/mballoc.h | 16 ++++++++---
 3 files changed, 80 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 152304624644..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+	bool "EXT4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 68cde598b3bf..d80b300c1779 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
  */
 
 #include "mballoc.h"
+#include <linux/debugfs.h>
 #include <trace/events/ext4.h>
 
 /*
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	char *data;
 	char *bitmap;
 
-	mb_debug("init page %lu\n", page->index);
+	mb_debug(1, "init page %lu\n", page->index);
 
 	inode = page->mapping->host;
 	sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		set_bitmap_uptodate(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
-		mb_debug("read bitmap for group %u\n", first_group + i);
+		mb_debug(1, "read bitmap for group %u\n", first_group + i);
 	}
 
 	/* wait for I/O completion */
@@ -862,7 +863,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if ((first_block + i) & 1) {
 			/* this is block of buddy */
 			BUG_ON(incore == NULL);
-			mb_debug("put buddy for group %u in page %lu/%x\n",
+			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
@@ -878,7 +879,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		} else {
 			/* this is block of bitmap */
 			BUG_ON(incore != NULL);
-			mb_debug("put bitmap for group %u in page %lu/%x\n",
+			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
 
 			/* see comments in ext4_mb_put_pa() */
@@ -922,7 +923,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct inode *inode = sbi->s_buddy_cache;
 
-	mb_debug("load group %u\n", group);
+	mb_debug(1, "load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	grp = ext4_get_group_info(sb, group);
@@ -1851,7 +1852,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	struct inode *inode = sbi->s_buddy_cache;
 	struct page *page = NULL, *bitmap_page = NULL;
 
-	mb_debug("init group %u\n", group);
+	mb_debug(1, "init group %u\n", group);
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
@@ -2739,7 +2740,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 		kmem_cache_free(ext4_pspace_cachep, pa);
 	}
 	if (count)
-		mb_debug("mballoc: %u PAs left\n", count);
+		mb_debug(1, "mballoc: %u PAs left\n", count);
 
 }
 
@@ -2820,7 +2821,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
 
-		mb_debug("gonna free %u blocks in group %u (0x%p):",
+		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2855,9 +2856,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		ext4_mb_release_desc(&e4b);
 	}
 
-	mb_debug("freed %u blocks in %u structures\n", count, count2);
+	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
 }
 
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+	debugfs_dir = debugfs_create_dir("ext4", NULL);
+	if (debugfs_dir)
+		debugfs_debug = debugfs_create_u8("mballoc-debug",
+						  S_IRUGO | S_IWUSR,
+						  debugfs_dir,
+						  &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+	debugfs_remove(debugfs_debug);
+	debugfs_remove(debugfs_dir);
+}
+
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
 int __init init_ext4_mballoc(void)
 {
 	ext4_pspace_cachep =
@@ -2885,6 +2920,7 @@ int __init init_ext4_mballoc(void)
 		kmem_cache_destroy(ext4_ac_cachep);
 		return -ENOMEM;
 	}
+	ext4_create_debugfs_entry();
 	return 0;
 }
 
@@ -2898,6 +2934,7 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+	ext4_remove_debugfs_entry();
 }
 
 
@@ -3042,7 +3079,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
 		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
 	else
 		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
-	mb_debug("#%u: goal %u blocks for locality group\n",
+	mb_debug(1, "#%u: goal %u blocks for locality group\n",
 		current->pid, ac->ac_g_ex.fe_len);
 }
 
@@ -3232,7 +3269,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 	}
 
-	mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
 		(unsigned) orig_size, (unsigned) start);
 }
 
@@ -3281,7 +3318,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
 	BUG_ON(pa->pa_free < len);
 	pa->pa_free -= len;
 
-	mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
 }
 
 /*
@@ -3305,7 +3342,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 	 * in on-disk bitmap -- see ext4_mb_release_context()
 	 * Other CPUs are prevented from allocating from this pa by lg_mutex
 	 */
-	mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
 }
 
 /*
@@ -3484,7 +3521,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		preallocated += len;
 		count++;
 	}
-	mb_debug("prellocated %u for group %u\n", preallocated, group);
+	mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
 }
 
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3619,7 +3656,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	pa->pa_deleted = 0;
 	pa->pa_type = MB_INODE_PA;
 
-	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 	trace_ext4_mb_new_inode_pa(ac, pa);
 
@@ -3679,7 +3716,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_deleted = 0;
 	pa->pa_type = MB_GROUP_PA;
 
-	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 	trace_ext4_mb_new_group_pa(ac, pa);
 
@@ -3758,7 +3795,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
 		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
 				le32_to_cpu(sbi->s_es->s_first_data_block);
-		mb_debug("    free preallocated %u/%u in group %u\n",
+		mb_debug(1, "    free preallocated %u/%u in group %u\n",
 				(unsigned) start, (unsigned) next - bit,
 				(unsigned) group);
 		free += next - bit;
@@ -3849,7 +3886,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	int busy = 0;
 	int free = 0;
 
-	mb_debug("discard preallocation for group %u\n", group);
+	mb_debug(1, "discard preallocation for group %u\n", group);
 
 	if (list_empty(&grp->bb_prealloc_list))
 		return 0;
@@ -3973,7 +4010,7 @@ void ext4_discard_preallocations(struct inode *inode)
 		return;
 	}
 
-	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
 	trace_ext4_discard_preallocations(inode);
 
 	INIT_LIST_HEAD(&list);
@@ -4078,7 +4115,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 {
 	BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
 }
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
 	struct super_block *sb = ac->ac_sb;
@@ -4227,7 +4264,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	 * locality group. this is a policy, actually */
 	ext4_mb_group_or_file(ac);
 
-	mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
 			"left: %u/%u, right %u/%u to %swritable\n",
 			(unsigned) ar->len, (unsigned) ar->logical,
 			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4249,7 +4286,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_prealloc_space *pa, *tmp;
 	struct ext4_allocation_context *ac;
 
-	mb_debug("discard locality group preallocation\n");
+	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..9db890d4d275 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
 
 /*
  */
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...)	printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...)	                                        \
+	do {								\
+		if ((n) <= mb_enable_debug) {		        	\
+			printk(KERN_DEBUG "(%s, %d): %s: ",		\
+			       __FILE__, __LINE__, __func__);		\
+			printk(fmt, ## a);				\
+		}							\
+	} while (0)
 #else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
 #endif
 
 /*
-- 
cgit v1.2.2


From 0ef90db93a4ddfc300af288c2a1bfc1e6c79da64 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 9 Aug 2009 16:46:13 -0400
Subject: ext4: Display the mballoc flags in mb_history in hex instead of
 decimal

Displaying the flags in base 16 makes it easier to see which flags
have been set.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 22 +++++++++++-----------
 fs/ext4/mballoc.c |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e267727cc62d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,27 @@ typedef unsigned int ext4_group_t;
 
 
 /* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE		1
+#define EXT4_MB_HINT_MERGE		0x0001
 /* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		2
+#define EXT4_MB_HINT_RESERVED		0x0002
 /* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		4
+#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
-#define EXT4_MB_HINT_FIRST		8
+#define EXT4_MB_HINT_FIRST		0x0008
 /* search for the best chunk */
-#define EXT4_MB_HINT_BEST		16
+#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
-#define EXT4_MB_HINT_DATA		32
+#define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC		64
+#define EXT4_MB_HINT_NOPREALLOC		0x0040
 /* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC	128
+#define EXT4_MB_HINT_GROUP_ALLOC	0x0080
 /* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY		256
+#define EXT4_MB_HINT_GOAL_ONLY		0x0100
 /* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL		512
+#define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED      1024
+#define EXT4_MB_DELALLOC_RESERVED	0x0400
 
 
 struct ext4_allocation_request {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d80b300c1779..3434c603432d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2157,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
-				"%-5s %-2s %-5s %-5s %-5s %-6s\n",
+				"%-5s %-2s %-6s %-5s %-5s %-6s\n",
 			  "pid", "inode", "original", "goal", "result", "found",
 			   "grps", "cr", "flags", "merge", "tail", "broken");
 		return 0;
@@ -2165,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 
 	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
-			"%-5u %-5s %-5u %-6u\n";
+			"0x%04x %-5s %-5u %-6u\n";
 		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-- 
cgit v1.2.2


From 4ba74d00a20256e22f159cb288ff34b587608917 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 9 Aug 2009 22:01:13 -0400
Subject: ext4: Fix bugs in mballoc's stream allocation mode

The logic around sbi->s_mb_last_group and sbi->s_mb_last_start was all
screwed up.  These fields were getting unconditionally all the time,
set even when stream allocation had not taken place, and if they were
being used when the file was smaller than s_mb_stream_request, which
is when the allocation should _not_ be doing stream allocation.

Fix this by determining whether or not we stream allocation should
take place once, in ext4_mb_group_or_file(), and setting a flag which
gets used in ext4_mb_regular_allocator() and ext4_mb_use_best_found().
This simplifies the code and assures that we are consistently using
(or not using) the stream allocation logic.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/mballoc.c | 23 ++++++++++-------------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e267727cc62d..70aa951ecb3c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC		0x0800
 
 
 struct ext4_allocation_request {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3434c603432d..90a30ce822fc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1361,7 +1361,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->alloc_semp =  e4b->alloc_semp;
 	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
-	if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		spin_lock(&sbi->s_md_lock);
 		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
 		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1939,7 +1939,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
-	loff_t size, isize;
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
@@ -1975,20 +1974,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	}
 
 	bsbits = ac->ac_sb->s_blocksize_bits;
-	/* if stream allocation is enabled, use global goal */
-	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
-	isize = i_size_read(ac->ac_inode) >> bsbits;
-	if (size < isize)
-		size = isize;
 
-	if (size < sbi->s_mb_stream_request &&
-			(ac->ac_flags & EXT4_MB_HINT_DATA)) {
+	/* if stream allocation is enabled, use global goal */
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		/* TBD: may be hot point */
 		spin_lock(&sbi->s_md_lock);
 		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
 		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
 		spin_unlock(&sbi->s_md_lock);
 	}
+
 	/* Let's just scan groups to find more-less suitable blocks */
 	cr = ac->ac_2order ? 0 : 1;
 	/*
@@ -4192,16 +4187,18 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return;
 
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
 	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
 	isize = i_size_read(ac->ac_inode) >> bsbits;
 	size = max(size, isize);
 
 	/* don't use group allocation for large files */
-	if (size >= sbi->s_mb_stream_request)
-		return;
-
-	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+	if (size >= sbi->s_mb_stream_request) {
+		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
 		return;
+	}
 
 	BUG_ON(ac->ac_lg != NULL);
 	/*
-- 
cgit v1.2.2


From d3c8660233d3f2801e14eaf722937ff4ed49bfb7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:38 +0200
Subject: mm_for_maps: shift down_read(mmap_sem) to the caller

mm_for_maps() takes ->mmap_sem after security checks, this looks
strange and obfuscates the locking rules. Move this lock to its
single caller, m_start().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c       | 8 +++-----
 fs/proc/task_mmu.c   | 1 +
 fs/proc/task_nommu.c | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 917f338a6739..f3c2e4085fed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
-	if (!mm)
-		return NULL;
-	if (mm != current->mm) {
+
+	if (mm && mm != current->mm) {
 		/*
 		 * task->mm can be changed before security check,
 		 * in that case we must notice the change after.
@@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
 		    mm != task->mm) {
 			mmput(mm);
-			return NULL;
+			mm = NULL;
 		}
 	}
-	down_read(&mm->mmap_sem);
 	return mm;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
+	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
+	down_read(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-- 
cgit v1.2.2


From 896a6de40ef3814525632609799af909338f50c3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:40 +0200
Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec

The problem is minor, but without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Now we do not need to re-check task->mm after ptrace_may_access(), it
can't be changed to the new mm under us.

Strictly speaking, this also fixes another very minor problem. Unless
security check fails or the task exits mm_for_maps() should never
return NULL, the caller should get either old or new ->mm.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3c2e4085fed..175db258942f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task)
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm = get_task_mm(task);
+	struct mm_struct *mm;
 
-	if (mm && mm != current->mm) {
-		/*
-		 * task->mm can be changed before security check,
-		 * in that case we must notice the change after.
-		 */
-		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
-		    mm != task->mm) {
-			mmput(mm);
-			mm = NULL;
-		}
+	if (mutex_lock_killable(&task->cred_guard_mutex))
+		return NULL;
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		mmput(mm);
+		mm = NULL;
 	}
+	mutex_unlock(&task->cred_guard_mutex);
+
 	return mm;
 }
 
-- 
cgit v1.2.2


From 1905b1bfc0de6f69a61dc03cac0d86a04b3216bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Aug 2009 18:13:23 +0900
Subject: chrdev: implement __[un]register_chrdev()

[un]register_chrdev() assume minor range 0-255.  This patch adds __
prefixed versions which take @minorbase and @count explicitly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 fs/char_dev.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..2f18c1e4e301 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -237,8 +237,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
 }
 
 /**
- * register_chrdev() - Register a major number for character devices.
+ * __register_chrdev() - create and register a cdev occupying a range of minors
  * @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
  * @name: name of this range of devices
  * @fops: file operations associated with this devices
  *
@@ -254,19 +256,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
  * /dev. It only helps to keep track of the different owners of devices. If
  * your module name has only one type of devices it's ok to use e.g. the name
  * of the module here.
- *
- * This function registers a range of 256 minor numbers. The first minor number
- * is 0.
  */
-int register_chrdev(unsigned int major, const char *name,
-		    const struct file_operations *fops)
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+		      unsigned int count, const char *name,
+		      const struct file_operations *fops)
 {
 	struct char_device_struct *cd;
 	struct cdev *cdev;
 	char *s;
 	int err = -ENOMEM;
 
-	cd = __register_chrdev_region(major, 0, 256, name);
+	cd = __register_chrdev_region(major, baseminor, count, name);
 	if (IS_ERR(cd))
 		return PTR_ERR(cd);
 	
@@ -280,7 +280,7 @@ int register_chrdev(unsigned int major, const char *name,
 	for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
 		*s = '!';
 		
-	err = cdev_add(cdev, MKDEV(cd->major, 0), 256);
+	err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
 	if (err)
 		goto out;
 
@@ -290,7 +290,7 @@ int register_chrdev(unsigned int major, const char *name,
 out:
 	kobject_put(&cdev->kobj);
 out2:
-	kfree(__unregister_chrdev_region(cd->major, 0, 256));
+	kfree(__unregister_chrdev_region(cd->major, baseminor, count));
 	return err;
 }
 
@@ -316,10 +316,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
 	}
 }
 
-void unregister_chrdev(unsigned int major, const char *name)
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count.  This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+			 unsigned int count, const char *name)
 {
 	struct char_device_struct *cd;
-	cd = __unregister_chrdev_region(major, 0, 256);
+
+	cd = __unregister_chrdev_region(major, baseminor, count);
 	if (cd && cd->cdev)
 		cdev_del(cd->cdev);
 	kfree(cd);
@@ -568,6 +581,6 @@ EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
 EXPORT_SYMBOL(cdev_index);
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
-- 
cgit v1.2.2


From 074cc1deec5dee63fcd5d966b36fa4f3765b50fc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 10 Aug 2009 08:54:13 -0400
Subject: NFS: Add a ->migratepage() aop for NFS

Make NFS a bit more friendly to NUMA and memory hot removal...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c     |  1 +
 fs/nfs/internal.h |  6 ++++
 fs/nfs/write.c    | 91 +++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 76 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..dfc89671dc94 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -479,6 +479,7 @@ const struct address_space_operations nfs_file_aops = {
 	.invalidatepage = nfs_invalidate_page,
 	.releasepage = nfs_release_page,
 	.direct_IO = nfs_direct_IO,
+	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
 };
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e2ccb4a4398a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -248,6 +248,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+		struct page *, struct page *);
+#else
+#define nfs_migrate_page NULL
+#endif
 
 /* nfs4proc.c */
 extern int _nfs4_call_sync(struct nfs_server *server,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..6240e644f249 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/swap.h>
+#include <linux/migrate.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "nfs4_fs.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -220,24 +222,17 @@ static void nfs_end_page_writeback(struct page *page)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 
-/*
- * Find an associated nfs write request, and prepare to flush it out
- * May return an error if the user signalled nfs_wait_on_request().
- */
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct nfs_page *req;
 	int ret;
 
 	spin_lock(&inode->i_lock);
-	for(;;) {
+	for (;;) {
 		req = nfs_page_find_request_locked(page);
-		if (req == NULL) {
-			spin_unlock(&inode->i_lock);
-			return 0;
-		}
+		if (req == NULL)
+			break;
 		if (nfs_set_page_tag_locked(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -249,23 +244,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		ret = nfs_wait_on_request(req);
 		nfs_release_request(req);
 		if (ret != 0)
-			return ret;
+			return ERR_PTR(ret);
 		spin_lock(&inode->i_lock);
 	}
-	if (test_bit(PG_CLEAN, &req->wb_flags)) {
-		spin_unlock(&inode->i_lock);
-		BUG();
-	}
-	if (nfs_set_page_writeback(page) != 0) {
-		spin_unlock(&inode->i_lock);
-		BUG();
-	}
 	spin_unlock(&inode->i_lock);
+	return req;
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct page *page)
+{
+	struct nfs_page *req;
+	int ret = 0;
+
+	req = nfs_find_and_lock_request(page);
+	if (!req)
+		goto out;
+	ret = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	ret = nfs_set_page_writeback(page);
+	BUG_ON(ret != 0);
+	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
+
 	if (!nfs_pageio_add_request(pgio, req)) {
 		nfs_redirty_request(req);
-		return pgio->pg_error;
+		ret = pgio->pg_error;
 	}
-	return 0;
+out:
+	return ret;
 }
 
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -1582,6 +1594,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 }
 
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page)
+{
+	struct nfs_page *req;
+	int ret;
+
+	if (PageFsCache(page))
+		nfs_fscache_release_page(page, GFP_KERNEL);
+
+	req = nfs_find_and_lock_request(page);
+	ret = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	ret = migrate_page(mapping, newpage, page);
+	if (!req)
+		goto out;
+	if (ret)
+		goto out_unlock;
+	page_cache_get(newpage);
+	req->wb_page = newpage;
+	SetPagePrivate(newpage);
+	set_page_private(newpage, page_private(page));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+out_unlock:
+	nfs_clear_page_tag_locked(req);
+	nfs_release_request(req);
+out:
+	return ret;
+}
+#endif
+
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-- 
cgit v1.2.2


From 38c73044f5f4da2ef4339319b170e5e19f8dec87 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Mon, 10 Aug 2009 08:54:16 -0400
Subject: NFS: read-modify-write page updating

Hi.

I have a proposal for possibly resolving this issue.

I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.

The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page.  The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.

When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes.  This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.

This algorithm could be described as modify-write-read.  It
is most efficient when the application only updates pages
and does not read them.

My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path.  The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.

If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data.  If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.

This would optimize for applications which randomly access
and update portions of files.  The linkage editor for the
C compiler is an example of such a thing.

I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel.  The kernel without the
patch generated about 269,500 WRITE requests.  The modified
kernel containing the patch generated about 261,000 WRITE
requests.  Thus, about 8,500 fewer WRITE requests were
generated.  I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.

The difference between this patch and the previous one was
to remove the unneeded PageDirty() test.  I then retested to
ensure that the resulting system continued to behave as
desired.

	Thanx...

		ps

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index dfc89671dc94..5021b75d2d1e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -327,6 +327,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return nfs_do_fsync(ctx, inode);
 }
 
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer.  In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+			loff_t pos, unsigned len)
+{
+	unsigned int pglen = nfs_page_length(page);
+	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int end = offset + len;
+
+	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
+	    !PageUptodate(page) &&		/* Uptodate? */
+	    !PagePrivate(page) &&		/* i/o request already? */
+	    pglen &&				/* valid bytes of file? */
+	    (end < pglen || offset))		/* replace all valid bytes? */
+		return 1;
+	return 0;
+}
+
 /*
  * This does the "real" work of the write. We must allocate and lock the
  * page to be sent back to the generic routine, which then copies the
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	int ret;
-	pgoff_t index;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
-	index = pos >> PAGE_CACHE_SHIFT;
+	int once_thru = 0;
 
 	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
 	 * sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
+	} else if (!once_thru &&
+		   nfs_want_read_modify_write(file, page, pos, len)) {
+		once_thru = 1;
+		ret = nfs_readpage(file, page);
+		page_cache_release(page);
+		if (!ret)
+			goto start;
 	}
 	return ret;
 }
-- 
cgit v1.2.2


From e576e05a73bc1a00cdf56630942dbada1bf280a1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 10 Aug 2009 08:54:16 -0400
Subject: nfs: remove superfluous BUG_ON()s Subject: [PATCH] nfs: remove
 superfluous BUG_ON()s

Remove duplicated BUG_ON()s from nfs[4]_create_server()
(we make the same checks earlier in both functions).

This takes care of the following entries from Dan's list:

fs/nfs/client.c +1078 nfs_create_server(47) warning: variable derefenced before check 'server->nfs_client'
fs/nfs/client.c +1079 nfs_create_server(48) warning: variable derefenced before check 'server->nfs_client->rpc_ops'
fs/nfs/client.c +1363 nfs4_create_server(43) warning: variable derefenced before check 'server->nfs_client'
fs/nfs/client.c +1364 nfs4_create_server(44) warning: variable derefenced before check 'server->nfs_

Reported-by: Dan Carpenter <error27@gmail.com>
Cc: corbet@lwn.net
Cc: eteo@redhat.com
Cc: Julia Lawall <julia@diku.dk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8f34fd8028fc..d36925f9b952 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1077,10 +1077,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
 	spin_lock(&nfs_client_lock);
 	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1362,10 +1358,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
 		server->namelen = NFS4_MAXNAMLEN;
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
 	spin_lock(&nfs_client_lock);
 	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
-- 
cgit v1.2.2


From e75bc1c89e0c7dda0b140408ddee2ffaef7ba6d4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:18:54 +0300
Subject: nfs: nfs4xdr: get rid of WRITE32

s/WRITE32/*p++ = cpu_to_be32/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 291 +++++++++++++++++++++++++++----------------------------
 1 file changed, 145 insertions(+), 146 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..9a03b24ead5f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,7 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE32(n)               *p++ = htonl(n)
 #define WRITE64(n)               do {				\
 	*p++ = htonl((uint32_t)((n) >> 32));				\
 	*p++ = htonl((uint32_t)(n));					\
@@ -750,11 +749,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
-	WRITE32(hdr->taglen);
+	*p++ = cpu_to_be32(hdr->taglen);
 	WRITEMEM(hdr->tag, hdr->taglen);
-	WRITE32(hdr->minorversion);
+	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	WRITE32(hdr->nops);
+	*p++ = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -835,7 +834,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	 * We write the bitmap length now, but leave the bitmap and the attribute
 	 * buffer length to be backfilled at the end of this routine.
 	 */
-	WRITE32(2);
+	*p++ = cpu_to_be32(2);
 	q = p;
 	p += 3;
 
@@ -845,39 +844,39 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
-		WRITE32(iap->ia_mode & S_IALLUGO);
+		*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		WRITE32(owner_namelen);
+		*p++ = cpu_to_be32(owner_namelen);
 		WRITEMEM(owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		WRITE32(owner_grouplen);
+		*p++ = cpu_to_be32(owner_grouplen);
 		WRITEMEM(owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_ATIME) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 	if (iap->ia_valid & ATTR_MTIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_MTIME) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 
 	/*
@@ -901,8 +900,8 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_ACCESS);
-	WRITE32(access);
+	*p++ = cpu_to_be32(OP_ACCESS);
+	*p++ = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -912,8 +911,8 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
-	WRITE32(OP_CLOSE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_CLOSE);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
@@ -924,9 +923,9 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_COMMIT);
+	*p++ = cpu_to_be32(OP_COMMIT);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -936,20 +935,20 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_CREATE);
-	WRITE32(create->ftype);
+	*p++ = cpu_to_be32(OP_CREATE);
+	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		RESERVE_SPACE(4);
-		WRITE32(create->u.symlink.len);
+		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		RESERVE_SPACE(8);
-		WRITE32(create->u.device.specdata1);
-		WRITE32(create->u.device.specdata2);
+		*p++ = cpu_to_be32(create->u.device.specdata1);
+		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -957,7 +956,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	RESERVE_SPACE(4 + create->name->len);
-	WRITE32(create->name->len);
+	*p++ = cpu_to_be32(create->name->len);
 	WRITEMEM(create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
@@ -970,9 +969,9 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_GETATTR);
-	WRITE32(1);
-	WRITE32(bitmap);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -982,10 +981,10 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_GETATTR);
-	WRITE32(2);
-	WRITE32(bm0);
-	WRITE32(bm1);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(bm0);
+	*p++ = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -1013,7 +1012,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_GETFH);
+	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1023,8 +1022,8 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_LINK);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_LINK);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
@@ -1053,26 +1052,26 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	__be32 *p;
 
 	RESERVE_SPACE(32);
-	WRITE32(OP_LOCK);
-	WRITE32(nfs4_lock_type(args->fl, args->block));
-	WRITE32(args->reclaim);
+	*p++ = cpu_to_be32(OP_LOCK);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+	*p++ = cpu_to_be32(args->reclaim);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
-	WRITE32(args->new_lock_owner);
+	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
-		WRITE32(args->open_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		WRITE64(args->lock_owner.clientid);
-		WRITE32(16);
+		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
 		WRITE64(args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
 		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1083,12 +1082,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(52);
-	WRITE32(OP_LOCKT);
-	WRITE32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(OP_LOCKT);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE64(args->lock_owner.clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
 	WRITE64(args->lock_owner.id);
 	hdr->nops++;
@@ -1100,9 +1099,9 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
-	WRITE32(OP_LOCKU);
-	WRITE32(nfs4_lock_type(args->fl, 0));
-	WRITE32(args->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_LOCKU);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
@@ -1116,8 +1115,8 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_LOOKUP);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_LOOKUP);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
@@ -1130,18 +1129,18 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	RESERVE_SPACE(8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
-		WRITE32(NFS4_SHARE_ACCESS_READ);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
 		break;
 	case FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_WRITE);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
 		break;
 	case FMODE_READ|FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_BOTH);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
 		break;
 	default:
-		WRITE32(0);
+		*p++ = cpu_to_be32(0);
 	}
-	WRITE32(0);		/* for linux, share_deny = 0 always */
+	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,12 +1151,12 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * owner 4 = 32
  */
 	RESERVE_SPACE(8);
-	WRITE32(OP_OPEN);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_OPEN);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
 	WRITE64(arg->clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
 	WRITE64(arg->id);
 }
@@ -1169,11 +1168,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	RESERVE_SPACE(4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		WRITE32(NFS4_CREATE_UNCHECKED);
+		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		WRITE32(NFS4_CREATE_EXCLUSIVE);
+		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1185,11 +1184,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	RESERVE_SPACE(4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		WRITE32(NFS4_OPEN_NOCREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		WRITE32(NFS4_OPEN_CREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1201,13 +1200,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	RESERVE_SPACE(4);
 	switch (delegation_type) {
 	case 0:
-		WRITE32(NFS4_OPEN_DELEGATE_NONE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_READ);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1219,7 +1218,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_NULL);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1228,7 +1227,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1237,7 +1236,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1268,9 +1267,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_CONFIRM);
+	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1280,9 +1279,9 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_DOWNGRADE);
+	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1295,8 +1294,8 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_PUTFH);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_PUTFH);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
@@ -1307,7 +1306,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_PUTROOTFH);
+	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1330,13 +1329,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READ);
+	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1350,19 +1349,19 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
-	WRITE32(OP_READDIR);
+	*p++ = cpu_to_be32(OP_READDIR);
 	WRITE64(readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	WRITE32(readdir->count >> 1);  /* We're not doing readdirplus */
-	WRITE32(readdir->count);
-	WRITE32(2);
+	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(readdir->count);
+	*p++ = cpu_to_be32(2);
 	/* Switch to mounted_on_fileid if the server supports it */
 	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
 		attrs[0] &= ~FATTR4_WORD0_FILEID;
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
-	WRITE32(attrs[0] & readdir->bitmask[0]);
-	WRITE32(attrs[1] & readdir->bitmask[1]);
+	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
+	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1379,7 +1378,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READLINK);
+	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1389,8 +1388,8 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_REMOVE);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_REMOVE);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
@@ -1401,12 +1400,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	__be32 *p;
 
 	RESERVE_SPACE(8 + oldname->len);
-	WRITE32(OP_RENAME);
-	WRITE32(oldname->len);
+	*p++ = cpu_to_be32(OP_RENAME);
+	*p++ = cpu_to_be32(oldname->len);
 	WRITEMEM(oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
-	WRITE32(newname->len);
+	*p++ = cpu_to_be32(newname->len);
 	WRITEMEM(newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
@@ -1417,7 +1416,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_RENEW);
+	*p++ = cpu_to_be32(OP_RENEW);
 	WRITE64(client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
@@ -1429,7 +1428,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_RESTOREFH);
+	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1440,15 +1439,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
-	WRITE32(1);
-	WRITE32(FATTR4_WORD0_ACL);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	RESERVE_SPACE(4);
-	WRITE32(arg->acl_len);
+	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1461,7 +1460,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SAVEFH);
+	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1471,7 +1470,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
@@ -1483,16 +1482,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	__be32 *p;
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID);
+	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_prog);
+	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_cb_ident);
+	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1502,7 +1501,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	__be32 *p;
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID_CONFIRM);
+	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	WRITE64(client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
@@ -1514,14 +1513,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_WRITE);
+	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
 	WRITE64(args->offset);
-	WRITE32(args->stable);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p++ = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1534,7 +1533,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
-	WRITE32(OP_DELEGRETURN);
+	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
@@ -1549,15 +1548,15 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	__be32 *p;
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
-	WRITE32(OP_EXCHANGE_ID);
+	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	RESERVE_SPACE(12);
-	WRITE32(args->flags);
-	WRITE32(0);	/* zero length state_protect4_a */
-	WRITE32(0);	/* zero length implementation id array */
+	*p++ = cpu_to_be32(args->flags);
+	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
+	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1572,54 +1571,54 @@ static void encode_create_session(struct xdr_stream *xdr,
 	struct nfs_client *clp = args->client;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_CREATE_SESSION);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
 	WRITE64(clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
-	WRITE32(clp->cl_seqid);			/*Sequence id */
-	WRITE32(args->flags);			/*flags */
+	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
+	*p++ = cpu_to_be32(args->flags);			/*flags */
 
 	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->fc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->fc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->fc_attrs.max_ops);	/* max operations */
-	WRITE32(args->fc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->fc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->fc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	/* Back Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->bc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->bc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->bc_attrs.max_ops);	/* max operations */
-	WRITE32(args->bc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->bc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	RESERVE_SPACE(4);
-	WRITE32(args->cb_program);		/* cb_program */
+	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
 
 	RESERVE_SPACE(4);			/* # of security flavors */
-	WRITE32(1);
+	*p++ = cpu_to_be32(1);
 
 	RESERVE_SPACE(4);
-	WRITE32(RPC_AUTH_UNIX);			/* auth_sys */
+	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
 	RESERVE_SPACE(4);
-	WRITE32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
-	WRITE32(len);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(machine_name, len);
-	WRITE32(0);				/* UID */
-	WRITE32(0);				/* GID */
-	WRITE32(0);				/* No more gids */
+	*p++ = cpu_to_be32(0);				/* UID */
+	*p++ = cpu_to_be32(0);				/* GID */
+	*p++ = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1630,7 +1629,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
-	WRITE32(OP_DESTROY_SESSION);
+	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
@@ -1656,7 +1655,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	slot = tp->slots + args->sa_slotid;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SEQUENCE);
+	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
 	 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1672,10 +1671,10 @@ static void encode_sequence(struct xdr_stream *xdr,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(slot->seq_nr);
-	WRITE32(args->sa_slotid);
-	WRITE32(tp->highest_used_slotid);
-	WRITE32(args->sa_cache_this);
+	*p++ = cpu_to_be32(slot->seq_nr);
+	*p++ = cpu_to_be32(args->sa_slotid);
+	*p++ = cpu_to_be32(tp->highest_used_slotid);
+	*p++ = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.2


From b95be5a976848febff82edb21d5b4351b3997bf6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:01 +0300
Subject: nfs: nfs4xdr: get rid of WRITE64

s/WRITE64/p = xdr_encode_hyper(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9a03b24ead5f..5d80766fc672 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE64(n)               do {				\
-	*p++ = htonl((uint32_t)((n) >> 32));				\
-	*p++ = htonl((uint32_t)(n));					\
-} while (0)
 #define WRITEMEM(ptr,nbytes)     do {				\
 	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
 } while (0)
@@ -840,7 +836,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 
 	if (iap->ia_valid & ATTR_SIZE) {
 		bmval0 |= FATTR4_WORD0_SIZE;
-		WRITE64(iap->ia_size);
+		p = xdr_encode_hyper(p, iap->ia_size);
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
@@ -924,7 +920,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 
 	RESERVE_SPACE(16);
 	*p++ = cpu_to_be32(OP_COMMIT);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
@@ -1055,18 +1051,18 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-		WRITE64(args->lock_owner.clientid);
+		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
-		WRITE64(args->lock_owner.id);
+		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
@@ -1084,12 +1080,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	RESERVE_SPACE(52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
-	WRITE64(args->lock_owner.clientid);
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
-	WRITE64(args->lock_owner.id);
+	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1103,8 +1099,8 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1155,10 +1151,10 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
-	WRITE64(arg->clientid);
+	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
-	WRITE64(arg->id);
+	p = xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1334,7 +1330,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
@@ -1350,7 +1346,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
-	WRITE64(readdir->cookie);
+	p = xdr_encode_hyper(p, readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
@@ -1417,7 +1413,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	RESERVE_SPACE(12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	WRITE64(client_stateid->cl_clientid);
+	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1502,7 +1498,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	WRITE64(client_state->cl_clientid);
+	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
@@ -1518,7 +1514,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
 
@@ -1574,7 +1570,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
-	WRITE64(clp->cl_ex_clid);
+	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
-- 
cgit v1.2.2


From 93f0cf25944695e1229fe90a2897af0211fbd425 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:06 +0300
Subject: nfs: nfs4xdr: get rid of WRITEMEM

s/WRITEMEM(/p = xdr_encode_opaque_fixed(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 68 ++++++++++++++++++++++++++------------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5d80766fc672..17915c8ea7fa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITEMEM(ptr,nbytes)     do {				\
-	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
-} while (0)
-
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = xdr_reserve_space(xdr, nbytes);			\
 	BUG_ON(!p);						\
@@ -746,7 +742,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
 	*p++ = cpu_to_be32(hdr->taglen);
-	WRITEMEM(hdr->tag, hdr->taglen);
+	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p++ = cpu_to_be32(hdr->nops);
@@ -845,12 +841,12 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
 		*p++ = cpu_to_be32(owner_namelen);
-		WRITEMEM(owner_name, owner_namelen);
+		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
 		*p++ = cpu_to_be32(owner_grouplen);
-		WRITEMEM(owner_group, owner_grouplen);
+		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -909,7 +905,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -953,7 +949,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	RESERVE_SPACE(4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	WRITEMEM(create->name->name, create->name->len);
+	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1020,7 +1016,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1057,16 +1053,16 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
-		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
-		WRITEMEM("lock id:", 8);
+		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
-		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
@@ -1084,7 +1080,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("lock id:", 8);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
@@ -1098,7 +1094,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
-	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
@@ -1113,7 +1109,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(name->name, len);
+	p = xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1153,7 +1149,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	RESERVE_SPACE(28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("open id:", 8);
+	p = xdr_encode_opaque_fixed(p, "open id:", 8);
 	p = xdr_encode_hyper(p, arg->id);
 }
 
@@ -1233,7 +1229,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1264,7 +1260,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
@@ -1276,7 +1272,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
@@ -1292,7 +1288,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(fh->data, len);
+	p = xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1315,9 +1311,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	RESERVE_SPACE(NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1347,7 +1343,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
-	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
@@ -1386,7 +1382,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1398,11 +1394,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	RESERVE_SPACE(8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	WRITEMEM(oldname->name, oldname->len);
+	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	WRITEMEM(newname->name, newname->len);
+	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1436,7 +1432,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1467,7 +1463,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1479,7 +1475,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
@@ -1499,7 +1495,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1530,7 +1526,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1545,7 +1541,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
@@ -1611,7 +1607,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(machine_name, len);
+	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p++ = cpu_to_be32(0);				/* No more gids */
@@ -1626,7 +1622,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1666,7 +1662,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-- 
cgit v1.2.2


From 42edd698125b76a38bd9999015202db036dfbc76 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:13 +0300
Subject: nfs: nfs4xdr: optimize RESERVE_SPACE in encode_create_session and
 encode_sequence

Coalesce multilpe constant RESERVE_SPACEs into one

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 17915c8ea7fa..d460d81758ae 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1562,17 +1562,15 @@ static void encode_create_session(struct xdr_stream *xdr,
 	uint32_t len;
 	struct nfs_client *clp = args->client;
 
-	RESERVE_SPACE(4);
-	*p++ = cpu_to_be32(OP_CREATE_SESSION);
+	len = scnprintf(machine_name, sizeof(machine_name), "%s",
+			clp->cl_ipaddr);
 
-	RESERVE_SPACE(8);
+	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
-
-	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
-	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
 	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
@@ -1591,21 +1589,12 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
 	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
-
-	RESERVE_SPACE(4);			/* # of security flavors */
 	*p++ = cpu_to_be32(1);
-
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	len = scnprintf(machine_name, sizeof(machine_name), "%s",
-			clp->cl_ipaddr);
-	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
@@ -1646,7 +1635,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4);
+	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
@@ -1661,7 +1650,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 		((u32 *)session->sess_id.data)[3],
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
-	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
-- 
cgit v1.2.2


From 2220f13a8b90d2259f3094cb54cf4de67d8eee2d Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:18 +0300
Subject: nfs: nfs4xdr: encode_compound_hdr does not have to round up reserved
 bytes

This is already done by xdr_reserve_space and since encode_compound_hdr
is adding a byte count to "12" which is already word aligned, the xdr
level rounding will work just as well.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d460d81758ae..7c2b162dd845 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -740,7 +740,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
+	RESERVE_SPACE(12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
-- 
cgit v1.2.2


From 13c65ce90006badccd5663e558e3c85869ae5ce6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:25 +0300
Subject: nfs: nfs4xdr: change RESERVE_SPACE macro into a static helper

In order to open code and expose the result pointer assignment.

Alternatively, we can open code the call to xdr_reserve_space
and do the BUG_ON an the error case at the call site.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 138 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 65 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c2b162dd845..3bcde49fbea8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,20 +702,12 @@ struct compound_hdr {
 	u32		minorversion;
 };
 
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define RESERVE_SPACE(nbytes)	do {				\
-	p = xdr_reserve_space(xdr, nbytes);			\
-	BUG_ON(!p);						\
-} while (0)
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+	BUG_ON(!p);
+	return p;
+}
 
 static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
@@ -740,7 +732,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12 + hdr->taglen);
+	p = reserve_space(xdr, 12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
@@ -820,7 +812,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 16;
 	else if (iap->ia_valid & ATTR_MTIME)
 		len += 4;
-	RESERVE_SPACE(len);
+	p = reserve_space(xdr, len);
 
 	/*
 	 * We write the bitmap length now, but leave the bitmap and the attribute
@@ -891,7 +883,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
 	*p++ = cpu_to_be32(access);
 	hdr->nops++;
@@ -902,7 +894,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
@@ -914,7 +906,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
@@ -926,19 +918,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
 	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
-		RESERVE_SPACE(4);
+		p = reserve_space(xdr, 4);
 		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
-		RESERVE_SPACE(8);
+		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
 		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
@@ -947,7 +939,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	RESERVE_SPACE(4 + create->name->len);
+	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
 	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
@@ -960,7 +952,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(bitmap);
@@ -972,7 +964,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
@@ -1003,7 +995,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
@@ -1013,7 +1005,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1043,7 +1035,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 {
 	__be32 *p;
 
-	RESERVE_SPACE(32);
+	p = reserve_space(xdr, 32);
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
@@ -1051,7 +1043,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
+		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
@@ -1061,7 +1053,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
-		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
+		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
@@ -1073,7 +1065,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(52);
+	p = reserve_space(xdr, 52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	p = xdr_encode_hyper(p, args->fl->fl_start);
@@ -1090,7 +1082,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
+	p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
@@ -1106,7 +1098,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	int len = name->len;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, name->name, len);
@@ -1118,7 +1110,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
 		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
@@ -1142,11 +1134,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
  * owner 4 = 32
  */
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
-	RESERVE_SPACE(28);
+	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
@@ -1157,7 +1149,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
@@ -1173,7 +1165,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
@@ -1189,7 +1181,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
@@ -1209,7 +1201,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1218,7 +1210,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
@@ -1227,7 +1219,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
@@ -1258,7 +1250,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1270,7 +1262,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1285,7 +1277,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	int len = fh->size;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, fh->data, len);
@@ -1297,7 +1289,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
@@ -1308,7 +1300,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	nfs4_stateid stateid;
 	__be32 *p;
 
-	RESERVE_SPACE(NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
 		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
@@ -1320,12 +1312,12 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
@@ -1340,7 +1332,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
+	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
@@ -1369,7 +1361,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
@@ -1379,7 +1371,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1391,12 +1383,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + oldname->len);
+	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
 	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
-	RESERVE_SPACE(4 + newname->len);
+	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
 	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
@@ -1407,7 +1399,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
 	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
@@ -1419,7 +1411,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
@@ -1430,15 +1422,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
-	RESERVE_SPACE(2*4);
+	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
@@ -1451,7 +1443,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
@@ -1461,7 +1453,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
@@ -1473,16 +1465,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
@@ -1492,7 +1484,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
@@ -1504,12 +1496,12 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
@@ -1523,7 +1515,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
@@ -1539,13 +1531,13 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + sizeof(args->verifier->data));
+	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
 	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
@@ -1565,7 +1557,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 
-	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
@@ -1609,7 +1601,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 				   struct compound_hdr *hdr)
 {
 	__be32 *p;
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
@@ -1635,7 +1627,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
-- 
cgit v1.2.2


From 345585132a204859fbb7d8b662e9b6e5b563c6dc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:30 +0300
Subject: nfs: nfs4xdr: optimize low level encoding

do not increment encoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 128 +++++++++++++++++++++++++++----------------------------
 1 file changed, 64 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3bcde49fbea8..d75f821cad01 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -737,7 +737,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	*p++ = cpu_to_be32(hdr->nops);
+	*p = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -874,7 +874,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	len = (char *)p - (char *)q - 12;
 	*q++ = htonl(bmval0);
 	*q++ = htonl(bmval1);
-	*q++ = htonl(len);
+	*q = htonl(len);
 
 /* out: */
 }
@@ -885,7 +885,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
-	*p++ = cpu_to_be32(access);
+	*p = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -897,7 +897,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -909,7 +909,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -920,19 +920,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
-	*p++ = cpu_to_be32(create->ftype);
+	*p = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		p = reserve_space(xdr, 4);
-		*p++ = cpu_to_be32(create->u.symlink.len);
+		*p = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
-		*p++ = cpu_to_be32(create->u.device.specdata2);
+		*p = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -941,7 +941,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -955,7 +955,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(bitmap);
+	*p = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -968,7 +968,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
-	*p++ = cpu_to_be32(bm1);
+	*p = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -996,7 +996,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_GETFH);
+	*p = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1008,7 +1008,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1041,7 +1041,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(args->reclaim);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-	*p++ = cpu_to_be32(args->new_lock_owner);
+	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
@@ -1050,12 +1050,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-		p = xdr_encode_hyper(p, args->lock_owner.id);
+		xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
-		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+		*p = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1073,7 +1073,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-	p = xdr_encode_hyper(p, args->lock_owner.id);
+	xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1088,7 +1088,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
-	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1101,7 +1101,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1124,7 +1124,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	default:
 		*p++ = cpu_to_be32(0);
 	}
-	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
+	*p = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1136,13 +1136,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  */
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
-	p = xdr_encode_hyper(p, arg->id);
+	xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,11 +1152,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
+		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1168,11 +1168,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
+		*p = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
+		*p = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1184,13 +1184,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1202,7 +1202,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1211,7 +1211,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1221,7 +1221,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1253,7 +1253,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1265,7 +1265,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1280,7 +1280,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1290,7 +1290,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_PUTROOTFH);
+	*p = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1303,9 +1303,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1313,13 +1313,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READ);
+	*p = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1345,7 +1345,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1362,7 +1362,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READLINK);
+	*p = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1374,7 +1374,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1386,11 +1386,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
+	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1401,7 +1401,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
+	xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1412,7 +1412,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_RESTOREFH);
+	*p = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1424,14 +1424,14 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
+	*p = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(arg->acl_len);
+	*p = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1444,7 +1444,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_SAVEFH);
+	*p = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1455,7 +1455,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1467,15 +1467,15 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_prog);
+	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1487,7 +1487,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1497,14 +1497,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_WRITE);
+	*p = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1518,7 +1518,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1533,14 +1533,14 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
+	xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
-	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
+	*p = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1591,7 +1591,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
-	*p++ = cpu_to_be32(0);				/* No more gids */
+	*p = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1603,7 +1603,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1646,7 +1646,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-	*p++ = cpu_to_be32(args->sa_cache_this);
+	*p = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.2


From 811652bd6edd66dd35bf9caacdfe96d19f75a47e Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:34 +0300
Subject: nfs: nfs4xdr: merge xdr_encode_int+xdr_encode_opaque_fixed into
 xdr_encode_opaque

use encode_string where appropriate.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d75f821cad01..efa78ad98228 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -732,9 +732,8 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	p = reserve_space(xdr, 12 + hdr->taglen);
-	*p++ = cpu_to_be32(hdr->taglen);
-	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
+	p = reserve_space(xdr, 4 + hdr->taglen + 8);
+	p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p = cpu_to_be32(hdr->nops);
@@ -832,13 +831,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		*p++ = cpu_to_be32(owner_namelen);
-		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
+		p = xdr_encode_opaque(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		*p++ = cpu_to_be32(owner_grouplen);
-		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
+		p = xdr_encode_opaque(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -939,9 +936,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	p = reserve_space(xdr, 4 + create->name->len);
-	*p++ = cpu_to_be32(create->name->len);
-	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	encode_string(xdr, create->name->len, create->name->name);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1007,8 +1002,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1100,8 +1094,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1279,8 +1272,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1373,8 +1365,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1383,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 8 + oldname->len);
-	*p++ = cpu_to_be32(OP_RENAME);
-	*p++ = cpu_to_be32(oldname->len);
-	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
-
-	p = reserve_space(xdr, 4 + newname->len);
-	*p++ = cpu_to_be32(newname->len);
-	xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RENAME);
+	encode_string(xdr, oldname->len, oldname->name);
+	encode_string(xdr, newname->len, newname->name);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1587,8 +1574,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	/* authsys_parms rfc1831 */
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, machine_name, len);
+	p = xdr_encode_opaque(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p = cpu_to_be32(0);				/* No more gids */
-- 
cgit v1.2.2


From 6f723f7710024bb151ca8c5277ce8c71beec4db8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:37 +0300
Subject: nfs: nfs4xdr: get rid of READ32

s/READ32\((.*)\)/\1 = be32_to_cpup(p++)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 145 +++++++++++++++++++++++++++----------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index efa78ad98228..5f6b46f17c5b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,7 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ32(x)         (x) = ntohl(*p++)
 #define READ64(x)         do {			\
 	(x) = (u64)ntohl(*p++) << 32;		\
 	(x) |= ntohl(*p++);			\
@@ -2464,7 +2463,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*len);
+	*len = be32_to_cpup(p++);
 	READ_BUF(*len);
 	*string = (char *)p;
 	return 0;
@@ -2475,13 +2474,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	READ_BUF(8);
-	READ32(hdr->status);
-	READ32(hdr->taglen);
+	hdr->status = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p++);
 
 	READ_BUF(hdr->taglen + 4);
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	READ32(hdr->nops);
+	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2494,14 +2493,14 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
 			" %d but we issued a request for %d\n",
 				opnum, expected);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2524,14 +2523,14 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
 	READ_BUF((bmlen << 2));
 	if (bmlen > 0) {
-		READ32(bitmap[0]);
+		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			READ32(bitmap[1]);
+			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
 }
@@ -2541,7 +2540,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*attrlen);
+	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
 }
@@ -2567,7 +2566,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
 		READ_BUF(4);
-		READ32(*type);
+		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2625,7 +2624,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2641,7 +2640,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2679,7 +2678,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2695,7 +2694,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2796,7 +2795,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	int status = 0;
 
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2848,7 +2847,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2857,7 +2856,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
 		READ_BUF(4);
-		READ32(m);
+		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2928,7 +2927,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
 		READ_BUF(4);
-		READ32(*maxlink);
+		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -2945,7 +2944,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
 		READ_BUF(4);
-		READ32(*maxname);
+		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3005,7 +3004,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		READ_BUF(4);
-		READ32(tmp);
+		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3024,7 +3023,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
 		READ_BUF(4);
-		READ32(*nlink);
+		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3043,7 +3042,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
@@ -3071,7 +3070,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
@@ -3101,8 +3100,8 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		dev_t tmp;
 
 		READ_BUF(8);
-		READ32(major);
-		READ32(minor);
+		major = be32_to_cpup(p++);
+		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3191,7 +3190,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 
 	READ_BUF(12);
 	READ64(sec);
-	READ32(nsec);
+	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3273,7 +3272,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 	__be32 *p;
 
 	READ_BUF(20);
-	READ32(cinfo->atomic);
+	cinfo->atomic = be32_to_cpup(p++);
 	READ64(cinfo->before);
 	READ64(cinfo->after);
 	return 0;
@@ -3289,8 +3288,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(supp);
-	READ32(acc);
+	supp = be32_to_cpup(p++);
+	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3336,7 +3335,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -3591,7 +3590,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return status;
 
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3622,7 +3621,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	READ_BUF(32);
 	READ64(offset);
 	READ64(length);
-	READ32(type);
+	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3634,7 +3633,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	READ64(clientid);
-	READ32(namelen);
+	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
 }
@@ -3695,14 +3694,14 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	uint32_t limit_type, nblocks, blocksize;
 
 	READ_BUF(12);
-	READ32(limit_type);
+	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
 		READ64(*maxsize);
 		break;
 	case 2:
-		READ32(nblocks);
-		READ32(blocksize);
+		nblocks = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p++);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3714,14 +3713,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 
 	READ_BUF(4);
-	READ32(delegation_type);
+	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
 	}
 	READ_BUF(NFS4_STATEID_SIZE+4);
 	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
-	READ32(res->do_recall);
+	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3752,15 +3751,15 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	decode_change_info(xdr, &res->cinfo);
 
 	READ_BUF(8);
-	READ32(res->rflags);
-	READ32(bmlen);
+	res->rflags = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
 	READ_BUF(bmlen << 2);
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
-		READ32(res->attrset[i]);
+		res->attrset[i] = be32_to_cpup(p++);
 	for (; i < NFS4_BITMAP_SIZE; i++)
 		res->attrset[i] = 0;
 
@@ -3821,8 +3820,8 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(eof);
-	READ32(count);
+	eof = be32_to_cpup(p++);
+	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -3948,7 +3947,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 
 	/* Convert length of symlink */
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4070,7 +4069,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	if (status)
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -4082,13 +4081,13 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
 			" %d\n", opnum);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		READ64(clp->cl_clientid);
@@ -4098,12 +4097,12 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 
 		/* skip netid string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 
 		/* skip uaddr string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		return -NFSERR_CLID_INUSE;
 	} else
@@ -4127,8 +4126,8 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 		return status;
 
 	READ_BUF(16);
-	READ32(res->count);
-	READ32(res->verf->committed);
+	res->count = be32_to_cpup(p++);
+	res->verf->committed = be32_to_cpup(p++);
 	COPYMEM(res->verf->verifier, 8);
 	return 0;
 }
@@ -4154,11 +4153,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 	READ64(clp->cl_ex_clid);
 	READ_BUF(12);
-	READ32(clp->cl_seqid);
-	READ32(clp->cl_exchange_flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4167,17 +4166,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 
 	/* Throw away Major id */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away server_scope */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away Implementation id array */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	return 0;
@@ -4190,13 +4189,13 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	u32 nr_attrs;
 
 	READ_BUF(28);
-	READ32(attrs->headerpadsz);
-	READ32(attrs->max_rqst_sz);
-	READ32(attrs->max_resp_sz);
-	READ32(attrs->max_resp_sz_cached);
-	READ32(attrs->max_ops);
-	READ32(attrs->max_reqs);
-	READ32(nr_attrs);
+	attrs->headerpadsz = be32_to_cpup(p++);
+	attrs->max_rqst_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+	attrs->max_ops = be32_to_cpup(p++);
+	attrs->max_reqs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p++);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4226,8 +4225,8 @@ static int decode_create_session(struct xdr_stream *xdr,
 
 	/* seqid, flags */
 	READ_BUF(8);
-	READ32(clp->cl_seqid);
-	READ32(session->flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p++);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4275,23 +4274,23 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 	/* seqid */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != res->sr_slotid) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
 	/* highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* target highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v1.2.2


From 3ceb4dbb993fdab6a6fafc69db36686278871134 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:41 +0300
Subject: nfs: nfs4xdr: get rid of READ64

s/READ64\(\*(.*)\)/p = xdr_decode_hyper(p, \1)/
s/READ64\((.*)\)/p = xdr_decode_hyper(p, &\1)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 54 +++++++++++++++++++++++++-----------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f6b46f17c5b..238189ceae3f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,10 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ64(x)         do {			\
-	(x) = (u64)ntohl(*p++) << 32;		\
-	(x) |= ntohl(*p++);			\
-} while (0)
 #define READTIME(x)       do {			\
 	p++;					\
 	(x.tv_sec) = ntohl(*p++);		\
@@ -2588,7 +2584,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
 		READ_BUF(8);
-		READ64(*change);
+		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2607,7 +2603,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
 		READ_BUF(8);
-		READ64(*size);
+		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2658,8 +2654,8 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
 		READ_BUF(16);
-		READ64(fsid->major);
-		READ64(fsid->minor);
+		p = xdr_decode_hyper(p, &fsid->major);
+		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2711,7 +2707,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2729,7 +2725,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2747,7 +2743,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2764,7 +2760,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2781,7 +2777,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2910,7 +2906,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -2962,7 +2958,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
 		READ_BUF(8);
-		READ64(maxread);
+		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -2983,7 +2979,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
 		READ_BUF(8);
-		READ64(maxwrite);
+		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3122,7 +3118,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3139,7 +3135,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3156,7 +3152,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3173,7 +3169,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
 		READ_BUF(8);
-		READ64(*used);
+		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3189,7 +3185,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint32_t nsec;
 
 	READ_BUF(12);
-	READ64(sec);
+	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
@@ -3273,8 +3269,8 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 
 	READ_BUF(20);
 	cinfo->atomic = be32_to_cpup(p++);
-	READ64(cinfo->before);
-	READ64(cinfo->after);
+	p = xdr_decode_hyper(p, &cinfo->before);
+	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 }
 
@@ -3619,8 +3615,8 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	uint32_t namelen, type;
 
 	READ_BUF(32);
-	READ64(offset);
-	READ64(length);
+	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
@@ -3632,7 +3628,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 			fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	READ64(clientid);
+	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
@@ -3697,7 +3693,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		READ64(*maxsize);
+		p = xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
@@ -4090,7 +4086,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
-		READ64(clp->cl_clientid);
+		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
@@ -4151,7 +4147,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	READ_BUF(8);
-	READ64(clp->cl_ex_clid);
+	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
 	READ_BUF(12);
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
-- 
cgit v1.2.2


From c816fd3406462702dee2e3859e70132c3aab7c10 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:44 +0300
Subject: nfs: nfs4xdr: get rid of READTIME

It has no users.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 238189ceae3f..0c26bb2d43d9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READTIME(x)       do {			\
-	p++;					\
-	(x.tv_sec) = ntohl(*p++);		\
-	(x.tv_nsec) = ntohl(*p++);		\
-} while (0)
 #define COPYMEM(x,nbytes) do {			\
 	memcpy((x), p, nbytes);			\
 	p += XDR_QUADLEN(nbytes);		\
-- 
cgit v1.2.2


From 686841b3cc3a71918b45ed148be7a01a4f10e3f8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:48 +0300
Subject: nfs: nfs4xdr: introduce print_overflow_msg

Part fo the nfs4xdr cleanup.  READ_BUF will go away.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0c26bb2d43d9..8255ec7079d4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2441,14 +2441,18 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
-		dprintk("nfs: %s: prematurely hit end of receive" \
-				" buffer\n", __func__); \
-		dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
-				__func__, xdr->p, nbytes, xdr->end); \
+		print_overflow_msg(__func__, xdr); \
 		return -EIO; \
 	} \
 } while (0)
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	__be32 *p;
-- 
cgit v1.2.2


From 07d30434cfe2f1a1553143c6b20f1fe68d2ef80a Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:52 +0300
Subject: nfs: nfs4xdr: introduce decode_opaque_fixed and decode_stateid
 helpers

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 71 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8255ec7079d4..86e6983ef4fa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3290,19 +3290,34 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	return 0;
 }
 
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
 {
 	__be32 *p;
+
+	p = xdr_inline_decode(xdr, len);
+	if (likely(p)) {
+		memcpy(buf, p, len);
+		return 0;
+	}
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -3635,15 +3650,15 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == -EIO)
 		goto out;
 	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+		status = decode_stateid(xdr, &res->stateid);
+		if (unlikely(status))
+			goto out;
 	} else if (status == -NFS4ERR_DENIED)
 		status = decode_lock_denied(xdr, NULL);
 	if (res->open_seqid != NULL)
@@ -3664,16 +3679,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 
 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status != -EIO)
 		nfs_increment_lock_seqid(status, res->seqid);
-	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	}
+	if (status == 0)
+		status = decode_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -3706,6 +3718,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
 	uint32_t delegation_type;
+	int status;
 
 	READ_BUF(4);
 	delegation_type = be32_to_cpup(p++);
@@ -3713,8 +3726,10 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 		res->delegation_type = 0;
 		return 0;
 	}
-	READ_BUF(NFS4_STATEID_SIZE+4);
-	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
+	status = decode_stateid(xdr, &res->delegation);
+	if (unlikely(status))
+		return status;
+	READ_BUF(4);
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3738,10 +3753,10 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_op_hdr(xdr, OP_OPEN);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	if (unlikely(status))
 		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 
 	decode_change_info(xdr, &res->cinfo);
 
@@ -3766,32 +3781,26 @@ xdr_error:
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_putfh(struct xdr_stream *xdr)
-- 
cgit v1.2.2


From db942bbd09563e169cc5d9004c32c1de33220fd1 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:56 +0300
Subject: nfs: nfs4xdr: introduce decode_verifier helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Trond: Fixed up an 'uninitialised variable' issue in decode_readdir]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86e6983ef4fa..404f2e6373f2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3320,17 +3320,19 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	return status;
 }
 
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier, 8);
+}
+
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_COMMIT);
-	if (status)
-		return status;
-	READ_BUF(8);
-	COPYMEM(res->verf->verifier, 8);
-	return 0;
+	if (!status)
+		status = decode_verifier(xdr, res->verf->verifier);
+	return status;
 }
 
 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3852,17 +3854,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
-	if (status)
+	if (!status)
+		status = decode_verifier(xdr, readdir->verifier.data);
+	if (unlikely(status))
 		return status;
-	READ_BUF(8);
-	COPYMEM(readdir->verifier.data, 8);
 	dprintk("%s: verifier = %08x:%08x\n",
 			__func__,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1]);
 
 
-	hdrlen = (char *) p - (char *) iov->iov_base;
+	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = rcvbuf->len - hdrlen;
 	if (pglen > recvd)
 		pglen = recvd;
-- 
cgit v1.2.2


From e78291e4e07520348b0634095cf19ed3bc868965 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:00 +0300
Subject: nfs: nfs4xdr: introduce decode_sessionid helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 404f2e6373f2..00630f42a2b4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4212,6 +4212,11 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	return 0;
 }
 
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 static int decode_create_session(struct xdr_stream *xdr,
 				 struct nfs41_create_session_res *res)
 {
@@ -4221,14 +4226,11 @@ static int decode_create_session(struct xdr_stream *xdr,
 	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &session->sess_id);
+	if (unlikely(status))
 		return status;
 
-	/* sessionid */
-	READ_BUF(NFS4_MAX_SESSIONID_LEN);
-	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
 	/* seqid, flags */
 	READ_BUF(8);
 	clp->cl_seqid = be32_to_cpup(p++);
@@ -4262,7 +4264,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		return 0;
 
 	status = decode_op_hdr(xdr, OP_SEQUENCE);
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
 		goto out_err;
 
 	/*
@@ -4271,15 +4275,16 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 */
 	status = -ESERVERFAULT;
 
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
-	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
-	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out_err;
 	}
+
+	READ_BUF(20);
+
 	/* seqid */
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
-- 
cgit v1.2.2


From 99398d0655ada44ae464a1c93d13cd438a306ecd Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:05 +0300
Subject: nfs: nfs4xdr: get rid of COPYMEM

Just directly call memcpy.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 00630f42a2b4..f69aafc23d44 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define COPYMEM(x,nbytes) do {			\
-	memcpy((x), p, nbytes);			\
-	p += XDR_QUADLEN(nbytes);		\
-} while (0)
-
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
@@ -3607,7 +3602,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return -EIO;
 	fh->size = len;
 	READ_BUF(len);
-	COPYMEM(fh->data, len);
+	memcpy(fh->data, p, len);
 	return 0;
 }
 
@@ -4097,7 +4092,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4134,7 +4129,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	READ_BUF(16);
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
-	COPYMEM(res->verf->verifier, 8);
+	memcpy(res->verf->verifier, p, 8);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 2460ba57c49c36dfef0b62c929461de09240fe17 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:10 +0300
Subject: nfs: nfs4xdr: simplify decode_exchange_id by reusing
 decode_opaque_inline

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f69aafc23d44..3f49da0960e3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4144,6 +4144,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	uint32_t dummy;
+	char *dummy_str;
 	int status;
 	struct nfs_client *clp = res->client;
 
@@ -4166,19 +4167,19 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 
 	/* Throw away Major id */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away server_scope */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away Implementation id array */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	return 0;
 }
-- 
cgit v1.2.2


From c0eae66ece40bdb8cd88c5106834b71a1c9f421c Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:14 +0300
Subject: nfs: nfs4xdr: get rid of READ_BUF

Use xdr_inline_decode instead.
Open code debug printout and error return.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 475 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 379 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3f49da0960e3..b0d690c3a24c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2423,24 +2423,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define READ_BUF(nbytes)  do { \
-	p = xdr_inline_decode(xdr, nbytes); \
-	if (unlikely(!p)) { \
-		print_overflow_msg(__func__, xdr); \
-		return -EIO; \
-	} \
-} while (0)
-
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
 	dprintk("nfs: %s: prematurely hit end of receive buffer. "
@@ -2452,28 +2434,42 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*len = be32_to_cpup(p++);
-	READ_BUF(*len);
+	p = xdr_inline_decode(xdr, *len);
+	if (unlikely(!p))
+		goto out_overflow;
 	*string = (char *)p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
 	hdr->taglen = be32_to_cpup(p++);
 
-	READ_BUF(hdr->taglen + 4);
+	p = xdr_inline_decode(xdr, hdr->taglen + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
 	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2482,7 +2478,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
@@ -2494,6 +2492,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 /* Dummy routine */
@@ -2503,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 	unsigned int strlen;
 	char *str;
 
-	READ_BUF(12);
-	return decode_opaque_inline(xdr, &strlen, &str);
+	p = xdr_inline_decode(xdr, 12);
+	if (likely(p))
+		return decode_opaque_inline(xdr, &strlen, &str);
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2512,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	uint32_t bmlen;
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
-	READ_BUF((bmlen << 2));
+	p = xdr_inline_decode(xdr, (bmlen << 2));
+	if (unlikely(!p))
+		goto out_overflow;
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
 			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2555,7 +2571,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
@@ -2566,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	}
 	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2577,7 +2598,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
@@ -2585,6 +2608,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2596,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2613,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2629,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2647,7 +2688,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
-		READ_BUF(16);
+		p = xdr_inline_decode(xdr, 16);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
 		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -2657,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2700,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2718,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2736,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2753,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2770,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2784,7 +2865,9 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	__be32 *p;
 	int status = 0;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
@@ -2819,6 +2902,9 @@ out_eio:
 	dprintk(" status %d", status);
 	status = -EIO;
 	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2836,7 +2922,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
@@ -2845,7 +2933,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		u32 m;
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
@@ -2885,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
 out_eio:
 	status = -EIO;
 	goto out;
@@ -2899,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2916,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2933,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2951,7 +3058,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
@@ -2960,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	}
 	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2972,7 +3084,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
@@ -2981,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	}
 	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -2993,7 +3110,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3001,6 +3120,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3012,13 +3134,18 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
@@ -3031,9 +3158,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
@@ -3047,6 +3178,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
@@ -3059,9 +3193,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
@@ -3075,6 +3213,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3089,7 +3230,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
 		dev_t tmp;
 
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		major = be32_to_cpup(p++);
 		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
@@ -3100,6 +3243,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3111,12 +3257,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3128,12 +3279,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3145,12 +3301,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3162,7 +3323,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -3170,6 +3333,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3178,12 +3344,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint64_t sec;
 	uint32_t nsec;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3261,11 +3432,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 {
 	__be32 *p;
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
 	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3277,12 +3453,17 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	status = decode_op_hdr(xdr, OP_ACCESS);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
@@ -3341,10 +3522,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 		return status;
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3596,14 +3783,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	if (status)
 		return status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
-	READ_BUF(len);
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		goto out_overflow;
 	memcpy(fh->data, p, len);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3625,7 +3819,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	READ_BUF(32);
+	p = xdr_inline_decode(xdr, 32);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &offset);
 	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
@@ -3641,8 +3837,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	}
 	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
-	READ_BUF(namelen);
-	return -NFS4ERR_DENIED;
+	p = xdr_inline_decode(xdr, namelen);
+	if (likely(p))
+		return -NFS4ERR_DENIED;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -3697,7 +3897,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
@@ -3709,6 +3911,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3717,7 +3922,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 	int status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
@@ -3726,7 +3933,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3739,6 +3948,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3757,13 +3969,17 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 
 	decode_change_info(xdr, &res->cinfo);
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
 	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
-	READ_BUF(bmlen << 2);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (unlikely(!p))
+		goto out_overflow;
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
 		res->attrset[i] = be32_to_cpup(p++);
@@ -3774,6 +3990,9 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
 	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -3820,7 +4039,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	status = decode_op_hdr(xdr, OP_READ);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
@@ -3835,6 +4056,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	res->eof = eof;
 	res->count = count;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3947,7 +4171,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 		return status;
 
 	/* Convert length of symlink */
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
@@ -3972,6 +4198,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	kaddr[len+rcvbuf->page_base] = '\0';
 	kunmap_atomic(kaddr, KM_USER0);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4069,10 +4298,16 @@ static int decode_setattr(struct xdr_stream *xdr)
 	status = decode_op_hdr(xdr, OP_SETATTR);
 	if (status)
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4081,7 +4316,9 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
@@ -4090,26 +4327,39 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	}
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
-		READ_BUF(8 + NFS4_VERIFIER_SIZE);
+		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
 		/* skip netid string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 
 		/* skip uaddr string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		return -NFSERR_CLID_INUSE;
 	} else
 		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4126,11 +4376,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	if (status)
 		return status;
 
-	READ_BUF(16);
+	p = xdr_inline_decode(xdr, 16);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
 	memcpy(res->verf->verifier, p, 8);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4152,9 +4407,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	if (status)
 		return status;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
@@ -4164,7 +4423,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return -EIO;
 
 	/* Throw away minor_id */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* Throw away Major id */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
@@ -4182,6 +4443,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4190,7 +4454,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	__be32 *p;
 	u32 nr_attrs;
 
-	READ_BUF(28);
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
 	attrs->headerpadsz = be32_to_cpup(p++);
 	attrs->max_rqst_sz = be32_to_cpup(p++);
 	attrs->max_resp_sz = be32_to_cpup(p++);
@@ -4203,9 +4469,15 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 			__func__, nr_attrs);
 		return -EINVAL;
 	}
-	if (nr_attrs == 1)
-		READ_BUF(4); /* skip rdma_attrs */
+	if (nr_attrs == 1) {
+		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+		if (unlikely(!p))
+			goto out_overflow;
+	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -4228,7 +4500,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 		return status;
 
 	/* seqid, flags */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	session->flags = be32_to_cpup(p++);
 
@@ -4237,6 +4511,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (!status)
 		status = decode_chan_attrs(xdr, &session->bc_attrs);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4277,7 +4554,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* seqid */
 	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
@@ -4302,6 +4581,10 @@ static int decode_sequence(struct xdr_stream *xdr,
 out_err:
 	res->sr_status = status;
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	status = -EIO;
+	goto out_err;
 #else  /* CONFIG_NFS_V4_1 */
 	return 0;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.2


From cccddf4f5580131c9b963900e1d3400655e633cc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:19 +0300
Subject: nfs: nfs4xdr: optimize low level decoding

do not increment decoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 118 +++++++++++++++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b0d690c3a24c..14b6f513648f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2437,7 +2437,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*len = be32_to_cpup(p++);
+	*len = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, *len);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -2456,14 +2456,14 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
-	hdr->taglen = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, hdr->taglen + 4);
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	hdr->nops = be32_to_cpup(p++);
+	hdr->nops = be32_to_cpup(p);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2488,7 +2488,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 				opnum, expected);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2519,7 +2519,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 
 	bitmap[0] = bitmap[1] = 0;
 	p = xdr_inline_decode(xdr, (bmlen << 2));
@@ -2528,7 +2528,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			bitmap[1] = be32_to_cpup(p++);
+			bitmap[1] = be32_to_cpup(p);
 	}
 	return 0;
 out_overflow:
@@ -2543,7 +2543,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*attrlen = be32_to_cpup(p++);
+	*attrlen = be32_to_cpup(p);
 	*savep = xdr->p;
 	return 0;
 out_overflow:
@@ -2574,7 +2574,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*type = be32_to_cpup(p++);
+		*type = be32_to_cpup(p);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2601,7 +2601,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, change);
+		xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2625,7 +2625,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, size);
+		xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2647,7 +2647,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2668,7 +2668,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2692,7 +2692,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		if (unlikely(!p))
 			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
-		p = xdr_decode_hyper(p, &fsid->minor);
+		xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2716,7 +2716,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2737,7 +2737,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2759,7 +2759,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2782,7 +2782,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2805,7 +2805,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2827,7 +2827,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2849,7 +2849,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2868,7 +2868,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2925,7 +2925,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2936,7 +2936,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		m = be32_to_cpup(p++);
+		m = be32_to_cpup(p);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2994,7 +2994,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -3016,7 +3016,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxlink = be32_to_cpup(p++);
+		*maxlink = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -3038,7 +3038,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxname = be32_to_cpup(p++);
+		*maxname = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3061,7 +3061,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxread);
+		xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -3087,7 +3087,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxwrite);
+		xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3113,7 +3113,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		tmp = be32_to_cpup(p++);
+		tmp = be32_to_cpup(p);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3137,7 +3137,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*nlink = be32_to_cpup(p++);
+		*nlink = be32_to_cpup(p);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3161,7 +3161,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3196,7 +3196,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3234,7 +3234,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		if (unlikely(!p))
 			goto out_overflow;
 		major = be32_to_cpup(p++);
-		minor = be32_to_cpup(p++);
+		minor = be32_to_cpup(p);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3260,7 +3260,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3282,7 +3282,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3304,7 +3304,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3326,7 +3326,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, used);
+		xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3348,7 +3348,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	if (unlikely(!p))
 		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
-	nsec = be32_to_cpup(p++);
+	nsec = be32_to_cpup(p);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3437,7 +3437,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
-	p = xdr_decode_hyper(p, &cinfo->after);
+	xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -3457,7 +3457,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (unlikely(!p))
 		goto out_overflow;
 	supp = be32_to_cpup(p++);
-	acc = be32_to_cpup(p++);
+	acc = be32_to_cpup(p);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3525,7 +3525,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -3786,7 +3786,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3836,7 +3836,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	p = xdr_decode_hyper(p, &clientid);
-	namelen = be32_to_cpup(p++);
+	namelen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, namelen);
 	if (likely(p))
 		return -NFS4ERR_DENIED;
@@ -3903,11 +3903,11 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		p = xdr_decode_hyper(p, maxsize);
+		xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
-		blocksize = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3925,7 +3925,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	delegation_type = be32_to_cpup(p++);
+	delegation_type = be32_to_cpup(p);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
@@ -3936,7 +3936,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	res->do_recall = be32_to_cpup(p++);
+	res->do_recall = be32_to_cpup(p);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3973,7 +3973,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	if (unlikely(!p))
 		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	if (bmlen > 10)
 		goto xdr_error;
 
@@ -4043,7 +4043,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (unlikely(!p))
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
-	count = be32_to_cpup(p++);
+	count = be32_to_cpup(p);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -4174,7 +4174,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4301,7 +4301,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -4325,7 +4325,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 			" %d\n", opnum);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr == NFS_OK) {
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
@@ -4339,7 +4339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4348,7 +4348,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4410,7 +4410,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
+	xdr_decode_hyper(p, &clp->cl_ex_clid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -4418,7 +4418,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4463,7 +4463,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	attrs->max_resp_sz_cached = be32_to_cpup(p++);
 	attrs->max_ops = be32_to_cpup(p++);
 	attrs->max_reqs = be32_to_cpup(p++);
-	nr_attrs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4504,7 +4504,7 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (unlikely(!p))
 		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4576,7 +4576,7 @@ static int decode_sequence(struct xdr_stream *xdr,
 	/* target highest slot id - currently not processed */
 	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v1.2.2


From 8633ecfaba4ce53c094f8db92dbd7ac474a7aa36 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 11:07:29 +0100
Subject: GFS2: Add online uevent to GFS2

We already have an offline uevent (used when a withdraw occurs)
but no online uevent. This adds an online uevent so that userspace
will be able to detect a successful mount by means other than
not receiving a remove event after the add & recovery (change)
uevents.

It has also been added to the remount path as well - we can't use
a change uevent there as older GFS2 userspace acts on change uevents
according to the state that it thinks the fs is in, so we can't
easily add any new ones.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 13 ++++++++++++-
 fs/gfs2/super.c      |  1 +
 fs/gfs2/super.h      |  4 ++--
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..39021c020176 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1133,6 +1133,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
 /**
  * fill_super - Read in superblock
  * @sb: The VFS superblock
@@ -1236,7 +1247,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	gfs2_glock_dq_uninit(&mount_gh);
-
+	gfs2_online_uevent(sdp);
 	return 0;
 
 fail_threads:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..85bd2bc8c1de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1084,6 +1084,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	gt->gt_log_flush_secs = args.ar_commit;
 	spin_unlock(&gt->gt_spin);
 
+	gfs2_online_uevent(sdp);
 	return 0;
 }
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..911c954cefbd 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
 
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 				     struct gfs2_inode **ipp);
 
 extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
-- 
cgit v1.2.2


From 440d6da207e9eef2b576331cfedd5ecae9068663 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 12:16:25 +0100
Subject: GFS2: Add some more info to uevents

With each uevent, we now always include the journal ID. We
can't call it JID since that is already in use by some of
the individual events relating to recovery, so we use
JOURNALID instead. We don't send the JOURNALID for spectator
mounts, since there isn't one.

Also the ADD event now has both RDONLY and SPECTATOR information
to match that of the ONLINE event.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..dda4bf1e9eb4 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -519,7 +519,14 @@ static struct attribute_group lock_module_group = {
 
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
+	struct super_block *sb = sdp->sd_vfs;
 	int error;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
 
 	sdp->sd_kobj.kset = gfs2_kset;
 	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,7 +542,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_tune;
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
 	return 0;
 
 fail_tune:
@@ -554,7 +561,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	kobject_put(&sdp->sd_kobj);
 }
 
-
 static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
@@ -563,6 +569,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (!sdp->sd_args.ar_spectator)
+		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid)) {
 		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
 			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +586,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
 	.uevent = gfs2_uevent,
 };
 
-
 int gfs2_sys_init(void)
 {
 	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
-- 
cgit v1.2.2


From 6050b9c74f24513191fc6b5e4b3583e38d146bf5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 16:19:40 +0100
Subject: GFS2: Improve error handling in inode allocation

A little while back, block allocation was given some improved
error handling which meant that -EIO was returned in the case
of there being a problem in the resource group data. In addition
a message is printed explaning what went wrong and how to fix it.
This extends that error handling so that it also covers inode
allocation too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c |  2 +-
 fs/gfs2/rgrp.c  | 36 +++++++++++++++++++++++++-----------
 fs/gfs2/rgrp.h  |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..f9b4fe886540 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -731,7 +731,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
 	if (error)
 		goto out_ipreserv;
 
-	*no_addr = gfs2_alloc_di(dip, generation);
+	error = gfs2_alloc_di(dip, no_addr, generation);
 
 	gfs2_trans_end(sdp);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..3d0193af19c3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1459,6 +1459,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	return 0;
 }
 
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+	        (unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
 /**
  * gfs2_alloc_block - Allocate one or more blocks
  * @ip: the inode to allocate the block for
@@ -1520,22 +1530,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	return 0;
 
 rgrp_error:
-	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
-	        (unsigned long long)rgd->rd_addr);
-	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
-	gfs2_rgrp_dump(NULL, rgd->rd_gl);
-	rgd->rd_flags |= GFS2_RDF_ERROR;
+	gfs2_rgrp_error(rgd);
 	return -EIO;
 }
 
 /**
  * gfs2_alloc_di - Allocate a dinode
  * @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
  *
- * Returns: the block allocated
+ * Returns: 0 on success or error
  */
 
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,12 +1554,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	blk = rgblk_search(rgd, rgd->rd_last_alloc,
 			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
-	BUG_ON(blk == BFITNOENT);
 
-	rgd->rd_last_alloc = blk;
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (blk == BFITNOENT)
+		goto rgrp_error;
 
+	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
-
 	gfs2_assert_withdraw(sdp, rgd->rd_free);
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
@@ -1568,7 +1577,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	rgd->rd_free_clone--;
 	spin_unlock(&sdp->sd_rindex_spin);
 	trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
-	return block;
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	gfs2_rgrp_error(rgd);
+	return -EIO;
 }
 
 /**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..a8dedd78245b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -47,7 +47,7 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
 extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
 extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
-- 
cgit v1.2.2


From 05164e5b37a8329bfbcf72f526302cb2dd885fbd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 17 Aug 2009 11:06:43 +0100
Subject: GFS2: Replace assertion with proper error handling

One fewer assert, one more place we can recover gracefully
if there is an error.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3d0193af19c3..c681c54fbd60 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1561,7 +1561,9 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
-	gfs2_assert_withdraw(sdp, rgd->rd_free);
+	if (rgd->rd_free == 0)
+		goto rgrp_error;
+
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
-- 
cgit v1.2.2


From 31e54b01f3f00b595aac02e887960e5dcc575844 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 13 Aug 2009 12:18:08 +0100
Subject: GFS2: Add sysfs link to device

This adds a link from the per-gfs2 sb sysfs directory to
the block device upon which the filesystem is mounted. The
link is called "device", strangely enough :-)

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index dda4bf1e9eb4..0d4f7e974e9f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <asm/uaccess.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -542,9 +543,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_tune;
 
+	error = sysfs_create_link(&sdp->sd_kobj,
+				  &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+				  "device");
+	if (error)
+		goto fail_lock_module;
+
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
 	return 0;
 
+fail_lock_module:
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_reg:
@@ -556,6 +565,7 @@ fail:
 
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
+	sysfs_remove_link(&sdp->sd_kobj, "device");
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
-- 
cgit v1.2.2


From 50797481a7bdee548589506d7d7b48b08bc14dcd Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Sep 2009 13:34:02 -0400
Subject: ext4: Avoid group preallocation for closed files

Currently the group preallocation code tries to find a large (512)
free block from which to do per-cpu group allocation for small files.
The problem with this scheme is that it leaves the filesystem horribly
fragmented.  In the worst case, if the filesystem is unmounted and
remounted (after a system shutdown, for example) we forget the fact
that wee were using a particular (now-partially filled) 512 block
extent.  So the next time we try to allocate space for a small file,
we will find *another* completely free 512 block chunk to allocate
small files.  Given that there are 32,768 blocks in a block group,
after 64 iterations of "mount, write one 4k file in a directory,
unmount", the block group will have 64 files, each separated by 511
blocks, and the block group will no longer have any free 512
completely free chunks of blocks for group preallocation space.

So if we try to allocate blocks for a file that has been closed, such
that we know the final size of the file, and the filesystem is not
busy, avoid using group preallocation.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 30 +++++++++++++++++++++++++++++-
 fs/ext4/mballoc.c | 10 +++++++++-
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 70aa951ecb3c..2e9a2036c114 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -952,6 +952,7 @@ struct ext4_sb_info {
 	atomic_t s_mb_lost_chunks;
 	atomic_t s_mb_preallocated;
 	atomic_t s_mb_discarded;
+	atomic_t s_lock_busy;
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
@@ -1593,15 +1594,42 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
+#define EXT4_MAX_CONTENTION		8
+#define EXT4_CONTENTION_THRESHOLD	2
+
 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
 					      ext4_group_t group)
 {
 	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
 }
 
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
-	spin_lock(ext4_group_lock_ptr(sb, group));
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	if (spin_trylock(lock))
+		/*
+		 * We're able to grab the lock right away, so drop the
+		 * lock contention counter.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+	else {
+		/*
+		 * The lock is busy, so bump the contention counter,
+		 * and then wait on the spin lock.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+				  EXT4_MAX_CONTENTION);
+		spin_lock(lock);
+	}
 }
 
 static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 90a30ce822fc..faf5bd056a93 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4191,9 +4191,17 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 		return;
 
 	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
-	isize = i_size_read(ac->ac_inode) >> bsbits;
+	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+		>> bsbits;
 	size = max(size, isize);
 
+	if ((size == isize) &&
+	    !ext4_fs_is_busy(sbi) &&
+	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+		return;
+	}
+
 	/* don't use group allocation for large files */
 	if (size >= sbi->s_mb_stream_request) {
 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
-- 
cgit v1.2.2


From 84fe3bef59dc45a1cb0d2f9b0aefa8f1fbfbdf98 Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Tue, 1 Sep 2009 08:44:37 -0400
Subject: ext4: Compile warning fix when EXT_DEBUG enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When EXT_DEBUG is enabled I received the following compile warning on
PPC64:

  CC [M]  fs/ext4/inode.o
  CC [M]  fs/ext4/extents.o
fs/ext4/extents.c: In function ‘ext4_ext_rm_leaf’:
fs/ext4/extents.c:2097: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 2 has type ‘ext4_lblk_t’
fs/ext4/extents.c: In function ‘ext4_ext_get_blocks’:
fs/ext4/extents.c:2789: warning: format ‘%u’ expects type ‘unsigned int’, but argument 4 has type ‘long unsigned int’
fs/ext4/extents.c:2852: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 3 has type ‘ext4_lblk_t’
fs/ext4/extents.c:2953: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 4 has type ‘unsigned int’
  CC [M]  fs/ext4/migrate.o

The patch fixes compile warning.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Index: linux-2.6.31-rc4/fs/ext4/extents.c
===================================================================
---
 fs/ext4/extents.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..c583ebd1e41f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2094,7 +2094,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		else
 			uninitialized = 0;
 
-		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+		ext_debug("remove ext %u:%d\n", ex_ee_block, ex_ee_len);
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
@@ -2786,7 +2786,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %u/%u requested for inode %u\n",
+	ext_debug("blocks %u/%u requested for inode %lu\n",
 			iblock, max_blocks, inode->i_ino);
 
 	/* check in cache */
@@ -2849,7 +2849,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
 					ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2950,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
-	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
 		  ar.goal, newblock, allocated);
 
 	/* try to insert new extent into found leaf and return */
-- 
cgit v1.2.2


From 553f9008939638335836eec834f4dea310c17eae Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Fri, 18 Sep 2009 13:34:55 -0400
Subject: ext4: Show unwritten extent flag in ext4_ext_show_leaf()

ext4_ext_show_leaf() will display the leaf extents when extent
debugging is enabled.

Printing out the unwritten bit is useful for debugging unwritten
extent, allow us to see the unwritten extents vs written extents,
after the unwritten extents are splitted or converted.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/ext4_extents.h |  3 +--
 fs/ext4/extents.c      | 31 ++++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..9538633879d7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
 #define CHECK_BINSEARCH__
 
 /*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c583ebd1e41f..13db43408533 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -437,8 +437,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 			    idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
-			ext_debug("  %d:%d:%llu ",
+			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
+				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
 				  ext_pblock(path->p_ext));
 		} else
@@ -460,8 +461,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	eh = path[depth].p_hdr;
 	ex = EXT_FIRST_EXTENT(eh);
 
+	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
-		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
 			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 	}
 	ext_debug("\n");
@@ -580,9 +584,10 @@ ext4_ext_binsearch(struct inode *inode,
 	}
 
 	path->p_ext = l - 1;
-	ext_debug("  -> %d:%llu:%d ",
+	ext_debug("  -> %d:%llu:[%d]%d ",
 			le32_to_cpu(path->p_ext->ee_block),
 			ext_pblock(path->p_ext),
+			ext4_ext_is_uninitialized(path->p_ext),
 			ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
@@ -850,9 +855,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	path[depth].p_ext++;
 	while (path[depth].p_ext <=
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:%d in new leaf %llu\n",
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
 				ext_pblock(path[depth].p_ext),
+				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
 		/*memmove(ex++, path[depth].p_ext++,
@@ -1580,9 +1586,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 
 	/* try to insert block into found extent and return */
 	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append %d block to %d:%d (from %llu)\n",
+		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				le32_to_cpu(ex->ee_block),
+				ext4_ext_is_uninitialized(ex),
 				ext4_ext_get_actual_len(ex), ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -1651,9 +1659,10 @@ has_space:
 
 	if (!nearex) {
 		/* there is no extent in this leaf, create first one */
-		ext_debug("first extent in the leaf: %d:%llu:%d\n",
+		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1672,11 @@ has_space:
 			len = EXT_MAX_EXTENT(eh) - nearex;
 			len = (len - 1) * sizeof(struct ext4_extent);
 			len = len < 0 ? 0 : len;
-			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext_pblock(newext),
+					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1686,11 @@ has_space:
 		BUG_ON(newext->ee_block == nearex->ee_block);
 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
 		len = len < 0 ? 0 : len;
-		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
 		memmove(nearex + 1, nearex, len);
@@ -2094,7 +2105,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		else
 			uninitialized = 0;
 
-		ext_debug("remove ext %u:%d\n", ex_ee_block, ex_ee_len);
+		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+			 uninitialized, ex_ee_len);
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
@@ -2743,6 +2755,7 @@ insert:
 	} else if (err)
 		goto fix_extent_len;
 out:
+	ext4_ext_show_leaf(inode, path);
 	return err ? err : allocated;
 
 fix_extent_len:
-- 
cgit v1.2.2


From 9599b0e597d810be9b8f759ea6e9619c4f983c5e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 21:23:17 -0400
Subject: jbd2: Annotate transaction start also for jbd2_journal_restart()

lockdep annotation for a transaction start has been at the end of
jbd2_journal_start(). But a transaction is also started from
jbd2_journal_restart(). Move the lockdep annotation to start_this_handle()
which covers both cases.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4516a87ca55c..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -238,6 +238,8 @@ repeat_locked:
 		  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	return handle;
 }
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 	__jbd2_log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
-- 
cgit v1.2.2


From 487caeef9fc08c0565e082c40a8aaf58dad92bbb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 22:17:20 -0400
Subject: ext4: Fix possible deadlock between ext4_truncate() and
 ext4_get_blocks()

During truncate we are sometimes forced to start a new transaction as
the amount of blocks to be journaled is both quite large and hard to
predict. So far we restarted a transaction while holding i_data_sem
and that violates lock ordering because i_data_sem ranks below a
transaction start (and it can lead to a real deadlock with
ext4_get_blocks() mapping blocks in some page while having a
transaction open).

We fix the problem by dropping the i_data_sem before restarting the
transaction and acquire it afterwards. It's slightly subtle that this
works:

1) By the time ext4_truncate() is called, all the page cache for the
truncated part of the file is dropped so get_block() should not be
called on it (we only have to invalidate extent cache after we
reacquire i_data_sem because some extent from not-truncated part could
extend also into the part we are going to truncate).

2) Writes, migrate or defrag hold i_mutex so they are stopped for all
the time of the truncate.

This bug has been found and analyzed by Theodore Tso <tytso@mit.edu>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/extents.c | 15 ++++++++++++---
 fs/ext4/inode.c   | 23 +++++++++++++++++++----
 3 files changed, 32 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2e9a2036c114..fb21663ffe54 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 13db43408533..8c20caf4aa5c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+					    struct inode *inode,
+					    int needed)
 {
 	int err;
 
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 	err = ext4_journal_extend(handle, needed);
 	if (err <= 0)
 		return err;
-	return ext4_journal_restart(handle, needed);
+	err = ext4_truncate_restart_trans(handle, inode, needed);
+	/*
+	 * We have dropped i_data_sem so someone might have cached again
+	 * an extent we are going to truncate.
+	 */
+	ext4_ext_invalidate_cache(inode);
+
+	return err;
 }
 
 /*
@@ -2150,7 +2159,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		}
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 
-		err = ext4_ext_journal_restart(handle, credits);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 		if (err)
 			goto out;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9a4c929b16dc..d61fb523308f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+				 int nblocks)
 {
+	int ret;
+
+	/*
+	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * page cache has been already dropped and writes are blocked by
+	 * i_mutex. So we can safely drop the i_data_sem here.
+	 */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext4_journal_restart(handle, blocks_for_truncate(inode));
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	return ret;
 }
 
 /*
@@ -3658,7 +3671,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			ext4_handle_dirty_metadata(handle, inode, bh);
 		}
 		ext4_mark_inode_dirty(handle, inode);
-		ext4_journal_test_restart(handle, inode);
+		ext4_truncate_restart_trans(handle, inode,
+					    blocks_for_truncate(inode));
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext4_journal_get_write_access(handle, bh);
@@ -3869,7 +3883,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
-				ext4_journal_test_restart(handle, inode);
+				ext4_truncate_restart_trans(handle, inode,
+					    blocks_for_truncate(inode));
 			}
 
 			ext4_free_blocks(handle, inode, nr, 1, 1);
-- 
cgit v1.2.2


From bf43d84b185e2ff54598f8c58a5a8e63148b6e90 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:48:51 -0400
Subject: ext4: reject too-large filesystems on 32-bit kernels

ext4 will happily mount a > 16T filesystem on a 32-bit box, but
this is not safe; writes to the block device will wrap past 16T
and the page cache can't index past 16T (232 index * 4k pages).

Adding another test to the existing "too many sectors" test
should do the trick.

Add a comment, a relevant return value, and fix the reference
to the CONFIG_LBD(AF) option as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fe3f376b7df2..de67c3657b83 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2550,12 +2550,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (ext4_blocks_count(es) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	/*
+	 * Test whether we have more sectors than will fit in sector_t,
+	 * and whether the max offset is addressable by the page cache.
+	 */
+	if ((ext4_blocks_count(es) >
+	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+	    (ext4_blocks_count(es) >
+	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
-			" too large to mount safely");
+			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+		ret = -EFBIG;
 		goto failed_mount;
 	}
 
-- 
cgit v1.2.2


From 0373130d5bc783751c1fbad948886916a21d4559 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:51:29 -0400
Subject: ext4: open-code ext4_mb_update_group_info

ext4_mb_update_group_info is only called in one place, and it's
extremely simple.  There's no reason to have it in a separate function
in a separate file as far as I can tell, it just obfuscates what's
really going on.

Perhaps it was intended to keep the grp->bb_* manipulation local to
mballoc.c but we're already accessing other grp-> fields in balloc.c
directly so this seems ok.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 2 +-
 fs/ext4/ext4.h    | 2 --
 fs/ext4/mballoc.c | 9 ---------
 3 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	 * new bitmap information
 	 */
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	ext4_mb_update_group_info(grp, blocks_freed);
+	grp->bb_free += blocks_freed;
 	up_write(&grp->alloc_sem);
 
 	/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fb21663ffe54..02b22885eb02 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1343,8 +1343,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
-		ext4_grpblk_t add);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index faf5bd056a93..833d87236881 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2554,15 +2554,6 @@ exit_meta_group_info:
 	return -ENOMEM;
 } /* ext4_mb_add_groupinfo */
 
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
-	grp->bb_free += add;
-}
-
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-- 
cgit v1.2.2


From 38877f4e8dbbec12c6fde85ee1fce1dc27ef3290 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:55:24 -0400
Subject: simplify some logic in ext4_mb_normalize_request

While reading through some of the mballoc code it seems that a couple
spots in the size normalization function could be streamlined.

The test for non-overlapping PAs can be or'd for the start & end
conditions, and the tests for adjacent PAs can be else-if'd -
it's essentially independently testing:

	if (A + B <= C)
		...
	if (A > C)
		...

These cannot both be true so it seems like the else-if might
be slightly more efficient and/or informative.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 833d87236881..1a9cd7de04dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3184,23 +3184,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
 			ac->ac_o_ex.fe_logical < pa->pa_lstart));
 
-		/* skip PA normalized request doesn't overlap with */
-		if (pa->pa_lstart >= end) {
-			spin_unlock(&pa->pa_lock);
-			continue;
-		}
-		if (pa_end <= start) {
+		/* skip PAs this normalized request doesn't overlap with */
+		if (pa->pa_lstart >= end || pa_end <= start) {
 			spin_unlock(&pa->pa_lock);
 			continue;
 		}
 		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
 
+		/* adjust start or end to be adjacent to this pa */
 		if (pa_end <= ac->ac_o_ex.fe_logical) {
 			BUG_ON(pa_end < start);
 			start = pa_end;
-		}
-
-		if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
 			BUG_ON(pa->pa_lstart > end);
 			end = pa->pa_lstart;
 		}
-- 
cgit v1.2.2


From a13fb1a4533f26c1e2b0204d5283b696689645af Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 18 Aug 2009 00:20:23 -0400
Subject: ext4: Add feature set check helper for mount & remount paths

A user reported that although his root ext4 filesystem was mounting
fine, other filesystems would not mount, with the:

"Filesystem with huge files cannot be mounted RDWR without CONFIG_LBDAF"

error on his 32-bit box built without CONFIG_LBDAF.  This is because
the test at mount time for this situation was not being re-checked
on remount, and the normal boot process makes an ro->rw transition,
so this was being missed.

Refactor to make a common helper function to test the filesystem
features against the type of mount request (RO vs. RW) so that we
stay consistent.

Addresses Red-Hat-Bugzilla: #517650

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 91 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index de67c3657b83..4037fe0b5a5c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2254,6 +2254,49 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+				~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	return 1;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2275,7 +2318,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int db_count;
 	unsigned int i;
 	int needs_recovery, has_huge_files;
-	int features;
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2402,39 +2444,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
 	 */
-	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
-	if (features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
-			~EXT4_FEATURE_INCOMPAT_SUPP));
-		goto failed_mount;
-	}
-	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
-	if (!(sb->s_flags & MS_RDONLY) && features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount RDWR because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
-			~EXT4_FEATURE_RO_COMPAT_SUPP));
+	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
 		goto failed_mount;
-	}
-	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-	if (has_huge_files) {
-		/*
-		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBDAF
-		 */
-		if (sizeof(root->i_blocks) < sizeof(u64) &&
-				!(sb->s_flags & MS_RDONLY)) {
-			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
-					"files cannot be mounted read-write "
-					"without CONFIG_LBDAF");
-			goto failed_mount;
-		}
-	}
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2470,6 +2482,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
 						      has_huge_files);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3485,18 +3499,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			if (sbi->s_journal)
 				ext4_mark_recovery_complete(sb, es);
 		} else {
-			int ret;
-			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-				ext4_msg(sb, KERN_WARNING, "couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x)",
-				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
-					~EXT4_FEATURE_RO_COMPAT_SUPP));
+			/* Make sure we can mount this feature set readwrite */
+			if (!ext4_feature_set_ok(sb, 0)) {
 				err = -EROFS;
 				goto restore_opts;
 			}
-
 			/*
 			 * Make sure the group descriptor checksums
 			 * are sane.  If they aren't, refuse to remount r/w.
-- 
cgit v1.2.2


From 970343cd49048446da4189986cf960273588d71a Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Tue, 18 Aug 2009 14:25:03 +0800
Subject: GFS2: free disk inode which is deleted by remote node -V2

this patch is for the same problem that Benjamin Marzinski fixes at commit
b94a170e96dc416828af9d350ae2e34b70ae7347

quotation of the original problem:

---cut here---
When a file is deleted from a gfs2 filesystem on one node, a dcache
entry for it may still exist on other nodes in the cluster. If this
happens, gfs2 will be unable to free this file on disk. Because of this,
it's possible to have a gfs2 filesystem with no files on it and no free
space. With this patch, when a node receives a callback notifying it
that the file is being deleted on another node, it schedules a new
workqueue thread to remove the file's dcache entry.
---end cut---

after applying Benjamin's patch, I think there is still a case in which the disk
inode remains even when "no space" is hit. the case is that when running
d_prune_aliases() against the inode, there are one or more dentries(aliases)
which have reference count number > 0. in this case the dentries won't be pruned.
and even later, the reference count becomes to 0, the dentries can still be
cached in memory. unfortunately, no callback come again, things come back to
the state before the callback runs. thus the on disk inode remains there until
in memoryinode is removed for some other reason(shrinking inode cache or unmount
the volume..).

this patch is to remove those dentries when their reference count becomes to 0 and
the inode is deleted by remote node. for implementation, gfs2_dentry_delete() is
added as dentry_operations.d_delete. the function returns true when the inode is
deleted by remote node. in dput(), gfs2_dentry_delete() is called and since it
returns true, the dentry is unhashed from dcache and then removed. when all dentries
are removed, the in memory inode get removed so that the on disk inode is freed.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dentry.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
 	return 0;
 }
 
+static int gfs2_dentry_delete(struct dentry *dentry)
+{
+	struct gfs2_inode *ginode;
+
+	if (!dentry->d_inode)
+		return 0;
+
+	ginode = GFS2_I(dentry->d_inode);
+	if (!ginode->i_iopen_gh.gh_gl)
+		return 0;
+
+	if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+		return 1;
+
+	return 0;
+}
+
 const struct dentry_operations gfs2_dops = {
 	.d_revalidate = gfs2_drevalidate,
 	.d_hash = gfs2_dhash,
+	.d_delete = gfs2_dentry_delete,
 };
 
-- 
cgit v1.2.2


From b5711b8e5a437ca7d35321d19de568b4f76a7739 Mon Sep 17 00:00:00 2001
From: Casey Dahlin <cdahlin@redhat.com>
Date: Tue, 28 Jul 2009 12:29:05 -0500
Subject: dlm: fix double-release of socket in error exit path

The last correction to the tcp_connect_to_sock error exit path,
commit a89d63a159b1ba5833be2bef00adf8ad8caac8be, can free an already
freed socket, due to collision with a previous (incomplete) attempt
to fix the same issue, commit 311f6fc77c51926dbdfbeab0a5d88d70f01fa3f4.

Signed-off-by: Casey Dahlin <cdahlin@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..210d52c48808 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -926,10 +926,8 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
-	if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
-		sock_release(sock);
+	if (dlm_nodeid_to_addr(con->nodeid, &saddr))
 		goto out_err;
-	}
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
-- 
cgit v1.2.2


From e571cbf1a4f8d8b6cfd4898df718dae84c75a8e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 19 Aug 2009 18:12:27 -0400
Subject: NFS: Add a dns resolver for use with NFSv4 referrals and migration

The NFSv4 and NFSv4.1 protocols both allow for the redirection of a client
from one server to another in order to support filesystem migration and
replication. For full protocol support, we need to add the ability to
convert a DNS host name into an IP address that we can feed to the RPC
client.

We'll reuse the sunrpc cache, now that it has been converted to work with
rpc_pipefs.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile      |   3 +-
 fs/nfs/cache_lib.c   | 140 +++++++++++++++++++++
 fs/nfs/cache_lib.h   |  27 +++++
 fs/nfs/dns_resolve.c | 335 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/dns_resolve.h |  14 +++
 fs/nfs/inode.c       |   8 ++
 6 files changed, 526 insertions(+), 1 deletion(-)
 create mode 100644 fs/nfs/cache_lib.c
 create mode 100644 fs/nfs/cache_lib.h
 create mode 100644 fs/nfs/dns_resolve.c
 create mode 100644 fs/nfs/dns_resolve.h

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
 			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \
-			   write.o namespace.o mount_clnt.o
+			   write.o namespace.o mount_clnt.o \
+			   dns_resolve.o cache_lib.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+				"/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+		sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+		"the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+	static char *envp[] = { "HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[] = {
+		nfs_cache_getent_prog,
+		cd->name,
+		entry_name,
+		NULL
+	};
+	int ret = -EACCES;
+
+	if (nfs_cache_getent_prog[0] == '\0')
+		goto out;
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or
+	 * EACCES error. The admin can re-enable it on the fly by using
+	 * sysfs to set the 'cache_getent' parameter once the problem
+	 * has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES)
+		nfs_cache_getent_prog[0] = '\0';
+out:
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+	if (atomic_dec_and_test(&dreq->count))
+		kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+	complete_all(&dreq->completion);
+	nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(req, struct nfs_cache_defer_req, req);
+	dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+	atomic_inc(&dreq->count);
+
+	return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+	if (dreq) {
+		init_completion(&dreq->completion);
+		atomic_set(&dreq->count, 1);
+		dreq->req.defer = nfs_dns_cache_defer;
+	}
+	return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+	if (wait_for_completion_timeout(&dreq->completion,
+			nfs_cache_getent_timeout * HZ) == 0)
+		return -ETIMEDOUT;
+	return 0;
+}
+
+int nfs_cache_register(struct cache_detail *cd)
+{
+	struct nameidata nd;
+	struct vfsmount *mnt;
+	int ret;
+
+	mnt = rpc_get_mount();
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+	if (ret)
+		goto err;
+	ret = sunrpc_cache_register_pipefs(nd.path.dentry,
+			cd->name, 0600, cd);
+	path_put(&nd.path);
+	if (!ret)
+		return ret;
+err:
+	rpc_put_mount();
+	return ret;
+}
+
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+	sunrpc_cache_unregister_pipefs(cd);
+	rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <asm/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+	struct cache_req req;
+	struct cache_deferred_req deferred_req;
+	struct completion completion;
+	atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+struct nfs_dns_ent {
+	struct cache_head h;
+
+	char *hostname;
+	size_t namelen;
+
+	struct sockaddr_storage addr;
+	size_t addrlen;
+};
+
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *new;
+	struct nfs_dns_ent *key;
+
+	new = container_of(cnew, struct nfs_dns_ent, h);
+	key = container_of(ckey, struct nfs_dns_ent, h);
+
+	kfree(new->hostname);
+	new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+	if (new->hostname) {
+		new->namelen = key->namelen;
+		memcpy(&new->addr, &key->addr, key->addrlen);
+		new->addrlen = key->addrlen;
+	} else {
+		new->namelen = 0;
+		new->addrlen = 0;
+	}
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+	struct nfs_dns_ent *item;
+
+	item = container_of(ref, struct nfs_dns_ent, h.ref);
+	kfree(item->hostname);
+	kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+	struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+	if (item != NULL) {
+		item->hostname = NULL;
+		item->namelen = 0;
+		item->addrlen = 0;
+		return &item->h;
+	}
+	return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+	return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+		struct cache_head *ch,
+		char **bpp, int *blen)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+	qword_add(bpp, blen, key->hostname);
+	(*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+		struct cache_head *ch)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+	int ret;
+
+	ret = nfs_cache_upcall(cd, key->hostname);
+	if (ret)
+		ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+	return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+		struct cache_head *cb)
+{
+	struct nfs_dns_ent *a;
+	struct nfs_dns_ent *b;
+
+	a = container_of(ca, struct nfs_dns_ent, h);
+	b = container_of(cb, struct nfs_dns_ent, h);
+
+	if (a->namelen == 0 || a->namelen != b->namelen)
+		return 0;
+	return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+		struct cache_head *h)
+{
+	struct nfs_dns_ent *item;
+	long ttl;
+
+	if (h == NULL) {
+		seq_puts(m, "# ip address      hostname        ttl\n");
+		return 0;
+	}
+	item = container_of(h, struct nfs_dns_ent, h);
+	ttl = (long)item->h.expiry_time - (long)get_seconds();
+	if (ttl < 0)
+		ttl = 0;
+
+	if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+		char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+		rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+		seq_printf(m, "%15s ", buf);
+	} else
+		seq_puts(m, "<none>          ");
+	seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+	return 0;
+}
+
+struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_lookup(cd,
+			&key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+		struct nfs_dns_ent *new,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_update(cd,
+			&new->h, &key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+	struct nfs_dns_ent key, *item;
+	unsigned long ttl;
+	ssize_t len;
+	int ret = -EINVAL;
+
+	if (buf[buflen-1] != '\n')
+		goto out;
+	buf[buflen-1] = '\0';
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+	key.addrlen = rpc_pton(buf1, len,
+			(struct sockaddr *)&key.addr,
+			sizeof(key.addr));
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+
+	key.hostname = buf1;
+	key.namelen = len;
+	memset(&key.h, 0, sizeof(key.h));
+
+	ttl = get_expiry(&buf);
+	if (ttl == 0)
+		goto out;
+	key.h.expiry_time = ttl + get_seconds();
+
+	ret = -ENOMEM;
+	item = nfs_dns_lookup(cd, &key);
+	if (item == NULL)
+		goto out;
+
+	if (key.addrlen == 0)
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+	item = nfs_dns_update(cd, &key, item);
+	if (item == NULL)
+		goto out;
+
+	ret = 0;
+	cache_put(&item->h, cd);
+out:
+	return ret;
+}
+
+static struct cache_detail nfs_dns_resolve = {
+	.owner = THIS_MODULE,
+	.hash_size = NFS_DNS_HASHTBL_SIZE,
+	.hash_table = nfs_dns_table,
+	.name = "dns_resolve",
+	.cache_put = nfs_dns_ent_put,
+	.cache_upcall = nfs_dns_upcall,
+	.cache_parse = nfs_dns_parse,
+	.cache_show = nfs_dns_show,
+	.match = nfs_dns_match,
+	.init = nfs_dns_ent_init,
+	.update = nfs_dns_ent_init,
+	.alloc = nfs_dns_ent_alloc,
+};
+
+static int do_cache_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item,
+		struct nfs_cache_defer_req *dreq)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (*item) {
+		ret = cache_check(cd, &(*item)->h, &dreq->req);
+		if (ret)
+			*item = NULL;
+	}
+	return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (!*item)
+		goto out_err;
+	ret = -ETIMEDOUT;
+	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+			|| (*item)->h.expiry_time < get_seconds()
+			|| cd->flush_time > (*item)->h.last_refresh)
+		goto out_put;
+	ret = -ENOENT;
+	if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+		goto out_put;
+	return 0;
+out_put:
+	cache_put(&(*item)->h, cd);
+out_err:
+	*item = NULL;
+	return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	struct nfs_cache_defer_req *dreq;
+	int ret = -ENOMEM;
+
+	dreq = nfs_cache_defer_req_alloc();
+	if (!dreq)
+		goto out;
+	ret = do_cache_lookup(cd, key, item, dreq);
+	if (ret == -EAGAIN) {
+		ret = nfs_cache_wait_for_upcall(dreq);
+		if (!ret)
+			ret = do_cache_lookup_nowait(cd, key, item);
+	}
+	nfs_cache_defer_req_put(dreq);
+out:
+	return ret;
+}
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen)
+{
+	struct nfs_dns_ent key = {
+		.hostname = name,
+		.namelen = namelen,
+	};
+	struct nfs_dns_ent *item = NULL;
+	ssize_t ret;
+
+	ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+	if (ret == 0) {
+		if (salen >= item->addrlen) {
+			memcpy(sa, &item->addr, item->addrlen);
+			ret = item->addrlen;
+		} else
+			ret = -EOVERFLOW;
+		cache_put(&item->h, &nfs_dns_resolve);
+	} else if (ret == -ENOENT)
+		ret = -ESRCH;
+	return ret;
+}
+
+int nfs_dns_resolver_init(void)
+{
+	return nfs_cache_register(&nfs_dns_resolve);
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+	nfs_cache_unregister(&nfs_dns_resolve);
+}
+
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN	(128)
+
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe5a8b45d867..060022b4651c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "dns_resolve.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -1506,6 +1507,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfs_dns_resolver_init();
+	if (err < 0)
+		goto out8;
+
 	err = nfs_fscache_register();
 	if (err < 0)
 		goto out7;
@@ -1564,6 +1569,8 @@ out5:
 out6:
 	nfs_fscache_unregister();
 out7:
+	nfs_dns_resolver_destroy();
+out8:
 	return err;
 }
 
@@ -1575,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_inodecache();
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
+	nfs_dns_resolver_destroy();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-- 
cgit v1.2.2


From 7d7ea882898f23989209d0359691b356f73240dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 19 Aug 2009 18:12:34 -0400
Subject: NFS: Use the DNS resolver in the mount code.

In the referral code, use it to look up the new server's ip address if the
fs_locations attribute contains a hostname.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4namespace.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ef22ee89aa77..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
 #include <linux/inet.h>
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
 	return 0;
 }
 
+static size_t nfs_parse_server_name(char *string, size_t len,
+		struct sockaddr *sa, size_t salen)
+{
+	ssize_t ret;
+
+	ret = rpc_pton(string, len, sa, salen);
+	if (ret == 0) {
+		ret = nfs_dns_resolve_name(string, len, sa, salen);
+		if (ret < 0)
+			ret = 0;
+	}
+	return ret;
+}
+
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				     char *page, char *page2,
 				     const struct nfs4_fs_location *location)
@@ -121,7 +136,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
 			continue;
-		mountdata->addrlen = rpc_pton(buf->data, buf->len,
+		mountdata->addrlen = nfs_parse_server_name(buf->data,
+				buf->len,
 				mountdata->addr, mountdata->addrlen);
 		if (mountdata->addrlen == 0)
 			continue;
-- 
cgit v1.2.2


From e1af88a1ad8f4dea3a2d6c5637d94a3fc3c62994 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 19 Aug 2009 18:04:43 +0200
Subject: nfs: Remove reference to generic_osync_inode from a comment

generic_file_direct_write() no longer calls generic_osync_inode() so remove the
comment.

CC: linux-nfs@vger.kernel.org
CC: Neil Brown <neilb@suse.de>
CC: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4e089a8f294..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -934,9 +934,6 @@ out:
  * back into its cache.  We let the server do generic write
  * parameter checking and report problems.
  *
- * We also avoid an unnecessary invocation of generic_osync_inode(),
- * as it is fairly meaningless to sync the metadata of an NFS file.
- *
  * We eliminate local atime updates, see direct read above.
  *
  * We avoid unnecessary page cache invalidations for normal cached
-- 
cgit v1.2.2


From 939a9421eb53d3ea83188ae13802779041caefdb Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Thu, 20 Aug 2009 19:29:03 -0700
Subject: vfs: allow file truncations when both suid and write permissions set

When suid is set and the non-owner user has write permission, any writing
into this file should be allowed and suid should be removed after that.

However, current kernel only allows writing without truncations, when we
do truncations on that file, we get EPERM.  This is a bug.

Steps to reproduce this bug:

% ls -l rootdir/file1
-rwsrwsrwx 1 root root 3 Jun 25 15:42 rootdir/file1
% echo h > rootdir/file1
zsh: operation not permitted: rootdir/file1
% ls -l rootdir/file1
-rwsrwsrwx 1 root root 3 Jun 25 15:42 rootdir/file1
% echo h >> rootdir/file1
% ls -l rootdir/file1
-rwxrwxrwx 1 root root 5 Jun 25 16:34 rootdir/file1

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric Sandeen <esandeen@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Cc: Eugene Teo <eteo@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/open.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..40d1fa25f5aa 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
-	int err;
+	int ret;
 	struct iattr newattrs;
 
 	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	}
 
 	/* Remove suid/sgid on truncate too */
-	newattrs.ia_valid |= should_remove_suid(dentry);
+	ret = should_remove_suid(dentry);
+	if (ret)
+		newattrs.ia_valid |= ret | ATTR_FORCE;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	err = notify_change(dentry, &newattrs);
+	ret = notify_change(dentry, &newattrs);
 	mutex_unlock(&dentry->d_inode->i_mutex);
-	return err;
+	return ret;
 }
 
 static long do_sys_truncate(const char __user *pathname, loff_t length)
-- 
cgit v1.2.2


From 5eecfde615894dc1c2e3f85b515a96ae2e408fb5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Aug 2009 17:50:30 -0400
Subject: NFS: Handle a zero-length auth flavor list

Some releases of Linux rpc.mountd (nfs-utils 1.1.4 and later) return an
empty auth flavor list if no sec= was specified for the export.  This is
notably broken server behavior.

The new auth flavor list checking added in a recent commit rejects this
case.  The OpenSolaris client does too.

The broken mountd implementation is already widely deployed.  To avoid
a behavioral regression, the kernel's mount client skips flavor checking
(ie reverts to the pre-2.6.32 behavior) if mountd returns an empty
flavor list.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9c85cdb353aa..f3a95df4b95f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1337,6 +1337,16 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
 {
 	unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
 
+	/*
+	 * Certain releases of Linux's mountd return an empty
+	 * flavor list.  To prevent behavioral regression with
+	 * these servers (ie. rejecting mounts that used to
+	 * succeed), revert to pre-2.6.32 behavior (no checking)
+	 * if the returned flavor list is empty.
+	 */
+	if (server_authlist_len == 0)
+		return 0;
+
 	/*
 	 * We avoid sophisticated negotiating here, as there are
 	 * plenty of cases where we can get it wrong, providing
-- 
cgit v1.2.2


From cd0120751d631bc4b99f180c1c22de2caca98207 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sat, 22 Aug 2009 19:26:42 +0200
Subject: GFS2: jumping to wrong label?

Also a gfs2_glock_dq() is required here.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..ce551f85524f 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -349,7 +349,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
-		goto out_rgrp;
+		goto out_gunlock;
 
 	error = gfs2_dir_del(dip, &dentry->d_name);
         if (error)
-- 
cgit v1.2.2


From d34843d0c4a20872f3f3bfb510328bd043b939ff Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 24 Aug 2009 10:44:18 +0100
Subject: GFS2: Add "-o errors=panic|withdraw" mount options

This patch adds "-o errors=panic" and "-o errors=withdraw" to the
gfs2 mount options.  The "errors=withdraw" option is today's
current behaviour, meaning to withdraw from the file system if a
non-serious gfs2 error occurs.  The new "errors=panic" option
tells gfs2 to force a kernel panic if a non-serious gfs2 file
system error occurs.  This may be useful, for example, where
fabric-level fencing is used that has no way to reboot (such as
fence_scsi).

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  7 +++++++
 fs/gfs2/ops_fstype.c |  1 +
 fs/gfs2/super.c      | 36 ++++++++++++++++++++++++++++++++++++
 fs/gfs2/util.c       | 41 +++++++++++++++++++++++++++--------------
 4 files changed, 71 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..1d11e6e0373e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
 #define GFS2_DATA_WRITEBACK	1
 #define GFS2_DATA_ORDERED	2
 
+#define GFS2_ERRORS_DEFAULT     GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW    0
+#define GFS2_ERRORS_CONTINUE    1 /* place holder for future feature */
+#define GFS2_ERRORS_RO          2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC       3
+
 struct gfs2_args {
 	char ar_lockproto[GFS2_LOCKNAME_LEN];	/* Name of the Lock Protocol */
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
 	unsigned int ar_discard:1;		/* discard requests */
+	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	int ar_commit;				/* Commit interval */
 };
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 39021c020176..165518a1041e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1168,6 +1168,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
 	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
 	sdp->sd_args.ar_commit = 60;
+	sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
 
 	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 85bd2bc8c1de..7a5c128e8776 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -68,6 +68,8 @@ enum {
 	Opt_discard,
 	Opt_nodiscard,
 	Opt_commit,
+	Opt_err_withdraw,
+	Opt_err_panic,
 	Opt_error,
 };
 
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_commit, "commit=%d"},
+	{Opt_err_withdraw, "errors=withdraw"},
+	{Opt_err_panic, "errors=panic"},
 	{Opt_error, NULL}
 };
 
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 			args->ar_localcaching = 1;
 			break;
 		case Opt_debug:
+			if (args->ar_errors == GFS2_ERRORS_PANIC) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+				       "are mutually exclusive.\n");
+				return -EINVAL;
+			}
 			args->ar_debug = 1;
 			break;
 		case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 				return rv ? rv : -EINVAL;
 			}
 			break;
+		case Opt_err_withdraw:
+			args->ar_errors = GFS2_ERRORS_WITHDRAW;
+			break;
+		case Opt_err_panic:
+			if (args->ar_debug) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+					"are mutually exclusive.\n");
+				return -EINVAL;
+			}
+			args->ar_errors = GFS2_ERRORS_PANIC;
+			break;
 		case Opt_error:
 		default:
 			fs_info(sdp, "invalid mount option: %s\n", o);
@@ -1226,6 +1246,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	lfsecs = sdp->sd_tune.gt_log_flush_secs;
 	if (lfsecs != 60)
 		seq_printf(s, ",commit=%d", lfsecs);
+	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+		const char *state;
+
+		switch (args->ar_errors) {
+		case GFS2_ERRORS_WITHDRAW:
+			state = "withdraw";
+			break;
+		case GFS2_ERRORS_PANIC:
+			state = "panic";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",errors=%s", state);
+	}
 	return 0;
 }
 
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 	const struct lm_lockops *lm = ls->ls_ops;
 	va_list args;
 
-	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
 		return 0;
 
 	va_start(args, fmt);
 	vprintk(fmt, args);
 	va_end(args);
 
-	fs_err(sdp, "about to withdraw this file system\n");
-	BUG_ON(sdp->sd_args.ar_debug);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+		fs_err(sdp, "about to withdraw this file system\n");
+		BUG_ON(sdp->sd_args.ar_debug);
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
 
-	if (lm->lm_unmount) {
-		fs_err(sdp, "telling LM to unmount\n");
-		lm->lm_unmount(sdp);
+		if (lm->lm_unmount) {
+			fs_err(sdp, "telling LM to unmount\n");
+			lm->lm_unmount(sdp);
+		}
+		fs_err(sdp, "withdrawn\n");
+		dump_stack();
 	}
-	fs_err(sdp, "withdrawn\n");
-	dump_stack();
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
 
 	return -1;
 }
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
 			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
 		return -2;
 
-	printk(KERN_WARNING
-	       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
-	       "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
-	       sdp->sd_fsname, assertion,
-	       sdp->sd_fsname, function, file, line);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+		printk(KERN_WARNING
+		       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		       "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		       sdp->sd_fsname, assertion,
+		       sdp->sd_fsname, function, file, line);
 
 	if (sdp->sd_args.ar_debug)
 		BUG();
 	else
 		dump_stack();
 
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		      "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		      sdp->sd_fsname, assertion,
+		      sdp->sd_fsname, function, file, line);
+
 	sdp->sd_last_warning = jiffies;
 
 	return -1;
-- 
cgit v1.2.2


From 063c4c99630c0b06afad080d2a18bda64172c1a2 Mon Sep 17 00:00:00 2001
From: Lars Marowsky-Bree <lmb@suse.de>
Date: Tue, 11 Aug 2009 16:18:23 -0500
Subject: dlm: fix connection close handling

Closing a connection to a node can create problems if there are
outstanding messages for that node.  The problems include dlm_send
spinning attempting to reconnect, or BUG from tcp_connect_to_sock()
attempting to use a partially closed connection.

To cleanly close a connection, we now first attempt to send any pending
messages, cancel any remaining workqueue work, and flag the connection
as closed to avoid reconnect attempts.

Signed-off-by: Lars Marowsky-Bree <lmb@suse.de>
Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 210d52c48808..bda690cd3640 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
 #define CF_CONNECT_PENDING 3
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
 
 static inline void lowcomms_connect_sock(struct connection *con)
 {
+	if (test_bit(CF_CLOSE, &con->flags))
+		return;
 	if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }
@@ -1368,6 +1371,13 @@ int dlm_lowcomms_close(int nodeid)
 	log_print("closing connection to node %d", nodeid);
 	con = nodeid2con(nodeid, 0);
 	if (con) {
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_WRITE_PENDING, &con->flags);
+		set_bit(CF_CLOSE, &con->flags);
+		if (cancel_work_sync(&con->swork))
+			log_print("canceled swork for node %d", nodeid);
+		if (cancel_work_sync(&con->rwork))
+			log_print("canceled rwork for node %d", nodeid);
 		clean_one_writequeue(con);
 		close_connection(con, true);
 	}
@@ -1393,9 +1403,10 @@ static void process_send_sockets(struct work_struct *work)
 
 	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
 		con->connect_action(con);
+		set_bit(CF_WRITE_PENDING, &con->flags);
 	}
-	clear_bit(CF_WRITE_PENDING, &con->flags);
-	send_to_sock(con);
+	if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+		send_to_sock(con);
 }
 
 
-- 
cgit v1.2.2


From 1329e3f2c898cfabb6ed236d3fb8c1725197af53 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Mon, 24 Aug 2009 13:18:04 -0500
Subject: dlm: use kernel_sendpage

Using kernel_sendpage() is cleaner and safer than following
sock->ops ourselves.

Signed-off-by: Paolo Bonzini <bonzini@gnu.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index bda690cd3640..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1285,7 +1285,6 @@ out:
 static void send_to_sock(struct connection *con)
 {
 	int ret = 0;
-	ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset;
@@ -1294,8 +1293,6 @@ static void send_to_sock(struct connection *con)
 	if (con->sock == NULL)
 		goto out_connect;
 
-	sendpage = con->sock->ops->sendpage;
-
 	spin_lock(&con->writequeue_lock);
 	for (;;) {
 		e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1310,8 +1307,8 @@ static void send_to_sock(struct connection *con)
 
 		ret = 0;
 		if (len) {
-			ret = sendpage(con->sock, e->page, offset, len,
-				       msg_flags);
+			ret = kernel_sendpage(con->sock, e->page, offset, len,
+					      msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
 				cond_resched();
 				goto out;
-- 
cgit v1.2.2


From a8526e84ac758ac6da45cf273aa1538a6a7aa3de Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 25 Aug 2009 22:36:05 -0400
Subject: ext4: Add missing unlock_new_inode() call in extent migration code

We need to unlock the new inode before iput.  This patch fixes the
following warning when calling chattr +e to migrate a file to use
extents.  It also fixes problems in when e4defrag attempts to
defragment an inode.

[  470.400044] ------------[ cut here ]------------
[  470.400065] WARNING: at fs/inode.c:1210 generic_delete_inode+0x65/0x16a()
[  470.400072] Hardware name: N/A
.....
...
[  470.400353] Pid: 4451, comm: chattr Not tainted 2.6.31-rc7-red-debug #4
[  470.400359] Call Trace:
[  470.400372]  [<ffffffff81037771>] warn_slowpath_common+0x77/0x8f
[  470.400385]  [<ffffffff81037798>] warn_slowpath_null+0xf/0x11
[  470.400395]  [<ffffffff810b7f28>] generic_delete_inode+0x65/0x16a
[  470.400405]  [<ffffffff810b8044>] generic_drop_inode+0x17/0x1bd
[  470.400413]  [<ffffffff810b7083>] iput+0x61/0x65
[  470.400455]  [<ffffffffa003b229>] ext4_ext_migrate+0x5eb/0x66a [ext4]
[  470.400492]  [<ffffffffa002b1f8>] ext4_ioctl+0x340/0x756 [ext4]
[  470.400507]  [<ffffffff810b1a91>] vfs_ioctl+0x1d/0x82
[  470.400517]  [<ffffffff810b1ff0>] do_vfs_ioctl+0x483/0x4c9
[  470.400527]  [<ffffffff81059c30>] ? trace_hardirqs_on+0xd/0xf
[  470.400537]  [<ffffffff810b2087>] sys_ioctl+0x51/0x74
[  470.400549]  [<ffffffff8100ba6b>] system_call_fastpath+0x16/0x1b
[  470.400557] ---[ end trace ab85723542352dac ]---

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..05361ad5b80a 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -618,7 +618,7 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-
+	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
 
 	return retval;
-- 
cgit v1.2.2


From 1927805e6599d8602d2c0af6a0155c85acc0b214 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 25 Aug 2009 22:36:25 -0400
Subject: ext4: use variables not types in sizeofs() for allocations

Precursor to changing some types; to keep things in sync, it
seems better to allocate/memset based on the size of the
variables we are using rather than on some disconnected
basic type like "unsigned short"

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/mballoc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1a9cd7de04dd..081eaf7fce89 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -868,7 +868,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
-			       sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+			       sizeof(*grinfo->bb_counters) *
+				(sb->s_blocksize_bits+2));
 			/*
 			 * incore got set to the group block bitmap below
 			 */
@@ -2640,14 +2641,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned max;
 	int ret;
 
-	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
 		return -ENOMEM;
 	}
 
-	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
 		kfree(sbi->s_mb_offsets);
-- 
cgit v1.2.2


From a36b44988cef1fc007535107013571fa691a2d7f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 25 Aug 2009 22:36:45 -0400
Subject: ext4: use ext4_grpblk_t more extensively

unsigned  short is potentially too small to track blocks within
a group; today it is safe due to restrictions in e2fsprogs but
we have _lo / _hi bits for group blocks with the intent to go
up to 32 bits, so clean this up now.

There are many more places where we use unsigned/int/unsigned int
to contain a group block but this should at least fix all the
short types.

I added a few comments to the struct ext4_group_info definition
as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 11 +++++++----
 fs/ext4/mballoc.c | 18 +++++++++---------
 fs/ext4/mballoc.h |  6 +++---
 3 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 02b22885eb02..41a76e163b99 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1577,15 +1577,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 struct ext4_group_info {
 	unsigned long   bb_state;
 	struct rb_root  bb_free_root;
-	unsigned short  bb_first_free;
-	unsigned short  bb_free;
-	unsigned short  bb_fragments;
+	ext4_grpblk_t	bb_first_free;	/* first free block */
+	ext4_grpblk_t	bb_free;	/* total free blocks */
+	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
 #endif
 	struct rw_semaphore alloc_sem;
-	unsigned short  bb_counters[];
+	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
+					 * regions, index is order.
+					 * bb_counters[3] = 5 means
+					 * 5 free 8-block regions. */
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 081eaf7fce89..3086b3c65adc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -623,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 
 /* FIXME!! need more doc */
 static void ext4_mb_mark_free_simple(struct super_block *sb,
-				void *buddy, unsigned first, int len,
+				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 					struct ext4_group_info *grp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned short min;
-	unsigned short max;
-	unsigned short chunk;
+	ext4_grpblk_t min;
+	ext4_grpblk_t max;
+	ext4_grpblk_t chunk;
 	unsigned short border;
 
 	BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -663,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
-	unsigned short i = 0;
-	unsigned short first;
-	unsigned short len;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t i = 0;
+	ext4_grpblk_t first;
+	ext4_grpblk_t len;
 	unsigned free = 0;
 	unsigned fragments = 0;
 	unsigned long long period = get_cycles();
@@ -2325,7 +2325,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	struct ext4_buddy e4b;
 	struct sg {
 		struct ext4_group_info info;
-		unsigned short counters[16];
+		ext4_grpblk_t counters[16];
 	} sg;
 
 	group--;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9db890d4d275..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -136,8 +136,8 @@ struct ext4_prealloc_space {
 	unsigned		pa_deleted;
 	ext4_fsblk_t		pa_pstart;	/* phys. block */
 	ext4_lblk_t		pa_lstart;	/* log. block */
-	unsigned short		pa_len;		/* len of preallocated chunk */
-	unsigned short		pa_free;	/* how many blocks are free */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
 	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
@@ -152,7 +152,7 @@ struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
 	ext4_grpblk_t fe_start;
 	ext4_group_t fe_group;
-	int fe_len;
+	ext4_grpblk_t fe_len;
 };
 
 /*
-- 
cgit v1.2.2


From 40b78a322365aa1d87770200f7fc7de3b361c11a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Aug 2009 18:41:32 +0100
Subject: GFS2: Clean up of extended attribute support

This has been on my list for some time. We need to change the way
in which we handle extended attributes to allow faster file creation
times (by reducing the number of transactions required) and the
extended attribute code is the main obstacle to this.

In addition to that, the VFS provides a way to demultiplex the xattr
calls which we ought to be using, rather than rolling our own. This
patch changes the GFS2 code to use that VFS feature and as a result
the code shrinks by a couple of hundred lines or so, and becomes
easier to read.

I'm planning on doing further clean up work in this area, but this
patch is a good start. The cleaned up code also uses the more usual
"xattr" shorthand, I plan to eliminate the use of "eattr" eventually
and in the mean time it serves as a flag as to which bits of the code
have been updated.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile     |   2 +-
 fs/gfs2/acl.c        | 104 ++++++-------
 fs/gfs2/eaops.c      | 157 -------------------
 fs/gfs2/eaops.h      |  30 ----
 fs/gfs2/eattr.c      | 421 +++++++++++++++++++++++++++++----------------------
 fs/gfs2/eattr.h      |  52 ++-----
 fs/gfs2/file.c       |   1 -
 fs/gfs2/inode.c      |  12 +-
 fs/gfs2/ops_fstype.c |   1 +
 fs/gfs2/ops_inode.c  |  78 +++++-----
 fs/gfs2/super.h      |   1 +
 11 files changed, 333 insertions(+), 526 deletions(-)
 delete mode 100644 fs/gfs2/eaops.c
 delete mode 100644 fs/gfs2/eaops.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..2a4fd894858a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
 	aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..f6777f167260 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,7 +19,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -31,8 +30,7 @@
 #define ACL_DEFAULT 0
 
 int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-		      struct gfs2_ea_request *er,
-		      int *remove, mode_t *mode)
+			  struct gfs2_ea_request *er, int *remove, mode_t *mode)
 {
 	struct posix_acl *acl;
 	int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
 	return 0;
 }
 
-static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
-		   struct gfs2_ea_location *el, char **data, unsigned int *len)
+static int acl_get(struct gfs2_inode *ip, const char *name,
+		   struct posix_acl **acl, struct gfs2_ea_location *el,
+		   char **datap, unsigned int *lenp)
 {
-	struct gfs2_ea_request er;
-	struct gfs2_ea_location el_this;
+	char *data;
+	unsigned int len;
 	int error;
 
+	el->el_bh = NULL;
+
 	if (!ip->i_eattr)
 		return 0;
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	if (access) {
-		er.er_name = GFS2_POSIX_ACL_ACCESS;
-		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
-	} else {
-		er.er_name = GFS2_POSIX_ACL_DEFAULT;
-		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
-	}
-	er.er_type = GFS2_EATYPE_SYS;
-
-	if (!el)
-		el = &el_this;
-
-	error = gfs2_ea_find(ip, &er, el);
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
 	if (error)
 		return error;
 	if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
 	if (!GFS2_EA_DATA_LEN(el->el_ea))
 		goto out;
 
-	er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
-	er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
+	len = GFS2_EA_DATA_LEN(el->el_ea);
+	data = kmalloc(len, GFP_NOFS);
 	error = -ENOMEM;
-	if (!er.er_data)
+	if (!data)
 		goto out;
 
-	error = gfs2_ea_get_copy(ip, el, er.er_data);
-	if (error)
+	error = gfs2_ea_get_copy(ip, el, data, len);
+	if (error < 0)
 		goto out_kfree;
+	error = 0;
 
 	if (acl) {
-		*acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+		*acl = posix_acl_from_xattr(data, len);
 		if (IS_ERR(*acl))
 			error = PTR_ERR(*acl);
 	}
 
 out_kfree:
-	if (error || !data)
-		kfree(er.er_data);
-	else {
-		*data = er.er_data;
-		*len = er.er_data_len;
+	if (error || !datap) {
+		kfree(data);
+	} else {
+		*datap = data;
+		*lenp = len;
 	}
 out:
-	if (error || el == &el_this)
-		brelse(el->el_bh);
 	return error;
 }
 
@@ -153,10 +140,12 @@ out:
 
 int gfs2_check_acl(struct inode *inode, int mask)
 {
+	struct gfs2_ea_location el;
 	struct posix_acl *acl = NULL;
 	int error;
 
-	error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+	error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+	brelse(el.el_bh);
 	if (error)
 		return error;
 
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
 
 int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 {
+	struct gfs2_ea_location el;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct posix_acl *acl = NULL, *clone;
-	struct gfs2_ea_request er;
 	mode_t mode = ip->i_inode.i_mode;
+	char *data = NULL;
+	unsigned int len;
 	int error;
 
 	if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (S_ISLNK(ip->i_inode.i_mode))
 		return 0;
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = GFS2_EATYPE_SYS;
-
-	error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
-			&er.er_data, &er.er_data_len);
+	error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+	brelse(el.el_bh);
 	if (error)
 		return error;
 	if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	acl = clone;
 
 	if (S_ISDIR(ip->i_inode.i_mode)) {
-		er.er_name = GFS2_POSIX_ACL_DEFAULT;
-		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
-		error = gfs2_system_eaops.eo_set(ip, &er);
+		error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+				       GFS2_POSIX_ACL_DEFAULT, data, len, 0);
 		if (error)
 			goto out;
 	}
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	error = posix_acl_create_masq(acl, &mode);
 	if (error < 0)
 		goto out;
-	if (error > 0) {
-		er.er_name = GFS2_POSIX_ACL_ACCESS;
-		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
-		posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
-		er.er_mode = mode;
-		er.er_flags = GFS2_ERF_MODE;
-		error = gfs2_system_eaops.eo_set(ip, &er);
-		if (error)
-			goto out;
-	} else
-		munge_mode(ip, mode);
+	if (error == 0)
+		goto munge;
 
+	posix_acl_to_xattr(acl, data, len);
+	error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+			       GFS2_POSIX_ACL_ACCESS, data, len, 0);
+	if (error)
+		goto out;
+munge:
+	error = munge_mode(ip, mode);
 out:
 	posix_acl_release(acl);
-	kfree(er.er_data);
+	kfree(data);
 	return error;
 }
 
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 	unsigned int len;
 	int error;
 
-	error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+	error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
 	if (error)
-		return error;
+		goto out_brelse;
 	if (!acl)
 		return gfs2_setattr_simple(ip, attr);
 
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 
 out:
 	posix_acl_release(acl);
-	brelse(el.el_bh);
 	kfree(data);
+out_brelse:
+	brelse(el.el_bh);
 	return error;
 }
 
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
-#include "util.h"
-
-/**
- * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
- * @namep: ea name, possibly with type appended
- *
- * Returns: GFS2_EATYPE_XXX
- */
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
-{
-	unsigned int type;
-
-	if (strncmp(name, "system.", 7) == 0) {
-		type = GFS2_EATYPE_SYS;
-		if (truncated_name)
-			*truncated_name = name + sizeof("system.") - 1;
-	} else if (strncmp(name, "user.", 5) == 0) {
-		type = GFS2_EATYPE_USR;
-		if (truncated_name)
-			*truncated_name = name + sizeof("user.") - 1;
-	} else if (strncmp(name, "security.", 9) == 0) {
-		type = GFS2_EATYPE_SECURITY;
-		if (truncated_name)
-			*truncated_name = name + sizeof("security.") - 1;
-	} else {
-		type = GFS2_EATYPE_UNUSED;
-		if (truncated_name)
-			*truncated_name = NULL;
-	}
-
-	return type;
-}
-
-static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
-	    !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
-	    (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
-	     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
-		return -EOPNOTSUPP;
-
-	return gfs2_ea_get_i(ip, er);
-}
-
-static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	int remove = 0;
-	int error;
-
-	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
-		if (!(er->er_flags & GFS2_ERF_MODE)) {
-			er->er_mode = ip->i_inode.i_mode;
-			er->er_flags |= GFS2_ERF_MODE;
-		}
-		error = gfs2_acl_validate_set(ip, 1, er,
-					      &remove, &er->er_mode);
-		if (error)
-			return error;
-		error = gfs2_ea_set_i(ip, er);
-		if (error)
-			return error;
-		if (remove)
-			gfs2_ea_remove_i(ip, er);
-		return 0;
-
-	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
-		error = gfs2_acl_validate_set(ip, 0, er,
-					      &remove, NULL);
-		if (error)
-			return error;
-		if (!remove)
-			error = gfs2_ea_set_i(ip, er);
-		else {
-			error = gfs2_ea_remove_i(ip, er);
-			if (error == -ENODATA)
-				error = 0;
-		}
-		return error;
-	}
-
-	return -EPERM;
-}
-
-static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
-		int error = gfs2_acl_validate_remove(ip, 1);
-		if (error)
-			return error;
-
-	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
-		int error = gfs2_acl_validate_remove(ip, 0);
-		if (error)
-			return error;
-
-	} else
-		return -EPERM;
-
-	return gfs2_ea_remove_i(ip, er);
-}
-
-static const struct gfs2_eattr_operations gfs2_user_eaops = {
-	.eo_get = gfs2_ea_get_i,
-	.eo_set = gfs2_ea_set_i,
-	.eo_remove = gfs2_ea_remove_i,
-	.eo_name = "user",
-};
-
-const struct gfs2_eattr_operations gfs2_system_eaops = {
-	.eo_get = system_eo_get,
-	.eo_set = system_eo_set,
-	.eo_remove = system_eo_remove,
-	.eo_name = "system",
-};
-
-static const struct gfs2_eattr_operations gfs2_security_eaops = {
-	.eo_get = gfs2_ea_get_i,
-	.eo_set = gfs2_ea_set_i,
-	.eo_remove = gfs2_ea_remove_i,
-	.eo_name = "security",
-};
-
-const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
-	NULL,
-	&gfs2_user_eaops,
-	&gfs2_system_eaops,
-	&gfs2_security_eaops,
-};
-
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EAOPS_DOT_H__
-#define __EAOPS_DOT_H__
-
-struct gfs2_ea_request;
-struct gfs2_inode;
-
-struct gfs2_eattr_operations {
-	int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	char *eo_name;
-};
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
-
-extern const struct gfs2_eattr_operations gfs2_system_eaops;
-
-extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
-
-#endif /* __EAOPS_DOT_H__ */
-
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 07ea9529adda..d9a9e260d5ac 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -18,7 +18,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -38,26 +37,32 @@
  * Returns: 1 if the EA should be stuffed
  */
 
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
 			unsigned int *size)
 {
-	*size = GFS2_EAREQ_SIZE_STUFFED(er);
-	if (*size <= sdp->sd_jbsize)
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
 		return 1;
 
-	*size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
 
 	return 0;
 }
 
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
 {
 	unsigned int size;
 
-	if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
 		return -ERANGE;
 
-	ea_calc_size(sdp, er, &size);
+	ea_calc_size(sdp, nsize, dsize, &size);
 
 	/* This can only happen with 512 byte blocks */
 	if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
 }
 
 struct ea_find {
-	struct gfs2_ea_request *ef_er;
+	int type;
+	const char *name;
+	size_t namel;
 	struct gfs2_ea_location *ef_el;
 };
 
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     void *private)
 {
 	struct ea_find *ef = private;
-	struct gfs2_ea_request *er = ef->ef_er;
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED)
 		return 0;
 
-	if (ea->ea_type == er->er_type) {
-		if (ea->ea_name_len == er->er_name_len &&
-		    !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
 			struct gfs2_ea_location *el = ef->ef_el;
 			get_bh(bh);
 			el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
 		 struct gfs2_ea_location *el)
 {
 	struct ea_find ef;
 	int error;
 
-	ef.ef_er = er;
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
 	ef.ef_el = el;
 
 	memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
 	unsigned int ei_size;
 };
 
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
 static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
 		     void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 }
 
 /**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_ea_request er;
 	struct gfs2_holder i_gh;
 	int error;
 
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
 	}
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 		return error;
 
 	if (ip->i_eattr) {
-		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
 
 		error = ea_foreach(ip, ea_list_i, &ei);
 		if (!error)
@@ -491,83 +517,60 @@ out:
 }
 
 int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data)
+		     char *data, size_t size)
 {
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
 	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
-		return 0;
-	} else
-		return ea_get_unstuffed(ip, el->el_ea, data);
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	if (ret < 0)
+		return ret;
+	return len;
 }
 
 /**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+		   void *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
 		return -ENODATA;
-
-	if (er->er_data_len) {
-		if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
-			error =  -ERANGE;
-		else
-			error = gfs2_ea_get_copy(ip, &el, er->er_data);
-	}
-	if (!error)
+	if (size) 
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
 		error = GFS2_EA_DATA_LEN(el.el_ea);
-
 	brelse(el.el_bh);
 
 	return error;
 }
 
-/**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len ||
-	    er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return error;
-
-	error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
 /**
  * ea_alloc_blk - allocates a new block for extended attributes.
  * @ip: A pointer to the inode that's getting extended attributes
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		if (er->er_flags & GFS2_ERF_MODE) {
-			gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-					    (ip->i_inode.i_mode & S_IFMT) ==
-					    (er->er_mode & S_IFMT));
-			ip->i_inode.i_mode = er->er_mode;
-		}
 		ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
  * Returns: errno
  */
 
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
 {
+	struct gfs2_ea_request er;
 	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
 	unsigned int blks = 1;
 
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
 }
 
 static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 		goto out;
-
-	if (er->er_flags & GFS2_ERF_MODE) {
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-			(ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
-		ip->i_inode.i_mode = er->er_mode;
-	}
 	ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
 	int stuffed;
 	int error;
 
-	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
 		if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
 	return error;
 }
 
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-		    struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
 {
+	struct gfs2_ea_request er;
 	struct ea_set es;
 	unsigned int blks = 2;
 	int error;
 
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
 	memset(&es, 0, sizeof(struct ea_set));
-	es.es_er = er;
+	es.es_er = &er;
 	es.es_el = el;
 
 	error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
 		blks++;
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
 }
 
 static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1042,74 +1049,6 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
 	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
 }
 
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr) {
-		if (er->er_flags & XATTR_REPLACE)
-			return -ENODATA;
-		return ea_init(ip, er);
-	}
-
-	error = gfs2_ea_find(ip, er, &el);
-	if (error)
-		return error;
-
-	if (el.el_ea) {
-		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
-			brelse(el.el_bh);
-			return -EPERM;
-		}
-
-		error = -EEXIST;
-		if (!(er->er_flags & XATTR_CREATE)) {
-			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
-			error = ea_set_i(ip, er, &el);
-			if (!error && unstuffed)
-				ea_set_remove_unstuffed(ip, &el);
-		}
-
-		brelse(el.el_bh);
-	} else {
-		error = -ENODATA;
-		if (!(er->er_flags & XATTR_REPLACE))
-			error = ea_set_i(ip, er, NULL);
-	}
-
-	return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-	error = ea_check_size(GFS2_SB(&ip->i_inode), er);
-	if (error)
-		return error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-	if (error)
-		return error;
-
-	if (IS_IMMUTABLE(&ip->i_inode))
-		error = -EPERM;
-	else
-		error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
 static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 {
 	struct gfs2_ea_header *ea = el->el_ea;
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 
 		if (GFS2_EA_IS_LAST(ea))
 			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else
+	} else {
 		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	return error;
 }
 
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	if (GFS2_EA_IS_STUFFED(el.el_ea))
 		error = ea_remove_stuffed(ip, &el);
 	else
-		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
-					    0);
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
 
 	brelse(el.el_bh);
 
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 }
 
 /**
- * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
- * @ip: pointer to the inode of the target file
- * @er: request information
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
  *
- * Returns: errno
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
  */
 
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+		   const void *value, size_t size, int flags)
 {
-	struct gfs2_holder i_gh;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
 	int error;
 
-	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
 
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (value == NULL)
+		return gfs2_xattr_remove(inode, type, name);
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 
-	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-		error = -EPERM;
-	else
-		error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
 
-	gfs2_glock_dq_uninit(&i_gh);
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
 
 	return error;
 }
@@ -1503,3 +1495,64 @@ out_alloc:
 	return error;
 }
 
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+			       void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+			       const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+				 void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get    = gfs2_xattr_user_get,
+	.set    = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get    = gfs2_xattr_security_get,
+	.set    = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.get    = gfs2_xattr_system_get,
+	.set    = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	&gfs2_xattr_system_handler,
+	NULL,
+};
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
index c82dbe01d713..4040a188f63b 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/eattr.h
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
 #define GFS2_EAREQ_SIZE_STUFFED(er) \
 ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
 
-#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
-      sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
-
 #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
 #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
 
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
 #define GFS2_EA_BH2FIRST(bh) \
 ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
 
-#define GFS2_ERF_MODE 0x80000000
-
 struct gfs2_ea_request {
 	const char *er_name;
 	char *er_data;
 	unsigned int er_name_len;
 	unsigned int er_data_len;
 	unsigned int er_type; /* GFS2_EATYPE_... */
-	int er_flags;
-	mode_t er_mode;
 };
 
 struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
 	struct gfs2_ea_header *el_prev;
 };
 
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+			  void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+			  const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 
 /* Exported to acl.c */
 
-int gfs2_ea_find(struct gfs2_inode *ip,
-		 struct gfs2_ea_request *er,
-		 struct gfs2_ea_location *el);
-int gfs2_ea_get_copy(struct gfs2_inode *ip,
-		     struct gfs2_ea_location *el,
-		     char *data);
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		      struct iattr *attr, char *data);
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
-	switch (ea->ea_type) {
-	case GFS2_EATYPE_USR:
-		return 5 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SYS:
-		return 7 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SECURITY:
-		return 9 + ea->ea_name_len + 1;
-	default:
-		return 0;
-	}
-}
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			     struct iattr *attr, char *data);
 
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-#include "eaops.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f9b4fe886540..f936d2d68138 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -924,7 +924,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	size_t len;
 	void *value;
 	char *name;
-	struct gfs2_ea_request er;
 
 	err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
 					   &name, &value, &len);
@@ -935,16 +934,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
 		return err;
 	}
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-
-	er.er_type = GFS2_EATYPE_SECURITY;
-	er.er_name = name;
-	er.er_data = value;
-	er.er_name_len = strlen(name);
-	er.er_data_len = len;
-
-	err = gfs2_ea_set_i(ip, &er);
-
+	err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
 	kfree(value);
 	kfree(name);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 165518a1041e..1ec69e63c694 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1186,6 +1186,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
+	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index ce551f85524f..1ee0c2657467 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,7 +26,6 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 			 const void *data, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = (char *)data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-	er.er_flags = flags;
-
-	gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_set(GFS2_I(inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_setxattr(dentry, name, data, size, flags);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
 			     void *data, size_t size)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-
-	return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_data = (size) ? buffer : NULL;
-	er.er_data_len = size;
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_removexattr(struct dentry *dentry, const char *name)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_name_len = strlen(er.er_name);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_removexattr(dentry, name);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 911c954cefbd..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
-- 
cgit v1.2.2


From 307cf6e63cfa5025589ea1a06db44439a43819ff Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Aug 2009 18:51:04 +0100
Subject: GFS2: Rename eattr.[ch] as xattr.[ch]

Use the more conventional name for the extended attribute
support code. Update all the places which care.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile    |    2 +-
 fs/gfs2/acl.c       |    2 +-
 fs/gfs2/eattr.c     | 1558 ---------------------------------------------------
 fs/gfs2/eattr.h     |   72 ---
 fs/gfs2/inode.c     |    2 +-
 fs/gfs2/ops_inode.c |    2 +-
 fs/gfs2/super.c     |    2 +-
 fs/gfs2/xattr.c     | 1558 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/xattr.h     |   72 +++
 9 files changed, 1635 insertions(+), 1635 deletions(-)
 delete mode 100644 fs/gfs2/eattr.c
 delete mode 100644 fs/gfs2/eattr.h
 create mode 100644 fs/gfs2/xattr.c
 create mode 100644 fs/gfs2/xattr.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 2a4fd894858a..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
 	aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f6777f167260..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,7 +19,7 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
deleted file mode 100644
index d9a9e260d5ac..000000000000
--- a/fs/gfs2/eattr.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eattr.h"
-#include "glock.h"
-#include "inode.h"
-#include "meta_io.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-
-/**
- * ea_calc_size - returns the acutal number of bytes the request will take up
- *                (not counting any unstuffed data blocks)
- * @sdp:
- * @er:
- * @size:
- *
- * Returns: 1 if the EA should be stuffed
- */
-
-static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
-			unsigned int *size)
-{
-	unsigned int jbsize = sdp->sd_jbsize;
-
-	/* Stuffed */
-	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
-
-	if (*size <= jbsize)
-		return 1;
-
-	/* Unstuffed */
-	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
-		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
-
-	return 0;
-}
-
-static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
-{
-	unsigned int size;
-
-	if (dsize > GFS2_EA_MAX_DATA_LEN)
-		return -ERANGE;
-
-	ea_calc_size(sdp, nsize, dsize, &size);
-
-	/* This can only happen with 512 byte blocks */
-	if (size > sdp->sd_jbsize)
-		return -ERANGE;
-
-	return 0;
-}
-
-typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
-			  struct gfs2_ea_header *ea,
-			  struct gfs2_ea_header *prev, void *private);
-
-static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
-			ea_call_t ea_call, void *data)
-{
-	struct gfs2_ea_header *ea, *prev = NULL;
-	int error = 0;
-
-	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
-		return -EIO;
-
-	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
-		if (!GFS2_EA_REC_LEN(ea))
-			goto fail;
-		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
-						  bh->b_data + bh->b_size))
-			goto fail;
-		if (!GFS2_EATYPE_VALID(ea->ea_type))
-			goto fail;
-
-		error = ea_call(ip, bh, ea, prev, data);
-		if (error)
-			return error;
-
-		if (GFS2_EA_IS_LAST(ea)) {
-			if ((char *)GFS2_EA2NEXT(ea) !=
-			    bh->b_data + bh->b_size)
-				goto fail;
-			break;
-		}
-	}
-
-	return error;
-
-fail:
-	gfs2_consist_inode(ip);
-	return -EIO;
-}
-
-static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
-{
-	struct buffer_head *bh, *eabh;
-	__be64 *eablk, *end;
-	int error;
-
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
-	if (error)
-		return error;
-
-	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
-		error = ea_foreach_i(ip, bh, ea_call, data);
-		goto out;
-	}
-
-	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
-		error = -EIO;
-		goto out;
-	}
-
-	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
-	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
-		if (error)
-			break;
-		error = ea_foreach_i(ip, eabh, ea_call, data);
-		brelse(eabh);
-		if (error)
-			break;
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-struct ea_find {
-	int type;
-	const char *name;
-	size_t namel;
-	struct gfs2_ea_location *ef_el;
-};
-
-static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
-		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-		     void *private)
-{
-	struct ea_find *ef = private;
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED)
-		return 0;
-
-	if (ea->ea_type == ef->type) {
-		if (ea->ea_name_len == ef->namel &&
-		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
-			struct gfs2_ea_location *el = ef->ef_el;
-			get_bh(bh);
-			el->el_bh = bh;
-			el->el_ea = ea;
-			el->el_prev = prev;
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-		 struct gfs2_ea_location *el)
-{
-	struct ea_find ef;
-	int error;
-
-	ef.type = type;
-	ef.name = name;
-	ef.namel = strlen(name);
-	ef.ef_el = el;
-
-	memset(el, 0, sizeof(struct gfs2_ea_location));
-
-	error = ea_foreach(ip, ea_find_i, &ef);
-	if (error > 0)
-		return 0;
-
-	return error;
-}
-
-/**
- * ea_dealloc_unstuffed -
- * @ip:
- * @bh:
- * @ea:
- * @prev:
- * @private:
- *
- * Take advantage of the fact that all unstuffed blocks are
- * allocated from the same RG.  But watch, this may not always
- * be true.
- *
- * Returns: errno
- */
-
-static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
-				struct gfs2_ea_header *ea,
-				struct gfs2_ea_header *prev, void *private)
-{
-	int *leave = private;
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder rg_gh;
-	struct buffer_head *dibh;
-	__be64 *dataptrs;
-	u64 bn = 0;
-	u64 bstart = 0;
-	unsigned int blen = 0;
-	unsigned int blks = 0;
-	unsigned int x;
-	int error;
-
-	if (GFS2_EA_IS_STUFFED(ea))
-		return 0;
-
-	dataptrs = GFS2_EA2DATAPTRS(ea);
-	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
-		if (*dataptrs) {
-			blks++;
-			bn = be64_to_cpu(*dataptrs);
-		}
-	}
-	if (!blks)
-		return 0;
-
-	rgd = gfs2_blk2rgrpd(sdp, bn);
-	if (!rgd) {
-		gfs2_consist_inode(ip);
-		return -EIO;
-	}
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
-				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-
-	dataptrs = GFS2_EA2DATAPTRS(ea);
-	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
-		if (!*dataptrs)
-			break;
-		bn = be64_to_cpu(*dataptrs);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
-			bstart = bn;
-			blen = 1;
-		}
-
-		*dataptrs = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
-
-	if (prev && !leave) {
-		u32 len;
-
-		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-		prev->ea_rec_len = cpu_to_be32(len);
-
-		if (GFS2_EA_IS_LAST(ea))
-			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-		ea->ea_num_ptrs = 0;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_uninit(&rg_gh);
-	return error;
-}
-
-static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
-			       struct gfs2_ea_header *ea,
-			       struct gfs2_ea_header *prev, int leave)
-{
-	struct gfs2_alloc *al;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out_alloc;
-
-	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-	if (error)
-		goto out_quota;
-
-	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
-
-	gfs2_glock_dq_uninit(&al->al_ri_gh);
-
-out_quota:
-	gfs2_quota_unhold(ip);
-out_alloc:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-struct ea_list {
-	struct gfs2_ea_request *ei_er;
-	unsigned int ei_size;
-};
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
-	switch (ea->ea_type) {
-	case GFS2_EATYPE_USR:
-		return 5 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SYS:
-		return 7 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SECURITY:
-		return 9 + ea->ea_name_len + 1;
-	default:
-		return 0;
-	}
-}
-
-static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
-		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-		     void *private)
-{
-	struct ea_list *ei = private;
-	struct gfs2_ea_request *er = ei->ei_er;
-	unsigned int ea_size = gfs2_ea_strlen(ea);
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED)
-		return 0;
-
-	if (er->er_data_len) {
-		char *prefix = NULL;
-		unsigned int l = 0;
-		char c = 0;
-
-		if (ei->ei_size + ea_size > er->er_data_len)
-			return -ERANGE;
-
-		switch (ea->ea_type) {
-		case GFS2_EATYPE_USR:
-			prefix = "user.";
-			l = 5;
-			break;
-		case GFS2_EATYPE_SYS:
-			prefix = "system.";
-			l = 7;
-			break;
-		case GFS2_EATYPE_SECURITY:
-			prefix = "security.";
-			l = 9;
-			break;
-		}
-
-		BUG_ON(l == 0);
-
-		memcpy(er->er_data + ei->ei_size, prefix, l);
-		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
-		       ea->ea_name_len);
-		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
-	}
-
-	ei->ei_size += ea_size;
-
-	return 0;
-}
-
-/**
- * gfs2_listxattr - List gfs2 extended attributes
- * @dentry: The dentry whose inode we are interested in
- * @buffer: The buffer to write the results
- * @size: The size of the buffer
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_ea_request er;
-	struct gfs2_holder i_gh;
-	int error;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	if (size) {
-		er.er_data = buffer;
-		er.er_data_len = size;
-	}
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return error;
-
-	if (ip->i_eattr) {
-		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
-
-		error = ea_foreach(ip, ea_list_i, &ei);
-		if (!error)
-			error = ei.ei_size;
-	}
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
-/**
- * ea_get_unstuffed - actually copies the unstuffed data into the
- *                    request buffer
- * @ip: The GFS2 inode
- * @ea: The extended attribute header structure
- * @data: The data to be copied
- *
- * Returns: errno
- */
-
-static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
-			    char *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head **bh;
-	unsigned int amount = GFS2_EA_DATA_LEN(ea);
-	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
-	unsigned int x;
-	int error = 0;
-
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!bh)
-		return -ENOMEM;
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
-				       bh + x);
-		if (error) {
-			while (x--)
-				brelse(bh[x]);
-			goto out;
-		}
-		dataptrs++;
-	}
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_wait(sdp, bh[x]);
-		if (error) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			goto out;
-		}
-		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			error = -EIO;
-			goto out;
-		}
-
-		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
-
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
-
-		brelse(bh[x]);
-	}
-
-out:
-	kfree(bh);
-	return error;
-}
-
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data, size_t size)
-{
-	int ret;
-	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
-	if (len > size)
-		return -ERANGE;
-
-	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
-		return len;
-	}
-	ret = ea_get_unstuffed(ip, el->el_ea, data);
-	if (ret < 0)
-		return ret;
-	return len;
-}
-
-/**
- * gfs2_xattr_get - Get a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of extended attribute
- * @name: The name of the extended attribute
- * @buffer: The buffer to write the result into
- * @size: The size of the buffer
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_xattr_get(struct inode *inode, int type, const char *name,
-		   void *buffer, size_t size)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr)
-		return -ENODATA;
-	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-	if (!el.el_ea)
-		return -ENODATA;
-	if (size) 
-		error = gfs2_ea_get_copy(ip, &el, buffer, size);
-	else
-		error = GFS2_EA_DATA_LEN(el.el_ea);
-	brelse(el.el_bh);
-
-	return error;
-}
-
-/**
- * ea_alloc_blk - allocates a new block for extended attributes.
- * @ip: A pointer to the inode that's getting extended attributes
- * @bhp: Pointer to pointer to a struct buffer_head
- *
- * Returns: errno
- */
-
-static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_ea_header *ea;
-	unsigned int n = 1;
-	u64 block;
-	int error;
-
-	error = gfs2_alloc_block(ip, &block, &n);
-	if (error)
-		return error;
-	gfs2_trans_add_unrevoke(sdp, block, 1);
-	*bhp = gfs2_meta_new(ip->i_gl, block);
-	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
-	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
-	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
-
-	ea = GFS2_EA_BH2FIRST(*bhp);
-	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
-	ea->ea_type = GFS2_EATYPE_UNUSED;
-	ea->ea_flags = GFS2_EAFLAG_LAST;
-	ea->ea_num_ptrs = 0;
-
-	gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-	return 0;
-}
-
-/**
- * ea_write - writes the request info to an ea, creating new blocks if
- *            necessary
- * @ip: inode that is being modified
- * @ea: the location of the new ea in a block
- * @er: the write request
- *
- * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
- *
- * returns : errno
- */
-
-static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
-		    struct gfs2_ea_request *er)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	int error;
-
-	ea->ea_data_len = cpu_to_be32(er->er_data_len);
-	ea->ea_name_len = er->er_name_len;
-	ea->ea_type = er->er_type;
-	ea->__pad = 0;
-
-	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
-
-	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
-		ea->ea_num_ptrs = 0;
-		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
-	} else {
-		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
-		const char *data = er->er_data;
-		unsigned int data_len = er->er_data_len;
-		unsigned int copy;
-		unsigned int x;
-
-		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
-		for (x = 0; x < ea->ea_num_ptrs; x++) {
-			struct buffer_head *bh;
-			u64 block;
-			int mh_size = sizeof(struct gfs2_meta_header);
-			unsigned int n = 1;
-
-			error = gfs2_alloc_block(ip, &block, &n);
-			if (error)
-				return error;
-			gfs2_trans_add_unrevoke(sdp, block, 1);
-			bh = gfs2_meta_new(ip->i_gl, block);
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
-
-			gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
-							   data_len;
-			memcpy(bh->b_data + mh_size, data, copy);
-			if (copy < sdp->sd_jbsize)
-				memset(bh->b_data + mh_size + copy, 0,
-				       sdp->sd_jbsize - copy);
-
-			*dataptr++ = cpu_to_be64(bh->b_blocknr);
-			data += copy;
-			data_len -= copy;
-
-			brelse(bh);
-		}
-
-		gfs2_assert_withdraw(sdp, !data_len);
-	}
-
-	return 0;
-}
-
-typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
-				   struct gfs2_ea_request *er, void *private);
-
-static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-			     unsigned int blks,
-			     ea_skeleton_call_t skeleton_call, void *private)
-{
-	struct gfs2_alloc *al;
-	struct buffer_head *dibh;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_lock_check(ip);
-	if (error)
-		goto out;
-
-	al->al_requested = blks;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
-				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	error = skeleton_call(ip, er, private);
-	if (error)
-		goto out_end_trans;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-out_end_trans:
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-		     void *private)
-{
-	struct buffer_head *bh;
-	int error;
-
-	error = ea_alloc_blk(ip, &bh);
-	if (error)
-		return error;
-
-	ip->i_eattr = bh->b_blocknr;
-	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
-
-	brelse(bh);
-
-	return error;
-}
-
-/**
- * ea_init - initializes a new eattr block
- * @ip:
- * @er:
- *
- * Returns: errno
- */
-
-static int ea_init(struct gfs2_inode *ip, int type, const char *name,
-		   const void *data, size_t size)
-{
-	struct gfs2_ea_request er;
-	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
-	unsigned int blks = 1;
-
-	er.er_type = type;
-	er.er_name = name;
-	er.er_name_len = strlen(name);
-	er.er_data = (void *)data;
-	er.er_data_len = size;
-
-	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
-		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
-
-	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
-}
-
-static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
-{
-	u32 ea_size = GFS2_EA_SIZE(ea);
-	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
-				     ea_size);
-	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
-	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
-
-	ea->ea_rec_len = cpu_to_be32(ea_size);
-	ea->ea_flags ^= last;
-
-	new->ea_rec_len = cpu_to_be32(new_size);
-	new->ea_flags = last;
-
-	return new;
-}
-
-static void ea_set_remove_stuffed(struct gfs2_inode *ip,
-				  struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_header *ea = el->el_ea;
-	struct gfs2_ea_header *prev = el->el_prev;
-	u32 len;
-
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-
-	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-		return;
-	} else if (GFS2_EA2NEXT(prev) != ea) {
-		prev = GFS2_EA2NEXT(prev);
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
-	}
-
-	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-	prev->ea_rec_len = cpu_to_be32(len);
-
-	if (GFS2_EA_IS_LAST(ea))
-		prev->ea_flags |= GFS2_EAFLAG_LAST;
-}
-
-struct ea_set {
-	int ea_split;
-
-	struct gfs2_ea_request *es_er;
-	struct gfs2_ea_location *es_el;
-
-	struct buffer_head *es_bh;
-	struct gfs2_ea_header *es_ea;
-};
-
-static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
-				 struct gfs2_ea_header *ea, struct ea_set *es)
-{
-	struct gfs2_ea_request *er = es->es_er;
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
-	if (error)
-		return error;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-
-	if (es->ea_split)
-		ea = ea_split_ea(ea);
-
-	ea_write(ip, ea, er);
-
-	if (es->es_el)
-		ea_set_remove_stuffed(ip, es->es_el);
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out;
-	ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-out:
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-	return error;
-}
-
-static int ea_set_simple_alloc(struct gfs2_inode *ip,
-			       struct gfs2_ea_request *er, void *private)
-{
-	struct ea_set *es = private;
-	struct gfs2_ea_header *ea = es->es_ea;
-	int error;
-
-	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
-
-	if (es->ea_split)
-		ea = ea_split_ea(ea);
-
-	error = ea_write(ip, ea, er);
-	if (error)
-		return error;
-
-	if (es->es_el)
-		ea_set_remove_stuffed(ip, es->es_el);
-
-	return 0;
-}
-
-static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
-			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-			 void *private)
-{
-	struct ea_set *es = private;
-	unsigned int size;
-	int stuffed;
-	int error;
-
-	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
-			       es->es_er->er_data_len, &size);
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
-		if (GFS2_EA_REC_LEN(ea) < size)
-			return 0;
-		if (!GFS2_EA_IS_STUFFED(ea)) {
-			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
-			if (error)
-				return error;
-		}
-		es->ea_split = 0;
-	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
-		es->ea_split = 1;
-	else
-		return 0;
-
-	if (stuffed) {
-		error = ea_set_simple_noalloc(ip, bh, ea, es);
-		if (error)
-			return error;
-	} else {
-		unsigned int blks;
-
-		es->es_bh = bh;
-		es->es_ea = ea;
-		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
-					GFS2_SB(&ip->i_inode)->sd_jbsize);
-
-		error = ea_alloc_skeleton(ip, es->es_er, blks,
-					  ea_set_simple_alloc, es);
-		if (error)
-			return error;
-	}
-
-	return 1;
-}
-
-static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-			void *private)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *indbh, *newbh;
-	__be64 *eablk;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
-		__be64 *end;
-
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
-				       &indbh);
-		if (error)
-			return error;
-
-		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
-			error = -EIO;
-			goto out;
-		}
-
-		eablk = (__be64 *)(indbh->b_data + mh_size);
-		end = eablk + sdp->sd_inptrs;
-
-		for (; eablk < end; eablk++)
-			if (!*eablk)
-				break;
-
-		if (eablk == end) {
-			error = -ENOSPC;
-			goto out;
-		}
-
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-	} else {
-		u64 blk;
-		unsigned int n = 1;
-		error = gfs2_alloc_block(ip, &blk, &n);
-		if (error)
-			return error;
-		gfs2_trans_add_unrevoke(sdp, blk, 1);
-		indbh = gfs2_meta_new(ip->i_gl, blk);
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-		gfs2_buffer_clear_tail(indbh, mh_size);
-
-		eablk = (__be64 *)(indbh->b_data + mh_size);
-		*eablk = cpu_to_be64(ip->i_eattr);
-		ip->i_eattr = blk;
-		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
-		gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-		eablk++;
-	}
-
-	error = ea_alloc_blk(ip, &newbh);
-	if (error)
-		goto out;
-
-	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
-	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
-	brelse(newbh);
-	if (error)
-		goto out;
-
-	if (private)
-		ea_set_remove_stuffed(ip, private);
-
-out:
-	brelse(indbh);
-	return error;
-}
-
-static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
-		    const void *value, size_t size, struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_request er;
-	struct ea_set es;
-	unsigned int blks = 2;
-	int error;
-
-	er.er_type = type;
-	er.er_name = name;
-	er.er_data = (void *)value;
-	er.er_name_len = strlen(name);
-	er.er_data_len = size;
-
-	memset(&es, 0, sizeof(struct ea_set));
-	es.es_er = &er;
-	es.es_el = el;
-
-	error = ea_foreach(ip, ea_set_simple, &es);
-	if (error > 0)
-		return 0;
-	if (error)
-		return error;
-
-	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
-		blks++;
-	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
-		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
-
-	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
-}
-
-static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
-				   struct gfs2_ea_location *el)
-{
-	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
-		el->el_prev = GFS2_EA2NEXT(el->el_prev);
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
-	}
-
-	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_header *ea = el->el_ea;
-	struct gfs2_ea_header *prev = el->el_prev;
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
-	if (error)
-		return error;
-
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-
-	if (prev) {
-		u32 len;
-
-		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-		prev->ea_rec_len = cpu_to_be32(len);
-
-		if (GFS2_EA_IS_LAST(ea))
-			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-
-	return error;
-}
-
-/**
- * gfs2_xattr_remove - Remove a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of the extended attribute
- * @name: The name of the extended attribute
- *
- * This is not called directly by the VFS since we use the (common)
- * scheme of making a "set with NULL data" mean a remove request. Note
- * that this is different from a set with zero length data.
- *
- * Returns: 0, or errno on failure
- */
-
-static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr)
-		return -ENODATA;
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-	if (!el.el_ea)
-		return -ENODATA;
-
-	if (GFS2_EA_IS_STUFFED(el.el_ea))
-		error = ea_remove_stuffed(ip, &el);
-	else
-		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
-
-	brelse(el.el_bh);
-
-	return error;
-}
-
-/**
- * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of the extended attribute
- * @name: The name of the extended attribute
- * @value: The value of the extended attribute (NULL for remove)
- * @size: The size of the @value argument
- * @flags: Create or Replace
- *
- * See gfs2_xattr_remove() for details of the removal of xattrs.
- *
- * Returns: 0 or errno on failure
- */
-
-int gfs2_xattr_set(struct inode *inode, int type, const char *name,
-		   const void *value, size_t size, int flags)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	unsigned int namel = strlen(name);
-	int error;
-
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return -EPERM;
-	if (namel > GFS2_EA_MAX_NAME_LEN)
-		return -ERANGE;
-
-	if (value == NULL)
-		return gfs2_xattr_remove(inode, type, name);
-
-	if (ea_check_size(sdp, namel, size))
-		return -ERANGE;
-
-	if (!ip->i_eattr) {
-		if (flags & XATTR_REPLACE)
-			return -ENODATA;
-		return ea_init(ip, type, name, value, size);
-	}
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-
-	if (el.el_ea) {
-		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
-			brelse(el.el_bh);
-			return -EPERM;
-		}
-
-		error = -EEXIST;
-		if (!(flags & XATTR_CREATE)) {
-			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
-			error = ea_set_i(ip, type, name, value, size, &el);
-			if (!error && unstuffed)
-				ea_set_remove_unstuffed(ip, &el);
-		}
-
-		brelse(el.el_bh);
-		return error;
-	}
-
-	error = -ENODATA;
-	if (!(flags & XATTR_REPLACE))
-		error = ea_set_i(ip, type, name, value, size, NULL);
-
-	return error;
-}
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
-				  struct gfs2_ea_header *ea, char *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head **bh;
-	unsigned int amount = GFS2_EA_DATA_LEN(ea);
-	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
-	unsigned int x;
-	int error;
-
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!bh)
-		return -ENOMEM;
-
-	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
-	if (error)
-		goto out;
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
-				       bh + x);
-		if (error) {
-			while (x--)
-				brelse(bh[x]);
-			goto fail;
-		}
-		dataptrs++;
-	}
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_wait(sdp, bh[x]);
-		if (error) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			goto fail;
-		}
-		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			error = -EIO;
-			goto fail;
-		}
-
-		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
-
-		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
-
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
-
-		brelse(bh[x]);
-	}
-
-out:
-	kfree(bh);
-	return error;
-
-fail:
-	gfs2_trans_end(sdp);
-	kfree(bh);
-	return error;
-}
-
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		      struct iattr *attr, char *data)
-{
-	struct buffer_head *dibh;
-	int error;
-
-	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
-		if (error)
-			return error;
-
-		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-		memcpy(GFS2_EA2DATA(el->el_ea), data,
-		       GFS2_EA_DATA_LEN(el->el_ea));
-	} else
-		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
-
-	if (error)
-		return error;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-
-	return error;
-}
-
-static int ea_dealloc_indirect(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct buffer_head *indbh, *dibh;
-	__be64 *eablk, *end;
-	unsigned int rg_blocks = 0;
-	u64 bstart = 0;
-	unsigned int blen = 0;
-	unsigned int blks = 0;
-	unsigned int x;
-	int error;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
-	if (error)
-		return error;
-
-	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
-		error = -EIO;
-		goto out;
-	}
-
-	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
-	end = eablk + sdp->sd_inptrs;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(sdp, &rlist, bstart);
-			bstart = bn;
-			blen = 1;
-		}
-		blks++;
-	}
-	if (bstart)
-		gfs2_rlist_add(sdp, &rlist, bstart);
-	else
-		goto out;
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist_free;
-
-	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
-				 RES_STATFS + RES_QUOTA, blks);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-
-	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
-	bstart = 0;
-	blen = 0;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
-			bstart = bn;
-			blen = 1;
-		}
-
-		*eablk = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
-
-	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist_free:
-	gfs2_rlist_free(&rlist);
-out:
-	brelse(indbh);
-	return error;
-}
-
-static int ea_dealloc_block(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd;
-	struct buffer_head *dibh;
-	int error;
-
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
-	if (!rgd) {
-		gfs2_consist_inode(ip);
-		return -EIO;
-	}
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
-				   &al->al_rgd_gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
-				 RES_QUOTA, 1);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_free_meta(ip, ip->i_eattr, 1);
-
-	ip->i_eattr = 0;
-	gfs2_add_inode_blocks(&ip->i_inode, -1);
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	return error;
-}
-
-/**
- * gfs2_ea_dealloc - deallocate the extended attribute fork
- * @ip: the inode
- *
- * Returns: errno
- */
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip)
-{
-	struct gfs2_alloc *al;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out_alloc;
-
-	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-	if (error)
-		goto out_quota;
-
-	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
-	if (error)
-		goto out_rindex;
-
-	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
-		error = ea_dealloc_indirect(ip);
-		if (error)
-			goto out_rindex;
-	}
-
-	error = ea_dealloc_block(ip);
-
-out_rindex:
-	gfs2_glock_dq_uninit(&al->al_ri_gh);
-out_quota:
-	gfs2_quota_unhold(ip);
-out_alloc:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-static int gfs2_xattr_user_get(struct inode *inode, const char *name,
-			       void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
-}
-
-static int gfs2_xattr_user_set(struct inode *inode, const char *name,
-			       const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
-}
-
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
-				 void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
-				 const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
-
-static int gfs2_xattr_security_get(struct inode *inode, const char *name,
-				   void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
-}
-
-static int gfs2_xattr_security_set(struct inode *inode, const char *name,
-				   const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
-}
-
-static struct xattr_handler gfs2_xattr_user_handler = {
-	.prefix = XATTR_USER_PREFIX,
-	.get    = gfs2_xattr_user_get,
-	.set    = gfs2_xattr_user_set,
-};
-
-static struct xattr_handler gfs2_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get    = gfs2_xattr_security_get,
-	.set    = gfs2_xattr_security_set,
-};
-
-static struct xattr_handler gfs2_xattr_system_handler = {
-	.prefix = XATTR_SYSTEM_PREFIX,
-	.get    = gfs2_xattr_system_get,
-	.set    = gfs2_xattr_system_set,
-};
-
-struct xattr_handler *gfs2_xattr_handlers[] = {
-	&gfs2_xattr_user_handler,
-	&gfs2_xattr_security_handler,
-	&gfs2_xattr_system_handler,
-	NULL,
-};
-
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
deleted file mode 100644
index 4040a188f63b..000000000000
--- a/fs/gfs2/eattr.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EATTR_DOT_H__
-#define __EATTR_DOT_H__
-
-struct gfs2_inode;
-struct iattr;
-
-#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
-#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
-
-#define GFS2_EA_SIZE(ea) \
-ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
-      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
-                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
-
-#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
-#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
-
-#define GFS2_EAREQ_SIZE_STUFFED(er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
-
-#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
-#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
-
-#define GFS2_EA2DATAPTRS(ea) \
-((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
-
-#define GFS2_EA2NEXT(ea) \
-((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
-
-#define GFS2_EA_BH2FIRST(bh) \
-((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
-
-struct gfs2_ea_request {
-	const char *er_name;
-	char *er_data;
-	unsigned int er_name_len;
-	unsigned int er_data_len;
-	unsigned int er_type; /* GFS2_EATYPE_... */
-};
-
-struct gfs2_ea_location {
-	struct buffer_head *el_bh;
-	struct gfs2_ea_header *el_ea;
-	struct gfs2_ea_header *el_prev;
-};
-
-extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
-			  void *buffer, size_t size);
-extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
-			  const void *value, size_t size, int flags);
-extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
-
-/* Exported to acl.c */
-
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-			struct gfs2_ea_location *el);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			    char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			     struct iattr *attr, char *data);
-
-#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f936d2d68138..4f5e442d60d8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1ee0c2657467..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,7 +26,7 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7a5c128e8776..2e78a3f68a76 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
 #include "trans.h"
 #include "util.h"
 #include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
 
 #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
new file mode 100644
index 000000000000..34ae9ba4c9fe
--- /dev/null
+++ b/fs/gfs2/xattr.c
@@ -0,0 +1,1558 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the acutal number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp:
+ * @er:
+ * @size:
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
+			unsigned int *size)
+{
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
+		return 1;
+
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
+
+	return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
+{
+	unsigned int size;
+
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, nsize, dsize, &size);
+
+	/* This can only happen with 512 byte blocks */
+	if (size > sdp->sd_jbsize)
+		return -ERANGE;
+
+	return 0;
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+			  struct gfs2_ea_header *ea,
+			  struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+			ea_call_t ea_call, void *data)
+{
+	struct gfs2_ea_header *ea, *prev = NULL;
+	int error = 0;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+		return -EIO;
+
+	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+		if (!GFS2_EA_REC_LEN(ea))
+			goto fail;
+		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+						  bh->b_data + bh->b_size))
+			goto fail;
+		if (!GFS2_EATYPE_VALID(ea->ea_type))
+			goto fail;
+
+		error = ea_call(ip, bh, ea, prev, data);
+		if (error)
+			return error;
+
+		if (GFS2_EA_IS_LAST(ea)) {
+			if ((char *)GFS2_EA2NEXT(ea) !=
+			    bh->b_data + bh->b_size)
+				goto fail;
+			break;
+		}
+	}
+
+	return error;
+
+fail:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+	struct buffer_head *bh, *eabh;
+	__be64 *eablk, *end;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
+		error = ea_foreach_i(ip, bh, ea_call, data);
+		goto out;
+	}
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		if (error)
+			break;
+		error = ea_foreach_i(ip, eabh, ea_call, data);
+		brelse(eabh);
+		if (error)
+			break;
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+struct ea_find {
+	int type;
+	const char *name;
+	size_t namel;
+	struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_find *ef = private;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
+			struct gfs2_ea_location *el = ef->ef_el;
+			get_bh(bh);
+			el->el_bh = bh;
+			el->el_ea = ea;
+			el->el_prev = prev;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+		 struct gfs2_ea_location *el)
+{
+	struct ea_find ef;
+	int error;
+
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
+	ef.ef_el = el;
+
+	memset(el, 0, sizeof(struct gfs2_ea_location));
+
+	error = ea_foreach(ip, ea_find_i, &ef);
+	if (error > 0)
+		return 0;
+
+	return error;
+}
+
+/**
+ * ea_dealloc_unstuffed -
+ * @ip:
+ * @bh:
+ * @ea:
+ * @prev:
+ * @private:
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+				struct gfs2_ea_header *ea,
+				struct gfs2_ea_header *prev, void *private)
+{
+	int *leave = private;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rg_gh;
+	struct buffer_head *dibh;
+	__be64 *dataptrs;
+	u64 bn = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(ea))
+		return 0;
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (*dataptrs) {
+			blks++;
+			bn = be64_to_cpu(*dataptrs);
+		}
+	}
+	if (!blks)
+		return 0;
+
+	rgd = gfs2_blk2rgrpd(sdp, bn);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
+				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (!*dataptrs)
+			break;
+		bn = be64_to_cpu(*dataptrs);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*dataptrs = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	if (prev && !leave) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		ea->ea_num_ptrs = 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&rg_gh);
+	return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+struct ea_list {
+	struct gfs2_ea_request *ei_er;
+	unsigned int ei_size;
+};
+
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_list *ei = private;
+	struct gfs2_ea_request *er = ei->ei_er;
+	unsigned int ea_size = gfs2_ea_strlen(ea);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (er->er_data_len) {
+		char *prefix = NULL;
+		unsigned int l = 0;
+		char c = 0;
+
+		if (ei->ei_size + ea_size > er->er_data_len)
+			return -ERANGE;
+
+		switch (ea->ea_type) {
+		case GFS2_EATYPE_USR:
+			prefix = "user.";
+			l = 5;
+			break;
+		case GFS2_EATYPE_SYS:
+			prefix = "system.";
+			l = 7;
+			break;
+		case GFS2_EATYPE_SECURITY:
+			prefix = "security.";
+			l = 9;
+			break;
+		}
+
+		BUG_ON(l == 0);
+
+		memcpy(er->er_data + ei->ei_size, prefix, l);
+		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+		       ea->ea_name_len);
+		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+	}
+
+	ei->ei_size += ea_size;
+
+	return 0;
+}
+
+/**
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_ea_request er;
+	struct gfs2_holder i_gh;
+	int error;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	if (ip->i_eattr) {
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
+
+		error = ea_foreach(ip, ea_list_i, &ei);
+		if (!error)
+			error = ei.ei_size;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: The data to be copied
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			    char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto out;
+		}
+
+		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		     char *data, size_t size)
+{
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	if (ret < 0)
+		return ret;
+	return len;
+}
+
+/**
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+		   void *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+	if (size) 
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
+		error = GFS2_EA_DATA_LEN(el.el_ea);
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea;
+	unsigned int n = 1;
+	u64 block;
+	int error;
+
+	error = gfs2_alloc_block(ip, &block, &n);
+	if (error)
+		return error;
+	gfs2_trans_add_unrevoke(sdp, block, 1);
+	*bhp = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(*bhp);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+	ea->ea_num_ptrs = 0;
+
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+	return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
+ *
+ * returns : errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+		    struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
+
+	ea->ea_data_len = cpu_to_be32(er->er_data_len);
+	ea->ea_name_len = er->er_name_len;
+	ea->ea_type = er->er_type;
+	ea->__pad = 0;
+
+	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+		ea->ea_num_ptrs = 0;
+		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+	} else {
+		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		const char *data = er->er_data;
+		unsigned int data_len = er->er_data_len;
+		unsigned int copy;
+		unsigned int x;
+
+		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+		for (x = 0; x < ea->ea_num_ptrs; x++) {
+			struct buffer_head *bh;
+			u64 block;
+			int mh_size = sizeof(struct gfs2_meta_header);
+			unsigned int n = 1;
+
+			error = gfs2_alloc_block(ip, &block, &n);
+			if (error)
+				return error;
+			gfs2_trans_add_unrevoke(sdp, block, 1);
+			bh = gfs2_meta_new(ip->i_gl, block);
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+			gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+							   data_len;
+			memcpy(bh->b_data + mh_size, data, copy);
+			if (copy < sdp->sd_jbsize)
+				memset(bh->b_data + mh_size + copy, 0,
+				       sdp->sd_jbsize - copy);
+
+			*dataptr++ = cpu_to_be64(bh->b_blocknr);
+			data += copy;
+			data_len -= copy;
+
+			brelse(bh);
+		}
+
+		gfs2_assert_withdraw(sdp, !data_len);
+	}
+
+	return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+				   struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			     unsigned int blks,
+			     ea_skeleton_call_t skeleton_call, void *private)
+{
+	struct gfs2_alloc *al;
+	struct buffer_head *dibh;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_lock_check(ip);
+	if (error)
+		goto out;
+
+	al->al_requested = blks;
+
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+				 blks + al->al_rgd->rd_length +
+				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	error = skeleton_call(ip, er, private);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+out_end_trans:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		     void *private)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = ea_alloc_blk(ip, &bh);
+	if (error)
+		return error;
+
+	ip->i_eattr = bh->b_blocknr;
+	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip:
+ * @er:
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
+{
+	struct gfs2_ea_request er;
+	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+	unsigned int blks = 1;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+	u32 ea_size = GFS2_EA_SIZE(ea);
+	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+				     ea_size);
+	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+	ea->ea_rec_len = cpu_to_be32(ea_size);
+	ea->ea_flags ^= last;
+
+	new->ea_rec_len = cpu_to_be32(new_size);
+	new->ea_flags = last;
+
+	return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	u32 len;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		return;
+	} else if (GFS2_EA2NEXT(prev) != ea) {
+		prev = GFS2_EA2NEXT(prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+	}
+
+	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+	prev->ea_rec_len = cpu_to_be32(len);
+
+	if (GFS2_EA_IS_LAST(ea))
+		prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+	int ea_split;
+
+	struct gfs2_ea_request *es_er;
+	struct gfs2_ea_location *es_el;
+
+	struct buffer_head *es_bh;
+	struct gfs2_ea_header *es_ea;
+};
+
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+				 struct gfs2_ea_header *ea, struct ea_set *es)
+{
+	struct gfs2_ea_request *er = es->es_er;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	ea_write(ip, ea, er);
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+out:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+			       struct gfs2_ea_request *er, void *private)
+{
+	struct ea_set *es = private;
+	struct gfs2_ea_header *ea = es->es_ea;
+	int error;
+
+	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	error = ea_write(ip, ea, er);
+	if (error)
+		return error;
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+			 void *private)
+{
+	struct ea_set *es = private;
+	unsigned int size;
+	int stuffed;
+	int error;
+
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+		if (GFS2_EA_REC_LEN(ea) < size)
+			return 0;
+		if (!GFS2_EA_IS_STUFFED(ea)) {
+			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+			if (error)
+				return error;
+		}
+		es->ea_split = 0;
+	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+		es->ea_split = 1;
+	else
+		return 0;
+
+	if (stuffed) {
+		error = ea_set_simple_noalloc(ip, bh, ea, es);
+		if (error)
+			return error;
+	} else {
+		unsigned int blks;
+
+		es->es_bh = bh;
+		es->es_ea = ea;
+		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+					GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+		error = ea_alloc_skeleton(ip, es->es_er, blks,
+					  ea_set_simple_alloc, es);
+		if (error)
+			return error;
+	}
+
+	return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *indbh, *newbh;
+	__be64 *eablk;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		__be64 *end;
+
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+				       &indbh);
+		if (error)
+			return error;
+
+		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+			error = -EIO;
+			goto out;
+		}
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		end = eablk + sdp->sd_inptrs;
+
+		for (; eablk < end; eablk++)
+			if (!*eablk)
+				break;
+
+		if (eablk == end) {
+			error = -ENOSPC;
+			goto out;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+	} else {
+		u64 blk;
+		unsigned int n = 1;
+		error = gfs2_alloc_block(ip, &blk, &n);
+		if (error)
+			return error;
+		gfs2_trans_add_unrevoke(sdp, blk, 1);
+		indbh = gfs2_meta_new(ip->i_gl, blk);
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(indbh, mh_size);
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+		eablk++;
+	}
+
+	error = ea_alloc_blk(ip, &newbh);
+	if (error)
+		goto out;
+
+	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
+	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+	brelse(newbh);
+	if (error)
+		goto out;
+
+	if (private)
+		ea_set_remove_stuffed(ip, private);
+
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_request er;
+	struct ea_set es;
+	unsigned int blks = 2;
+	int error;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
+	memset(&es, 0, sizeof(struct ea_set));
+	es.es_er = &er;
+	es.es_el = el;
+
+	error = ea_foreach(ip, ea_set_simple, &es);
+	if (error > 0)
+		return 0;
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
+		blks++;
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+				   struct gfs2_ea_location *el)
+{
+	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+		el->el_prev = GFS2_EA2NEXT(el->el_prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+	}
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (prev) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea))
+		error = ea_remove_stuffed(ip, &el);
+	else
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
+ *
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
+ */
+
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
+	int error;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
+
+	if (value == NULL)
+		return gfs2_xattr_remove(inode, type, name);
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
+
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
+
+	return error;
+}
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_header *ea, char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+	if (error)
+		goto out;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto fail;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto fail;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto fail;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+
+		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+
+fail:
+	gfs2_trans_end(sdp);
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		      struct iattr *attr, char *data)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+		if (error)
+			return error;
+
+		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+		memcpy(GFS2_EA2DATA(el->el_ea), data,
+		       GFS2_EA_DATA_LEN(el->el_ea));
+	} else
+		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		error = inode_setattr(&ip->i_inode, attr);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *indbh, *dibh;
+	__be64 *eablk, *end;
+	unsigned int rg_blocks = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + sdp->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(sdp, &rlist, bstart);
+			bstart = bn;
+			blen = 1;
+		}
+		blks++;
+	}
+	if (bstart)
+		gfs2_rlist_add(sdp, &rlist, bstart);
+	else
+		goto out;
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist_free;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+				 RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	bstart = 0;
+	blen = 0;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*eablk = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+	gfs2_rlist_free(&rlist);
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *dibh;
+	int error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+				   &al->al_rgd_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+				 RES_QUOTA, 1);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_free_meta(ip, ip->i_eattr, 1);
+
+	ip->i_eattr = 0;
+	gfs2_add_inode_blocks(&ip->i_inode, -1);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+	return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+	if (error)
+		goto out_rindex;
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		error = ea_dealloc_indirect(ip);
+		if (error)
+			goto out_rindex;
+	}
+
+	error = ea_dealloc_block(ip);
+
+out_rindex:
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+			       void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+			       const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+				 void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get    = gfs2_xattr_user_get,
+	.set    = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get    = gfs2_xattr_security_get,
+	.set    = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.get    = gfs2_xattr_system_get,
+	.set    = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	&gfs2_xattr_system_handler,
+	NULL,
+};
+
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
new file mode 100644
index 000000000000..4040a188f63b
--- /dev/null
+++ b/fs/gfs2/xattr.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+struct gfs2_ea_request {
+	const char *er_name;
+	char *er_data;
+	unsigned int er_name_len;
+	unsigned int er_data_len;
+	unsigned int er_type; /* GFS2_EATYPE_... */
+};
+
+struct gfs2_ea_location {
+	struct buffer_head *el_bh;
+	struct gfs2_ea_header *el_ea;
+	struct gfs2_ea_header *el_prev;
+};
+
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+			  void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+			  const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			     struct iattr *attr, char *data);
+
+#endif /* __EATTR_DOT_H__ */
-- 
cgit v1.2.2


From 8d8291ae93ecb4a246e87e452d55cca412373300 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 27 Aug 2009 15:51:07 +0100
Subject: GFS2: Remove no_formal_ino generating code

The inum structure used throughout GFS2 has two fields. One
no_addr is the disk block number of the inode in question and
is used everywhere as the inode number. The other, no_formal_ino,
is used only as the generation number for NFS.

Historically the no_formal_ino field was set using a complicated
system of one global and one per-node file containing inode numbers
in order to ensure that each no_formal_ino was unique. Also this
code made no provision for what would happen when eventually the
(64 bit) numbers ran out. Now I know that is pretty unlikely to
happen given the large space of numbers, but it is possible
nevertheless.

The only guarantee required for no_formal_ino is that, for any
single inode, the same number doesn't get reused too quickly.

We already have a generation number which is kept in the inode
and initialised from a counter in the resource group (almost
no overhead, since we have to touch the resource group anyway
in order to allocate an inode in the first place). Aside from
ensuring that we never use the value 0 in the no_formal_ino
field, we can use that counter directly.

As a result of that change, we lose about 200 lines of code and
also gain about 10 creates/sec on the postmark benchmark (on
my test machine).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |   7 ---
 fs/gfs2/inode.c      | 143 ++-------------------------------------------------
 fs/gfs2/ops_fstype.c |  45 ++--------------
 fs/gfs2/rgrp.c       |   2 +
 fs/gfs2/super.c      |   3 --
 5 files changed, 10 insertions(+), 190 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1d11e6e0373e..52436abc3b4d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -548,18 +548,12 @@ struct gfs2_sbd {
 	struct dentry *sd_root_dir;
 
 	struct inode *sd_jindex;
-	struct inode *sd_inum_inode;
 	struct inode *sd_statfs_inode;
-	struct inode *sd_ir_inode;
 	struct inode *sd_sc_inode;
 	struct inode *sd_qc_inode;
 	struct inode *sd_rindex;
 	struct inode *sd_quota_inode;
 
-	/* Inum stuff */
-
-	struct mutex sd_inum_mutex;
-
 	/* StatFS stuff */
 
 	spinlock_t sd_statfs_spin;
@@ -587,7 +581,6 @@ struct gfs2_sbd {
 	struct gfs2_holder sd_journal_gh;
 	struct gfs2_holder sd_jinode_gh;
 
-	struct gfs2_holder sd_ir_gh;
 	struct gfs2_holder sd_sc_gh;
 	struct gfs2_holder sd_qc_gh;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4f5e442d60d8..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -519,139 +519,6 @@ out:
 	return inode ? inode : ERR_PTR(error);
 }
 
-static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
-{
-	const struct gfs2_inum_range *str = buf;
-
-	ir->ir_start = be64_to_cpu(str->ir_start);
-	ir->ir_length = be64_to_cpu(str->ir_length);
-}
-
-static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
-{
-	struct gfs2_inum_range *str = buf;
-
-	str->ir_start = cpu_to_be64(ir->ir_start);
-	str->ir_length = cpu_to_be64(ir->ir_length);
-}
-
-static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
-	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
-	struct buffer_head *bh;
-	struct gfs2_inum_range_host ir;
-	int error;
-
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (error)
-		return error;
-	mutex_lock(&sdp->sd_inum_mutex);
-
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error) {
-		mutex_unlock(&sdp->sd_inum_mutex);
-		gfs2_trans_end(sdp);
-		return error;
-	}
-
-	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-	if (ir.ir_length) {
-		*formal_ino = ir.ir_start++;
-		ir.ir_length--;
-		gfs2_trans_add_bh(ip->i_gl, bh, 1);
-		gfs2_inum_range_out(&ir,
-				    bh->b_data + sizeof(struct gfs2_dinode));
-		brelse(bh);
-		mutex_unlock(&sdp->sd_inum_mutex);
-		gfs2_trans_end(sdp);
-		return 0;
-	}
-
-	brelse(bh);
-
-	mutex_unlock(&sdp->sd_inum_mutex);
-	gfs2_trans_end(sdp);
-
-	return 1;
-}
-
-static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
-	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
-	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
-	struct gfs2_holder gh;
-	struct buffer_head *bh;
-	struct gfs2_inum_range_host ir;
-	int error;
-
-	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
-	if (error)
-		goto out;
-	mutex_lock(&sdp->sd_inum_mutex);
-
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
-		goto out_end_trans;
-
-	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-	if (!ir.ir_length) {
-		struct buffer_head *m_bh;
-		u64 x, y;
-		__be64 z;
-
-		error = gfs2_meta_inode_buffer(m_ip, &m_bh);
-		if (error)
-			goto out_brelse;
-
-		z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
-		x = y = be64_to_cpu(z);
-		ir.ir_start = x;
-		ir.ir_length = GFS2_INUM_QUANTUM;
-		x += GFS2_INUM_QUANTUM;
-		if (x < y)
-			gfs2_consist_inode(m_ip);
-		z = cpu_to_be64(x);
-		gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-		*(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
-
-		brelse(m_bh);
-	}
-
-	*formal_ino = ir.ir_start++;
-	ir.ir_length--;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-out_brelse:
-	brelse(bh);
-out_end_trans:
-	mutex_unlock(&sdp->sd_inum_mutex);
-	gfs2_trans_end(sdp);
-out:
-	gfs2_glock_dq_uninit(&gh);
-	return error;
-}
-
-static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
-{
-	int error;
-
-	error = pick_formal_ino_1(sdp, inum);
-	if (error <= 0)
-		return error;
-
-	error = pick_formal_ino_2(sdp, inum);
-
-	return error;
-}
-
 /**
  * create_ok - OK to create a new on-disk inode here?
  * @dip:  Directory in which dinode is to be created
@@ -981,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	error = pick_formal_ino(sdp, &inum.no_formal_ino);
-	if (error)
-		goto fail_gunlock;
-
 	error = alloc_dinode(dip, &inum.no_addr, &generation);
 	if (error)
 		goto fail_gunlock;
+	inum.no_formal_ino = generation;
 
 	error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
 				  LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -998,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
-	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
-					inum.no_addr,
-					inum.no_formal_ino, 0);
+	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+				  inum.no_formal_ino, 0);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ec69e63c694..8e3f00e1430d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	gfs2_tune_init(&sdp->sd_tune);
 
-	mutex_init(&sdp->sd_inum_mutex);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
 	spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	if (error)
 		goto fail;
 
-	/* Read in the master inode number inode */
-	sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
-	if (IS_ERR(sdp->sd_inum_inode)) {
-		error = PTR_ERR(sdp->sd_inum_inode);
-		fs_err(sdp, "can't read in inum inode: %d\n", error);
-		goto fail_journal;
-	}
-
-
 	/* Read in the master statfs inode */
 	sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
 	if (IS_ERR(sdp->sd_statfs_inode)) {
 		error = PTR_ERR(sdp->sd_statfs_inode);
 		fs_err(sdp, "can't read in statfs inode: %d\n", error);
-		goto fail_inum;
+		goto fail_journal;
 	}
 
 	/* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
 	iput(sdp->sd_rindex);
 fail_statfs:
 	iput(sdp->sd_statfs_inode);
-fail_inum:
-	iput(sdp->sd_inum_inode);
 fail_journal:
 	init_journal(sdp, UNDO);
 fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 		return error;
 	}
 
-	sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
-	sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
-	if (IS_ERR(sdp->sd_ir_inode)) {
-		error = PTR_ERR(sdp->sd_ir_inode);
-		fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
-		goto fail;
-	}
-
 	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
 	sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
 	if (IS_ERR(sdp->sd_sc_inode)) {
 		error = PTR_ERR(sdp->sd_sc_inode);
 		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
-		goto fail_ir_i;
+		goto fail;
 	}
 
 	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	iput(pn);
 	pn = NULL;
 
-	ip = GFS2_I(sdp->sd_ir_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
-				   &sdp->sd_ir_gh);
-	if (error) {
-		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
-		goto fail_qc_i;
-	}
-
 	ip = GFS2_I(sdp->sd_sc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_sc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
-		goto fail_ir_gh;
+		goto fail_qc_i;
 	}
 
 	ip = GFS2_I(sdp->sd_qc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_qc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
 fail_ut_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
-	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 fail_qc_i:
 	iput(sdp->sd_qc_inode);
 fail_ut_i:
 	iput(sdp->sd_sc_inode);
-fail_ir_i:
-	iput(sdp->sd_ir_inode);
 fail:
 	if (pn)
 		iput(pn);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c681c54fbd60..388a61d12fc8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1567,6 +1567,8 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
+	if (*generation == 0)
+		*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2e78a3f68a76..d95cf777d244 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -788,7 +788,6 @@ restart:
 	/*  Release stuff  */
 
 	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
 	iput(sdp->sd_statfs_inode);
 	iput(sdp->sd_rindex);
 	iput(sdp->sd_quota_inode);
@@ -799,10 +798,8 @@ restart:
 	if (!sdp->sd_args.ar_spectator) {
 		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
 		iput(sdp->sd_sc_inode);
 		iput(sdp->sd_qc_inode);
 	}
-- 
cgit v1.2.2


From a1b08e75dff3dc18a88444803753e667bb1d126e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 27 Aug 2009 14:46:56 +0800
Subject: ocfs2: invalidate dentry if its dentry_lock isn't initialized.

In commit a5a0a630922a2f6a774b6dac19f70cb5abd86bb0, when
ocfs2_attch_dentry_lock fails, we call an extra iput and reset
dentry->d_fsdata to NULL. This resolve a bug, but it isn't
completed and the dentry is still there. When we want to use
it again, ocfs2_dentry_revalidate doesn't catch it and return
true. That make future ocfs2_dentry_lock panic out.
One bug is http://oss.oracle.com/bugzilla/show_bug.cgi?id=1162.

The resolution is to add a check for dentry->d_fsdata in
revalidate process and return false if dentry->d_fsdata is NULL,
so that a new ocfs2_lookup will be called again.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dcache.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2f28b7de2c8d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}
 
+	/*
+	 * If the last lookup failed to create dentry lock, let us
+	 * redo it.
+	 */
+	if (!dentry->d_fsdata) {
+		mlog(0, "Inode %llu doesn't have dentry lock, "
+		     "returning false\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto bail;
+	}
+
 	ret = 1;
 
 bail:
-- 
cgit v1.2.2


From 55ad63bf3a30936aced50f13452735c2f58b234c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 28 Aug 2009 10:40:33 -0400
Subject: ext4: fix extent sanity checking code with AGGRESSIVE_TEST

The extents sanity-checking code depends on the ext4_ext_space_*()
functions returning the maximum alloable size for eh_max; however,
when the debugging #ifdef AGGRESSIVE_TEST is enabled to test the
extent tree handling code, this prevents a normally created ext4
filesystem from being mounted with the errors:

Aug 26 15:43:50 bsd086 kernel: [   96.070277] EXT4-fs error (device sda8): ext4_ext_check_inode: bad header/extent in inode #8: too large eh_max - magic f30a, entries 1, max 4(3), depth 0(0)
Aug 26 15:43:50 bsd086 kernel: [   96.070526] EXT4-fs (sda8): no journal found

Bug reported by Akira Fujita.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 60 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8c20caf4aa5c..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -229,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 	return newblock;
 }
 
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 6)
-		size = 6;
+		if (size > 6)
+			size = 6;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 5)
-		size = 5;
+		if (size > 5)
+			size = 5;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 3)
-		size = 3;
+		if (size > 3)
+			size = 3;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 4)
-		size = 4;
+		if (size > 4)
+			size = 4;
 #endif
+	}
 	return size;
 }
 
@@ -293,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
 	int lcap, icap, rcap, leafs, idxs, num;
 	int newextents = blocks;
 
-	rcap = ext4_ext_space_root_idx(inode);
-	lcap = ext4_ext_space_block(inode);
-	icap = ext4_ext_space_block_idx(inode);
+	rcap = ext4_ext_space_root_idx(inode, 0);
+	lcap = ext4_ext_space_block(inode, 0);
+	icap = ext4_ext_space_block_idx(inode, 0);
 
 	/* number of new leaf blocks needed */
 	num = leafs = (newextents + lcap - 1) / lcap;
@@ -320,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 	if (depth == ext_depth(inode)) {
 		if (depth == 0)
-			max = ext4_ext_space_root(inode);
+			max = ext4_ext_space_root(inode, 1);
 		else
-			max = ext4_ext_space_root_idx(inode);
+			max = ext4_ext_space_root_idx(inode, 1);
 	} else {
 		if (depth == 0)
-			max = ext4_ext_space_block(inode);
+			max = ext4_ext_space_block(inode, 1);
 		else
-			max = ext4_ext_space_block_idx(inode);
+			max = ext4_ext_space_block_idx(inode, 1);
 	}
 
 	return max;
@@ -626,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 	eh->eh_depth = 0;
 	eh->eh_entries = 0;
 	eh->eh_magic = EXT4_EXT_MAGIC;
-	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_ext_invalidate_cache(inode);
 	return 0;
@@ -851,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	neh = ext_block_hdr(bh);
 	neh->eh_entries = 0;
-	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
 	ex = EXT_FIRST_EXTENT(neh);
@@ -927,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		neh = ext_block_hdr(bh);
 		neh->eh_entries = cpu_to_le16(1);
 		neh->eh_magic = EXT4_EXT_MAGIC;
-		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 		neh->eh_depth = cpu_to_le16(depth - i);
 		fidx = EXT_FIRST_INDEX(neh);
 		fidx->ei_block = border;
@@ -1052,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	/* old root could have indexes or leaves
 	 * so calculate e_max right way */
 	if (ext_depth(inode))
-	  neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 	else
-	  neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
@@ -1069,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		goto out;
 
 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
-	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
 	curp->p_hdr->eh_entries = cpu_to_le16(1);
 	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
 
@@ -2348,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		if (err == 0) {
 			ext_inode_hdr(inode)->eh_depth = 0;
 			ext_inode_hdr(inode)->eh_max =
-				cpu_to_le16(ext4_ext_space_root(inode));
+				cpu_to_le16(ext4_ext_space_root(inode, 0));
 			err = ext4_ext_dirty(handle, inode, path);
 		}
 	}
-- 
cgit v1.2.2


From 2c94eb86c66e1eaaa1e7d8a2120f4fad5e7e7736 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 28 Aug 2009 21:43:15 -0400
Subject: ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories

Use EXT4_DIR_LINK_MAX so that rename() can move a directory into new
parent directory without running into the EXT4_LINK_MAX limit.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fea14dbd3c22..ba91bd0f7916 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2421,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
-				new_dir->i_nlink >= EXT4_LINK_MAX)
+		    EXT4_DIR_LINK_MAX(new_dir))
 			goto end_rename;
 	}
 	if (!new_bh) {
-- 
cgit v1.2.2


From b05ab1dc3795e6f997fb0d34f38fce5012533c3e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 29 Aug 2009 21:08:08 -0400
Subject: ext4: Limit number of links that can be created by ext4_link()

In ext4_link we need to check using EXT4_LINK_MAX, and not
EXT4_DIR_LINK_MAX(), since ext4_link() is creating hard links of
regular files, and not directories.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ba91bd0f7916..f27c8164c1fe 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2318,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;
 
-	if (EXT4_DIR_LINK_MAX(inode))
+	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;
 
 	/*
-- 
cgit v1.2.2


From b1f1b8ce0a1d71cbc72f7540134d52b79bd8f5ac Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 30 Aug 2009 04:21:41 +0900
Subject: nilfs2: fix preempt count underflow in
 nilfs_btnode_prepare_change_key

This will fix the following preempt count underflow reported from
users with the title "[NILFS users] segctord problem" (Message-ID:
<949415.6494.qm@web58808.mail.re1.yahoo.com> and Message-ID:
<debc30fc0908270825v747c1734xa59126623cfd5b05@mail.gmail.com>):

 WARNING: at kernel/sched.c:4890 sub_preempt_count+0x95/0xa0()
 Hardware name: HP Compaq 6530b (KR980UT#ABC)
 Modules linked in: bridge stp llc bnep rfcomm l2cap xfs exportfs nilfs2 cowloop loop vboxnetadp vboxnetflt vboxdrv btusb bluetooth uvcvideo videodev v4l1_compat v4l2_compat_ioctl32 arc4 snd_hda_codec_analog ecb iwlagn iwlcore rfkill lib80211 mac80211 snd_hda_intel snd_hda_codec ehci_hcd uhci_hcd usbcore snd_hwdep snd_pcm tg3 cfg80211 psmouse snd_timer joydev libphy ohci1394 snd_page_alloc hp_accel lis3lv02d ieee1394 led_class i915 drm i2c_algo_bit video backlight output i2c_core dm_crypt dm_mod
 Pid: 4197, comm: segctord Not tainted 2.6.30-gentoo-r4-64 #7
 Call Trace:
  [<ffffffff8023fa05>] ? sub_preempt_count+0x95/0xa0
  [<ffffffff802470f8>] warn_slowpath_common+0x78/0xd0
  [<ffffffff8024715f>] warn_slowpath_null+0xf/0x20
  [<ffffffff8023fa05>] sub_preempt_count+0x95/0xa0
  [<ffffffffa04ce4db>] nilfs_btnode_prepare_change_key+0x11b/0x190 [nilfs2]
  [<ffffffffa04d01ad>] nilfs_btree_assign_p+0x19d/0x1e0 [nilfs2]
  [<ffffffffa04d10ad>] nilfs_btree_assign+0xbd/0x130 [nilfs2]
  [<ffffffffa04cead7>] nilfs_bmap_assign+0x47/0x70 [nilfs2]
  [<ffffffffa04d9bc6>] nilfs_segctor_do_construct+0x956/0x20f0 [nilfs2]
  [<ffffffff805ac8e2>] ? _spin_unlock_irqrestore+0x12/0x40
  [<ffffffff803c06e0>] ? __up_write+0xe0/0x150
  [<ffffffff80262959>] ? up_write+0x9/0x10
  [<ffffffffa04ce9f3>] ? nilfs_bmap_test_and_clear_dirty+0x43/0x60 [nilfs2]
  [<ffffffffa04cd627>] ? nilfs_mdt_fetch_dirty+0x27/0x60 [nilfs2]
  [<ffffffffa04db5fc>] nilfs_segctor_construct+0x8c/0xd0 [nilfs2]
  [<ffffffffa04dc3dc>] nilfs_segctor_thread+0x15c/0x3a0 [nilfs2]
  [<ffffffffa04dbe20>] ? nilfs_construction_timeout+0x0/0x10 [nilfs2]
  [<ffffffff80252633>] ? add_timer+0x13/0x20
  [<ffffffff802370da>] ? __wake_up_common+0x5a/0x90
  [<ffffffff8025e960>] ? autoremove_wake_function+0x0/0x40
  [<ffffffffa04dc280>] ? nilfs_segctor_thread+0x0/0x3a0 [nilfs2]
  [<ffffffffa04dc280>] ? nilfs_segctor_thread+0x0/0x3a0 [nilfs2]
  [<ffffffff8025e556>] kthread+0x56/0x90
  [<ffffffff8020cdea>] child_rip+0xa/0x20
  [<ffffffff8025e500>] ? kthread+0x0/0x90
  [<ffffffff8020cde0>] ? child_rip+0x0/0x20

This problem was caused due to a missing radix_tree_preload() call in
the retry path of nilfs_btnode_prepare_change_key() function.

Reported-by: Eric A <eric225125@yahoo.com>
Reported-by: Jerome Poulin <jeromepoulin@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Jerome Poulin <jeromepoulin@gmail.com>
Cc: stable@kernel.org
---
 fs/nilfs2/btnode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..c668bca579c1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 		 * We cannot call radix_tree_preload for the kernels older
 		 * than 2.6.23, because it is not exported for modules.
 		 */
+retry:
 		err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 		if (err)
 			goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 				       (unsigned long long)oldkey,
 				       (unsigned long long)newkey);
 
-retry:
 		spin_lock_irq(&btnc->tree_lock);
 		err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
 		spin_unlock_irq(&btnc->tree_lock);
-- 
cgit v1.2.2


From 2920ee2b47fc8e6aebe1d1956b2725f48fa93cc5 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Aug 2009 15:27:26 +0000
Subject: [CIFS] potential NULL dereference in parse_DFS_referrals()

memory allocation may fail, prevent a NULL dereference

Pointed out by Roel Kluin

CC: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..5f0b80d39923 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3961,6 +3961,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		if (is_unicode) {
 			__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
 						GFP_KERNEL);
+			if (tmp == NULL) {
+				rc = -ENOMEM;
+				goto parse_DFS_referrals_exit;
+			}
 			cifsConvertToUCS((__le16 *) tmp, searchName,
 					PATH_MAX, nls_codepage, remap);
 			node->path_consumed = cifs_ucs2_bytes(tmp,
-- 
cgit v1.2.2


From d96f8f891f69ac1dc8c7bd82e27525de220c04e1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Jul 2009 00:09:33 -0500
Subject: xfs: add more statics & drop some unused functions

A lot more functions could be made static, but they need
forward declarations; this does some easy ones, and also
found a few unused functions in the process.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/linux-2.6/xfs_sync.c  | 15 ---------------
 fs/xfs/linux-2.6/xfs_sync.h  |  1 -
 fs/xfs/xfs_ag.h              |  3 ---
 fs/xfs/xfs_alloc.c           |  2 +-
 fs/xfs/xfs_bmap.c            |  2 +-
 fs/xfs/xfs_bmap.h            | 11 -----------
 fs/xfs/xfs_bmap_btree.c      | 20 ++++++++++----------
 fs/xfs/xfs_bmap_btree.h      |  1 -
 fs/xfs/xfs_btree.c           | 42 +-----------------------------------------
 fs/xfs/xfs_btree.h           | 15 ---------------
 fs/xfs/xfs_inode.c           |  8 ++++----
 fs/xfs/xfs_inode.h           |  5 -----
 fs/xfs/xfs_itable.c          |  2 +-
 fs/xfs/xfs_itable.h          |  5 -----
 fs/xfs/xfs_log_priv.h        |  2 --
 fs/xfs/xfs_log_recover.c     |  2 +-
 fs/xfs/xfs_mount.c           |  2 +-
 fs/xfs/xfs_mount.h           |  3 ---
 fs/xfs/xfs_mru_cache.c       | 29 -----------------------------
 fs/xfs/xfs_mru_cache.h       |  1 -
 fs/xfs/xfs_rw.h              |  6 ------
 fs/xfs/xfs_vnodeops.c        |  2 +-
 23 files changed, 22 insertions(+), 159 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..c709ed61a53e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -687,7 +687,7 @@ xfs_barrier_test(
 	return error;
 }
 
-void
+STATIC void
 xfs_mountfs_check_barriers(xfs_mount_t *mp)
 {
 	int error;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 }
 
-void
-xfs_inode_clear_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
-
-	read_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-}
-
 STATIC int
 xfs_reclaim_inode_now(
 	struct xfs_inode	*ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..af3cfeb18b0d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,9 +91,6 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
 #define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
 
-extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
-			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-
 /*
  * Size of the unlinked inode hash table in the agi.
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796d..6316004922d7 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist(
 /*
  * Read in the allocation group header (free/alloc section).
  */
-int					/* error */
+STATIC int				/* error */
 xfs_read_agf(
 	struct xfs_mount	*mp,	/* mount point structure */
 	struct xfs_trans	*tp,	/* transaction pointer */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
  * entry (null if none).  Else, *lastxp will be set to the index
  * of the found entry; *gotp will contain the entry.
  */
-xfs_bmbt_rec_host_t *			/* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
 xfs_bmap_search_multi_extents(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	xfs_fileoff_t	bno,		/* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
 	int			whichfork,
 	int			*count);
 
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
-			xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
 				ext_flag);
 }
 
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
-	xfs_bmbt_rec_t	*r,
-	xfs_bmbt_irec_t *s)
-{
-	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
-				get_unaligned_be64(&r->l1), s);
-}
-
 /*
  * Extract the blockcount field from an on disk bmap extent record.
  */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
 	*l1 = 0;
 }
 
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+				get_unaligned_be64(&r->l1), s);
+}
+
 STATIC void
 xfs_bmbt_trace_record(
 	struct xfs_btree_cur	*cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
 extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
 
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -645,46 +645,6 @@ xfs_btree_read_bufl(
 	return 0;
 }
 
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	uint		lock,		/* lock flags for read_buf */
-	xfs_buf_t	**bpp,		/* buffer for agno/agbno */
-	int		refval)		/* ref count value for buffer */
-{
-	xfs_buf_t	*bp;		/* return value */
-	xfs_daddr_t	d;		/* real disk block address */
-	int		error;
-
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-					mp->m_bsize, lock, &bp))) {
-		return error;
-	}
-	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
-		switch (refval) {
-		case XFS_ALLOC_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-			break;
-		case XFS_INO_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
-			break;
-		}
-	}
-	*bpp = bp;
-	return 0;
-}
-
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
@@ -2951,7 +2911,7 @@ error0:
  * inode we have to copy the single block it was pointing to into the
  * inode.
  */
-int
+STATIC int
 xfs_btree_kill_iroot(
 	struct xfs_btree_cur	*cur)
 {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -378,20 +378,6 @@ xfs_btree_read_bufl(
 	struct xfs_buf		**bpp,	/* buffer for fsbno */
 	int			refval);/* ref count value for buffer */
 
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	struct xfs_mount	*mp,	/* file system mount point */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	xfs_agblock_t		agbno,	/* allocation group block number */
-	uint			lock,	/* lock flags for read_buf */
-	struct xfs_buf		**bpp,	/* buffer for agno/agbno */
-	int			refval);/* ref count value for buffer */
-
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
 int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
 int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
 int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
 int xfs_btree_insert(struct xfs_btree_cur *, int *);
 int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
 	return 0;
 }
 
-void
+STATIC void
 xfs_dinode_from_disk(
 	xfs_icdinode_t		*to,
 	xfs_dinode_t		*from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
  * In that case the pages will still be in memory, but the inode size
  * will never have been updated.
  */
-xfs_fsize_t
+STATIC xfs_fsize_t
 xfs_file_last_byte(
 	xfs_inode_t	*ip)
 {
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
 /*
  * Resize an extent indirection array to new_size bytes.
  */
-void
+STATIC void
 xfs_iext_realloc_indirect(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	int		new_size)	/* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
 /*
  * Switch from indirection array to linear (direct) extent allocations.
  */
-void
+STATIC void
 xfs_iext_indirect_to_direct(
 	 xfs_ifork_t	*ifp)		/* inode fork pointer */
 {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..00e8505bde2d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,6 @@ void		xfs_ipin(xfs_inode_t *);
 void		xfs_iunpin(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
 void		xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
@@ -572,8 +571,6 @@ int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_buf **, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, xfs_daddr_t, uint);
-void		xfs_dinode_from_disk(struct xfs_icdinode *,
-				     struct xfs_dinode *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +589,6 @@ void		xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void		xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void		xfs_iext_indirect_to_direct(xfs_ifork_t *);
 void		xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
 void		xfs_iext_inline_to_direct(xfs_ifork_t *, int);
 void		xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..c471122d2234 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 
-int
+STATIC int
 xfs_internal_inum(
 	xfs_mount_t	*mp,
 	xfs_ino_t	ino)
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
 	void			*dibuff,
 	int			*stat);
 
-int
-xfs_internal_inum(
-	xfs_mount_t		*mp,
-	xfs_ino_t		ino);
-
 typedef int (*inumbers_fmt_pf)(
 	void			__user *ubuffer, /* buffer to write to */
 	const xfs_inogrp_t	*buffer,	/* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int	 xlog_find_tail(xlog_t	*log,
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void	 xlog_recover_process_iunlinks(xlog_t *log);
-
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
  * freeing of the inode and its removal from the list must be
  * atomic.
  */
-void
+STATIC void
 xlog_recover_process_iunlinks(
 	xlog_t		*log)
 {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
  *
  * The m_sb_lock must be held when this routine is called.
  */
-int
+STATIC int
 xfs_mod_incore_sb_unlocked(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
-extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
-			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
 extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -563,35 +563,6 @@ xfs_mru_cache_lookup(
 	return elem ? elem->value : NULL;
 }
 
-/*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock); /* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
 /*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_done(struct xfs_mru_cache *mru);
 
 #endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..ae65f0df87da 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -78,10 +78,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 				xfs_buf_t *bp, xfs_daddr_t blkno);
 
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..ceecafd1f9c1 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -718,7 +718,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
-- 
cgit v1.2.2


From fef1111ecdd2b5198afe91537a5cd4c6be80a255 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Jul 2009 21:35:43 -0500
Subject: un-static xfs_read_agf

CONFIG_XFS_DEBUG builds still need xfs_read_agf to be
non-static, oops.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ag.h    | 3 +++
 fs/xfs/xfs_alloc.c | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index af3cfeb18b0d..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,9 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
 #define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
 
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
 /*
  * Size of the unlinked inode hash table in the agi.
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 6316004922d7..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist(
 /*
  * Read in the allocation group header (free/alloc section).
  */
-STATIC int				/* error */
+int					/* error */
 xfs_read_agf(
 	struct xfs_mount	*mp,	/* mount point structure */
 	struct xfs_trans	*tp,	/* transaction pointer */
-- 
cgit v1.2.2


From eb00457d624c70a2492c319e7e05ed7e067b2794 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sun, 5 Jul 2009 12:23:35 -0500
Subject: xfs: remove XFS_INO64_OFFSET

Commit a19d9f887d81106d52cacbc9930207b487e07e0e removed the
ino64 option but left the XFS_INO64_OFFSET define it used
in place - just remove it.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_inum.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
 
 #if XFS_BIG_INUMS
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define	XFS_INO64_OFFSET	((xfs_ino_t)(1ULL << 32))
 #else
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
 #endif
-- 
cgit v1.2.2


From a0f7bfd3429fc48f4d1aa5364b9428d75798ba04 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 27 Jul 2009 18:15:25 +0200
Subject: fs/xfs: Correct redundant test

bp was tested for NULL a few lines before, followed by a return, and there
is no intervening modification of its value.

A simplified version of the semantic match that finds this problem is as
follows: (http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
expression E;
position p1,p2;
@@

if (x == NULL || ...) { ... when forall
   return ...; }
... when != \(x=E\|x--\|x++\|--x\|++x\|x-=E\|x+=E\|x|=E\|x&=E\|&x\)
(
*x == NULL
|
*x != NULL
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Acked-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_trans_buf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 			return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
-		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+		if (XFS_BUF_GETERROR(bp) != 0) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
 					  bp, blkno);
 			error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 			return error;
 		}
 #ifdef DEBUG
-		if (xfs_do_error && (bp != NULL)) {
+		if (xfs_do_error) {
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
-- 
cgit v1.2.2


From de89de6e0cf4b1eb13f27137cf2aa40d287aabdf Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Aug 2009 17:00:59 -0400
Subject: ext4: Restore wbc->range_start in ext4_da_writepages()

To solve a lock inversion problem, we implement part of the
range_cyclic algorithm in ext4_da_writepages().  (See commit 2acf2c26
for more details.)

As part of that change wbc->range_start was modified by ext4's
writepages function, which causes its callers to get confused since
they aren't expecting the filesystem to modify it.  The simplest fix
is to save and restore wbc->range_start in ext4_da_writepages.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d61fb523308f..ff659e757578 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2749,6 +2749,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	long pages_skipped;
 	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
+	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	trace_ext4_da_writepages(inode, wbc);
@@ -2917,6 +2918,7 @@ out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
+	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
-- 
cgit v1.2.2


From b3a3ca8ca0c3c29abc5b2bfe94bb14f3f4590df9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Aug 2009 23:13:11 -0400
Subject: ext4: Add new tracepoint: trace_ext4_da_write_pages()

Add a new tracepoint which shows the pages that will be written using
write_cache_pages() by ext4_da_writepages().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 15 +++++++++++++++
 fs/ext4/inode.c | 13 +------------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 41a76e163b99..81014f4ed22d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -113,6 +113,21 @@ struct ext4_allocation_request {
 	unsigned int flags;
 };
 
+/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+	struct inode *inode;
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
+	unsigned long first_page, next_page;	/* extent of pages */
+	struct writeback_control *wbc;
+	int io_done;
+	int pages_written;
+	int retval;
+};
+
 /*
  * Special inodes numbers
  */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ff659e757578..17802a96af9f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1875,18 +1875,6 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
  * them with writepage() call back
@@ -2863,6 +2851,7 @@ retry:
 			mpd.io_done = 1;
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
+		trace_ext4_da_write_pages(inode, &mpd);
 		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
-- 
cgit v1.2.2


From 1b3859bc9e20d764316346665fc93ecea2d2b176 Mon Sep 17 00:00:00 2001
From: Alexander Strakh <strakh@ispras.ru>
Date: Tue, 1 Sep 2009 17:02:24 +0000
Subject: [CIFS] Memory leak in ntlmv2 hash calculation

 in function calc_ntlmv2_hash memory is not released.
1. If in the line 333 we successfully allocate memory and assign it to
pctxt variable:
       pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
then we go to line 376 and exit wihout releasing memory pointed to by pctxt
variable.

Add a memory  releasing for pctxt variable before exit from function
calc_ntlmv2_hash.

Signed-off-by: Alexander Strakh <strakh@ispras.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
 	   compare with the NTLM example */
 	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
 
+	kfree(pctxt);
 	return rc;
 }
 
-- 
cgit v1.2.2


From bdb97adcdf0993adbd2eef44b4533620d43792de Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Thu, 20 Aug 2009 13:03:34 +0530
Subject: PATCH] cifs: fix broken mounts when a SSH tunnel is used (try #4)

One more try..

It seems there is a regression that got introduced while Jeff fixed
all the mount/umount races. While attempting to find whether a tcp
session is already existing, we were not checking whether the "port"
used are the same. When a second mount is attempted with a different
"port=" option, it is being ignored. Because of this the cifs mounts
that uses a SSH tunnel appears to be broken.

Steps to reproduce:

1. create 2 shares
# SSH Tunnel a SMB session
2. ssh -f -L 6111:127.0.0.1:445 root@localhost "sleep 86400"
3. ssh -f -L 6222:127.0.0.1:445 root@localhost "sleep 86400"
4. tcpdump -i lo 6111 &
5. mkdir -p /mnt/mnt1
6. mkdir -p /mnt/mnt2
7. mount.cifs //localhost/a /mnt/mnt1 -o username=guest,ip=127.0.0.1,port=6111
#(shows tcpdump activity on port 6111)
8. mount.cifs //localhost/b /mnt/mnt2 -o username=guest,ip=127.0.0.1,port=6222
#(shows tcpdump activity only on port 6111 and not on 6222

Fix by adding a check to compare the port _only_ if the user tries to
override the tcp port with "port=" option, before deciding that an
existing tcp session is found. Also, clean up a bit by replacing
if-else if by a switch statment while at it as suggested by Jeff.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..4e47a9b800ce 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 {
 	struct list_head *tmp;
 	struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (addr->ss_family == AF_INET &&
-		    (addr4->sin_addr.s_addr !=
-		     server->addr.sockAddr.sin_addr.s_addr))
-			continue;
-		else if (addr->ss_family == AF_INET6 &&
-			 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-					   &addr6->sin6_addr) ||
-			  server->addr.sockAddr6.sin6_scope_id !=
-					   addr6->sin6_scope_id))
-			continue;
+		switch (addr->ss_family) {
+		case AF_INET:
+			if (addr4->sin_addr.s_addr ==
+			    server->addr.sockAddr.sin_addr.s_addr) {
+				addr4->sin_port = htons(port);
+				/* user overrode default port? */
+				if (addr4->sin_port) {
+					if (addr4->sin_port !=
+					    server->addr.sockAddr.sin_port)
+						continue;
+				}
+				break;
+			} else
+				continue;
+
+		case AF_INET6:
+			if (ipv6_addr_equal(&addr6->sin6_addr,
+			    &server->addr.sockAddr6.sin6_addr) &&
+			    (addr6->sin6_scope_id == 
+			    server->addr.sockAddr6.sin6_scope_id)) {
+				addr6->sin6_port = htons(port);
+				/* user overrode default port? */
+				if (addr6->sin6_port) {
+				       if (addr6->sin6_port !=
+					   server->addr.sockAddr6.sin6_port)
+					       continue;
+				}
+				break;
+			} else 
+				continue;
+		}
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 
 	/* see if we already have a matching tcp_ses */
-	tcp_ses = cifs_find_tcp_session(&addr);
+	tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
 	if (tcp_ses)
 		return tcp_ses;
 
-- 
cgit v1.2.2


From ca43e3beee38623a3f80691b28623d6ecf214742 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 1 Sep 2009 17:20:50 +0000
Subject: [CIFS] Fix checkpatch warnings

Also update version number to 1.61

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  5 ++++-
 fs/cifs/cifsfs.h  |  2 +-
 fs/cifs/connect.c | 12 ++++++------
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
 Fix memory leak in reconnect.  Fix oops in DFS mount error path.
 Set s_maxbytes to smaller (the max that vfs can handle) so that
 sendfile will now work over cifs mounts again.  Add noforcegid
-and noforceuid mount parameters.
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
 
 Version 1.59
 ------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..094325e3f714 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.60"
+#define CIFS_VERSION   "1.61"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4e47a9b800ce..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1415,17 +1415,17 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 		case AF_INET6:
 			if (ipv6_addr_equal(&addr6->sin6_addr,
 			    &server->addr.sockAddr6.sin6_addr) &&
-			    (addr6->sin6_scope_id == 
+			    (addr6->sin6_scope_id ==
 			    server->addr.sockAddr6.sin6_scope_id)) {
 				addr6->sin6_port = htons(port);
 				/* user overrode default port? */
 				if (addr6->sin6_port) {
-				       if (addr6->sin6_port !=
+					if (addr6->sin6_port !=
 					   server->addr.sockAddr6.sin6_port)
-					       continue;
+						continue;
 				}
 				break;
-			} else 
+			} else
 				continue;
 		}
 
@@ -2657,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		return -EIO;
 
 	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
+	if (smb_buffer == NULL)
 		return -ENOMEM;
-	}
+
 	smb_buffer_response = smb_buffer;
 
 	header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
-- 
cgit v1.2.2


From 85c0b2ab5e69ca6133380ead1c50e0840d136b39 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 31 Aug 2009 20:56:51 -0300
Subject: xfs: factor out inode initialisation

Factor out code to initialize new inode clusters into a function of it's own.
This keeps xfs_ialloc_ag_alloc smaller and better structured and enables a
future inode cluster initialization transaction.  Also initialize the agno
variable earlier in xfs_ialloc_ag_alloc to avoid repeated byte swaps.

[hch:  The original patch is from Dave from his unpublished inode create
 transaction patch series, with some modifcations by me to apply stand-alone]

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 175 ++++++++++++++++++++++++++++------------------------
 1 file changed, 95 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ce9edf7f4cb4 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -152,6 +152,87 @@ xfs_inobt_get_rec(
 	return error;
 }
 
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		length,
+	unsigned int		gen)
+{
+	struct xfs_buf		*fbuf;
+	struct xfs_dinode	*free;
+	int			blks_per_cluster, nbufs, ninodes;
+	int			version;
+	int			i, j;
+	xfs_daddr_t		d;
+
+	/*
+	 * Loop over the new block(s), filling in the inodes.
+	 * For small block sizes, manipulate the inodes in buffers
+	 * which are multiples of the blocks size.
+	 */
+	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+		blks_per_cluster = 1;
+		nbufs = length;
+		ninodes = mp->m_sb.sb_inopblock;
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+				   mp->m_sb.sb_blocksize;
+		nbufs = length / blks_per_cluster;
+		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+	}
+
+	/*
+	 * Figure out what version number to use in the inodes we create.
+	 * If the superblock version has caught up to the one that supports
+	 * the new inode format, then use the new inode version.  Otherwise
+	 * use the old version so that old kernels will continue to be
+	 * able to use the file system.
+	 */
+	if (xfs_sb_version_hasnlink(&mp->m_sb))
+		version = 2;
+	else
+		version = 1;
+
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					 mp->m_bsize * blks_per_cluster,
+					 XFS_BUF_LOCK);
+		ASSERT(fbuf);
+		ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+		/*
+		 * Initialize all inodes in this buffer and then log them.
+		 *
+		 * XXX: It would be much better if we had just one transaction
+		 *	to log a whole cluster of inodes instead of all the
+		 *	individual transactions causing a lot of log traffic.
+		 */
+		xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		for (i = 0; i < ninodes; i++) {
+			int	ioffset = i << mp->m_sb.sb_inodelog;
+			uint	isize = sizeof(struct xfs_dinode);
+
+			free = xfs_make_iptr(mp, fbuf, i);
+			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			free->di_version = version;
+			free->di_gen = cpu_to_be32(gen);
+			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+		}
+		xfs_trans_inode_alloc_buf(tp, fbuf);
+	}
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
@@ -164,24 +245,15 @@ xfs_ialloc_ag_alloc(
 {
 	xfs_agi_t	*agi;		/* allocation group header */
 	xfs_alloc_arg_t	args;		/* allocation argument structure */
-	int		blks_per_cluster;  /* fs blocks per inode cluster */
 	xfs_btree_cur_t	*cur;		/* inode btree cursor */
-	xfs_daddr_t	d;		/* disk addr of buffer */
 	xfs_agnumber_t	agno;
 	int		error;
-	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
-	xfs_dinode_t	*free;		/* new free inode structure */
-	int		i;		/* inode counter */
-	int		j;		/* block counter */
-	int		nbufs;		/* num bufs of new inodes */
+	int		i;
 	xfs_agino_t	newino;		/* new first inode's number */
 	xfs_agino_t	newlen;		/* new number of inodes */
-	int		ninodes;	/* num inodes per buf */
 	xfs_agino_t	thisino;	/* current inode number, for loop */
-	int		version;	/* inode version number to use */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
-	unsigned int	gen;
 
 	args.tp = tp;
 	args.mp = tp->t_mountp;
@@ -202,12 +274,12 @@ xfs_ialloc_ag_alloc(
  	 */
 	agi = XFS_BUF_TO_AGI(agbp);
 	newino = be32_to_cpu(agi->agi_newino);
+	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 			XFS_IALLOC_BLOCKS(args.mp);
 	if (likely(newino != NULLAGINO &&
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.type = XFS_ALLOCTYPE_THIS_BNO;
 		args.mod = args.total = args.wasdel = args.isfl =
 			args.userdata = args.minalignslop = 0;
@@ -258,8 +330,7 @@ xfs_ialloc_ag_alloc(
 		 * For now, just allocate blocks up front.
 		 */
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		/*
 		 * Allocate a fixed-size extent of inodes.
 		 */
@@ -282,8 +353,7 @@ xfs_ialloc_ag_alloc(
 	if (isaligned && args.fsbno == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.alignment = xfs_ialloc_cluster_alignment(&args);
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -294,85 +364,30 @@ xfs_ialloc_ag_alloc(
 		return 0;
 	}
 	ASSERT(args.len == args.minlen);
-	/*
-	 * Convert the results.
-	 */
-	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
-	/*
-	 * Loop over the new block(s), filling in the inodes.
-	 * For small block sizes, manipulate the inodes in buffers
-	 * which are multiples of the blocks size.
-	 */
-	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
-		blks_per_cluster = 1;
-		nbufs = (int)args.len;
-		ninodes = args.mp->m_sb.sb_inopblock;
-	} else {
-		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
-				   args.mp->m_sb.sb_blocksize;
-		nbufs = (int)args.len / blks_per_cluster;
-		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
-	}
-	/*
-	 * Figure out what version number to use in the inodes we create.
-	 * If the superblock version has caught up to the one that supports
-	 * the new inode format, then use the new inode version.  Otherwise
-	 * use the old version so that old kernels will continue to be
-	 * able to use the file system.
-	 */
-	if (xfs_sb_version_hasnlink(&args.mp->m_sb))
-		version = 2;
-	else
-		version = 1;
 
 	/*
+	 * Stamp and write the inode buffers.
+	 *
 	 * Seed the new inode cluster with a random generation number. This
 	 * prevents short-term reuse of generation numbers if a chunk is
 	 * freed and then immediately reallocated. We use random numbers
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	gen = random32();
-	for (j = 0; j < nbufs; j++) {
-		/*
-		 * Get the block.
-		 */
-		d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
-				     args.agbno + (j * blks_per_cluster));
-		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
-					 args.mp->m_bsize * blks_per_cluster,
-					 XFS_BUF_LOCK);
-		ASSERT(fbuf);
-		ASSERT(!XFS_BUF_GETERROR(fbuf));
+	xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+			      random32());
 
-		/*
-		 * Initialize all inodes in this buffer and then log them.
-		 *
-		 * XXX: It would be much better if we had just one transaction to
-		 *      log a whole cluster of inodes instead of all the individual
-		 *      transactions causing a lot of log traffic.
-		 */
-		xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
-		for (i = 0; i < ninodes; i++) {
-			int	ioffset = i << args.mp->m_sb.sb_inodelog;
-			uint	isize = sizeof(struct xfs_dinode);
-
-			free = xfs_make_iptr(args.mp, fbuf, i);
-			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
-			free->di_version = version;
-			free->di_gen = cpu_to_be32(gen);
-			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
-		}
-		xfs_trans_inode_alloc_buf(tp, fbuf);
-	}
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
-	agno = be32_to_cpu(agi->agi_seqno);
 	down_read(&args.mp->m_peraglock);
 	args.mp->m_perag[agno].pagi_freecount += newlen;
 	up_read(&args.mp->m_peraglock);
 	agi->agi_newino = cpu_to_be32(newino);
+
 	/*
 	 * Insert records describing the new inode chunk into the btree.
 	 */
-- 
cgit v1.2.2


From 2e287a731e0607e0371dc6165b7dd3ebc67fa8e1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:56:58 -0300
Subject: xfs: improve xfs_inobt_get_rec prototype

Most callers of xfs_inobt_get_rec need to fill a xfs_inobt_rec_incore_t, and
those who don't yet are fine with a xfs_inobt_rec_incore_t, instead of the
three individual variables, too.  So just change xfs_inobt_get_rec to write
the output into a xfs_inobt_rec_incore_t directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 81 ++++++++++++++++++++-------------------------------
 fs/xfs/xfs_ialloc.h |  4 +--
 fs/xfs/xfs_itable.c | 84 ++++++++++++++++++++++++++++-------------------------
 3 files changed, 77 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ce9edf7f4cb4..72fa3bfc56eb 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -135,9 +135,7 @@ xfs_inobt_update(
 int					/* error */
 xfs_inobt_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		*ino,	/* output: starting inode of chunk */
-	__int32_t		*fcnt,	/* output: number of free inodes */
-	xfs_inofree_t		*free,	/* output: free inode mask */
+	xfs_inobt_rec_incore_t	*irec,	/* btree record */
 	int			*stat)	/* output: success/failure */
 {
 	union xfs_btree_rec	*rec;
@@ -145,9 +143,9 @@ xfs_inobt_get_rec(
 
 	error = xfs_btree_get_rec(cur, &rec, stat);
 	if (!error && *stat == 1) {
-		*ino = be32_to_cpu(rec->inobt.ir_startino);
-		*fcnt = be32_to_cpu(rec->inobt.ir_freecount);
-		*free = be64_to_cpu(rec->inobt.ir_free);
+		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 	}
 	return error;
 }
@@ -746,8 +744,8 @@ nextag:
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			freecount += rec.ir_freecount;
@@ -766,8 +764,7 @@ nextag:
 		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
 			goto error0;
 		if (i != 0 &&
-		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
 		    j == 1 &&
 		    rec.ir_freecount > 0) {
 			/*
@@ -799,10 +796,8 @@ nextag:
 				goto error1;
 			doneleft = !i;
 			if (!doneleft) {
-				if ((error = xfs_inobt_get_rec(tcur,
-						&trec.ir_startino,
-						&trec.ir_freecount,
-						&trec.ir_free, &i)))
+				error = xfs_inobt_get_rec(tcur, &trec, &i);
+				if (error)
 					goto error1;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
 			}
@@ -813,10 +808,8 @@ nextag:
 				goto error1;
 			doneright = !i;
 			if (!doneright) {
-				if ((error = xfs_inobt_get_rec(cur,
-						&rec.ir_startino,
-						&rec.ir_freecount,
-						&rec.ir_free, &i)))
+				error = xfs_inobt_get_rec(cur, &rec, &i);
+				if (error)
 					goto error1;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
 			}
@@ -876,11 +869,9 @@ nextag:
 						goto error1;
 					doneleft = !i;
 					if (!doneleft) {
-						if ((error = xfs_inobt_get_rec(
-							    tcur,
-							    &trec.ir_startino,
-							    &trec.ir_freecount,
-							    &trec.ir_free, &i)))
+						error = xfs_inobt_get_rec(
+							    tcur, &trec, &i);
+						if (error)
 							goto error1;
 						XFS_WANT_CORRUPTED_GOTO(i == 1,
 							error1);
@@ -896,11 +887,9 @@ nextag:
 						goto error1;
 					doneright = !i;
 					if (!doneright) {
-						if ((error = xfs_inobt_get_rec(
-							    cur,
-							    &rec.ir_startino,
-							    &rec.ir_freecount,
-							    &rec.ir_free, &i)))
+						error = xfs_inobt_get_rec(
+							    cur, &rec, &i);
+						if (error)
 							goto error1;
 						XFS_WANT_CORRUPTED_GOTO(i == 1,
 							error1);
@@ -919,8 +908,7 @@ nextag:
 				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
 			goto error0;
 		if (i == 1 &&
-		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
 		    j == 1 &&
 		    rec.ir_freecount > 0) {
 			/*
@@ -938,10 +926,8 @@ nextag:
 				goto error0;
 			ASSERT(i == 1);
 			for (;;) {
-				if ((error = xfs_inobt_get_rec(cur,
-						&rec.ir_startino,
-						&rec.ir_freecount, &rec.ir_free,
-						&i)))
+				error = xfs_inobt_get_rec(cur, &rec, &i);
+				if (error)
 					goto error0;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 				if (rec.ir_freecount > 0)
@@ -975,8 +961,8 @@ nextag:
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			freecount += rec.ir_freecount;
@@ -1084,8 +1070,8 @@ xfs_difree(
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			if (i) {
 				freecount += rec.ir_freecount;
@@ -1107,8 +1093,8 @@ xfs_difree(
 		goto error0;
 	}
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
-			&rec.ir_free, &i))) {
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error) {
 		cmn_err(CE_WARN,
 			"xfs_difree: xfs_inobt_get_rec()  returned an error %d on %s.  Returning error.",
 			error, mp->m_fsname);
@@ -1187,10 +1173,8 @@ xfs_difree(
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur,
-					&rec.ir_startino,
-					&rec.ir_freecount,
-					&rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			if (i) {
 				freecount += rec.ir_freecount;
@@ -1312,9 +1296,7 @@ xfs_imap(
 		chunk_agbno = agbno - offset_agbno;
 	} else {
 		xfs_btree_cur_t	*cur;	/* inode btree cursor */
-		xfs_agino_t	chunk_agino; /* first agino in inode chunk */
-		__int32_t	chunk_cnt; /* count of free inodes in chunk */
-		xfs_inofree_t	chunk_free; /* mask of free inodes in chunk */
+		xfs_inobt_rec_incore_t chunk_rec;
 		xfs_buf_t	*agbp;	/* agi buffer */
 		int		i;	/* temp state */
 
@@ -1337,8 +1319,7 @@ xfs_imap(
 			goto error0;
 		}
 
-		error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
-				&chunk_free, &i);
+		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
 		if (error) {
 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
 					"xfs_inobt_get_rec() failed");
@@ -1356,7 +1337,7 @@ xfs_imap(
 		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 		if (error)
 			return error;
-		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
 		offset_agbno = agbno - chunk_agbno;
 	}
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..52e72fe7e411 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -166,7 +166,7 @@ int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-			     __int32_t *fcnt, xfs_inofree_t *free, int *stat);
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+		xfs_inobt_rec_incore_t *rec, int *stat);
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c471122d2234..3ec13e8a8c54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -353,9 +353,6 @@ xfs_bulkstat(
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
 	int                     fmterror;/* bulkstat formatter result */
-	__int32_t		gcnt;	/* current btree rec's count */
-	xfs_inofree_t		gfree;	/* current btree rec's free mask */
-	xfs_agino_t		gino;	/* current btree rec's start inode */
 	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
@@ -442,6 +439,8 @@ xfs_bulkstat(
 		 * we need to get the remainder of the chunk we're in.
 		 */
 		if (agino > 0) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Lookup the inode chunk that this inode lives in.
 			 */
@@ -449,33 +448,33 @@ xfs_bulkstat(
 			if (!error &&	/* no I/O error */
 			    tmp &&	/* lookup succeeded */
 					/* got the record, should always work */
-			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) &&
+			    !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
 			    i == 1 &&
 					/* this is the right chunk */
-			    agino < gino + XFS_INODES_PER_CHUNK &&
+			    agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
 					/* lastino was not last in chunk */
-			    (chunkidx = agino - gino + 1) <
+			    (chunkidx = agino - r.ir_startino + 1) <
 				    XFS_INODES_PER_CHUNK &&
 					/* there are some left allocated */
 			    xfs_inobt_maskn(chunkidx,
-				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+				    XFS_INODES_PER_CHUNK - chunkidx) &
+				    ~r.ir_free) {
 				/*
 				 * Grab the chunk record.  Mark all the
 				 * uninteresting inodes (because they're
 				 * before our start point) free.
 				 */
 				for (i = 0; i < chunkidx; i++) {
-					if (XFS_INOBT_MASK(i) & ~gfree)
-						gcnt++;
+					if (XFS_INOBT_MASK(i) & ~r.ir_free)
+						r.ir_freecount++;
 				}
-				gfree |= xfs_inobt_maskn(0, chunkidx);
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				agino = gino + XFS_INODES_PER_CHUNK;
-				icount = XFS_INODES_PER_CHUNK - gcnt;
+				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+				icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
 			} else {
 				/*
 				 * If any of those tests failed, bump the
@@ -501,6 +500,8 @@ xfs_bulkstat(
 		 * until we run out of inodes or space in the buffer.
 		 */
 		while (irbp < irbufend && icount < ubcount) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Loop as long as we're unable to read the
 			 * inode btree.
@@ -518,43 +519,47 @@ xfs_bulkstat(
 			 * If ran off the end of the ag either with an error,
 			 * or the normal way, set end and stop collecting.
 			 */
-			if (error ||
-			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) ||
-			    i == 0) {
+			if (error) {
+				end_of_ag = 1;
+				break;
+			}
+
+			error = xfs_inobt_get_rec(cur, &r, &i);
+			if (error || i == 0) {
 				end_of_ag = 1;
 				break;
 			}
+
 			/*
 			 * If this chunk has any allocated inodes, save it.
 			 * Also start read-ahead now for this chunk.
 			 */
-			if (gcnt < XFS_INODES_PER_CHUNK) {
+			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
 				/*
 				 * Loop over all clusters in the next chunk.
 				 * Do a readahead if there are any allocated
 				 * inodes in that cluster.
 				 */
-				for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
-				     chunkidx = 0;
+				agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+				for (chunkidx = 0;
 				     chunkidx < XFS_INODES_PER_CHUNK;
 				     chunkidx += nicluster,
 				     agbno += nbcluster) {
-					if (xfs_inobt_maskn(chunkidx,
-							    nicluster) & ~gfree)
+					if (xfs_inobt_maskn(chunkidx, nicluster)
+							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster);
 				}
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				icount += XFS_INODES_PER_CHUNK - gcnt;
+				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 			}
 			/*
 			 * Set agino to after this chunk and bump the cursor.
 			 */
-			agino = gino + XFS_INODES_PER_CHUNK;
+			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &tmp);
 			cond_resched();
 		}
@@ -820,9 +825,7 @@ xfs_inumbers(
 	int		bufidx;
 	xfs_btree_cur_t	*cur;
 	int		error;
-	__int32_t	gcnt;
-	xfs_inofree_t	gfree;
-	xfs_agino_t	gino;
+	xfs_inobt_rec_incore_t r;
 	int		i;
 	xfs_ino_t	ino;
 	int		left;
@@ -870,9 +873,8 @@ xfs_inumbers(
 				continue;
 			}
 		}
-		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
-			&i)) ||
-		    i == 0) {
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
 			xfs_buf_relse(agbp);
 			agbp = NULL;
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +883,12 @@ xfs_inumbers(
 			agino = 0;
 			continue;
 		}
-		agino = gino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
-		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
-		buffer[bufidx].xi_allocmask = ~gfree;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		bufidx++;
 		left--;
 		if (bufidx == bcount) {
-- 
cgit v1.2.2


From afabc24a73bfee2656724b0a70395f1693eaa62b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:57:03 -0300
Subject: xfs: improve xfs_inobt_update prototype

Both callers of xfs_inobt_update have the record in form of a
xfs_inobt_rec_incore_t, so just pass a pointer to it instead of the
individual variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 72fa3bfc56eb..8819cdacf702 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -110,22 +110,19 @@ xfs_inobt_lookup_le(
 }
 
 /*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
 STATIC int				/* error */
 xfs_inobt_update(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free)	/* free inode mask */
+	xfs_inobt_rec_incore_t	*irec)	/* btree record */
 {
 	union xfs_btree_rec	rec;
 
-	rec.inobt.ir_startino = cpu_to_be32(ino);
-	rec.inobt.ir_freecount = cpu_to_be32(fcnt);
-	rec.inobt.ir_free = cpu_to_be64(free);
+	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
 	return xfs_btree_update(cur, &rec);
 }
 
@@ -946,8 +943,8 @@ nextag:
 	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
 	rec.ir_free &= ~XFS_INOBT_MASK(offset);
 	rec.ir_freecount--;
-	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
-			rec.ir_free)))
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
 		goto error0;
 	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
@@ -1149,12 +1146,14 @@ xfs_difree(
 	} else {
 		*delete = 0;
 
-		if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+		error = xfs_inobt_update(cur, &rec);
+		if (error) {
 			cmn_err(CE_WARN,
-				"xfs_difree: xfs_inobt_update()  returned an error %d on %s.  Returning error.",
+	"xfs_difree: xfs_inobt_update returned an error %d on %s.",
 				error, mp->m_fsname);
 			goto error0;
 		}
+
 		/* 
 		 * Change the inode free counts and log the ag/sb changes.
 		 */
-- 
cgit v1.2.2


From 0b48db80ba689edfd96ed06c3124d6cf1146de3f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dgc@sgi.com>
Date: Mon, 31 Aug 2009 20:57:09 -0300
Subject: xfs: factor out debug checks from xfs_dialloc and xfs_difree

Factor out a common helper from repeated debug checks in xfs_dialloc and
xfs_difree.

[hch: split out from Dave's dynamic allocation policy patches]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 131 ++++++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 8819cdacf702..18bf6eece54d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -147,6 +147,47 @@ xfs_inobt_get_rec(
 	return error;
 }
 
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+	struct xfs_btree_cur	*cur,
+	struct xfs_agi		*agi)
+{
+	if (cur->bc_nlevels == 1) {
+		xfs_inobt_rec_incore_t rec;
+		int		freecount = 0;
+		int		error;
+		int		i;
+
+		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		if (error)
+			return error;
+
+		do {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
+				return error;
+
+			if (i) {
+				freecount += rec.ir_freecount;
+				error = xfs_btree_increment(cur, 0, &i);
+				if (error)
+					return error;
+			}
+		} while (i == 1);
+
+		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+	}
+	return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)	0
+#endif
+
 /*
  * Initialise a new set of inodes.
  */
@@ -548,6 +589,7 @@ nextag:
 	}
 }
 
+
 /*
  * Visible inode allocation functions.
  */
@@ -733,27 +775,11 @@ nextag:
 	 */
 	if (!pagino)
 		pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int	freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			freecount += rec.ir_freecount;
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error0;
-		} while (i == 1);
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
 
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
 	/*
 	 * If in the same a.g. as the parent, try to get near the parent.
 	 */
@@ -951,25 +977,11 @@ nextag:
 	down_read(&mp->m_peraglock);
 	mp->m_perag[tagno].pagi_freecount--;
 	up_read(&mp->m_peraglock);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int	freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			freecount += rec.ir_freecount;
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error0;
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
 	*inop = ino;
@@ -1060,26 +1072,11 @@ xfs_difree(
 	 * Initialize the cursor.
 	 */
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				freecount += rec.ir_freecount;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-			}
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
 	/*
 	 * Look for the entry describing this inode.
 	 */
@@ -1165,26 +1162,10 @@ xfs_difree(
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
 	}
 
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int freecount = 0;
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				freecount += rec.ir_freecount;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-			}
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	return 0;
 
-- 
cgit v1.2.2


From 4254b0bbb1c0826b7443ffa593576696bc591aa2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:57:14 -0300
Subject: xfs: untangle xfs_dialloc

Clarify the control flow in xfs_dialloc.  Factor out a helper to go to the
next node from the current one and improve the control flow by expanding
composite if statements and using gotos.

The xfs_ialloc_next_rec helper is borrowed from Dave Chinners dynamic
allocation policy patches.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 291 +++++++++++++++++++++++++---------------------------
 1 file changed, 138 insertions(+), 153 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 18bf6eece54d..7679c1633053 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -589,6 +589,37 @@ nextag:
 	}
 }
 
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	if (left)
+		error = xfs_btree_decrement(cur, 0, &i);
+	else
+		error = xfs_btree_increment(cur, 0, &i);
+
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
+
 
 /*
  * Visible inode allocation functions.
@@ -644,8 +675,8 @@ xfs_dialloc(
 	int		j;		/* result code */
 	xfs_mount_t	*mp;		/* file system mount structure */
 	int		offset;		/* index of inode in chunk */
-	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
-	xfs_agnumber_t	pagno;		/* parent's allocation group number */
+	xfs_agino_t	pagino;		/* parent's AG relative inode # */
+	xfs_agnumber_t	pagno;		/* parent's AG number */
 	xfs_inobt_rec_incore_t rec;	/* inode allocation record */
 	xfs_agnumber_t	tagno;		/* testing allocation group number */
 	xfs_btree_cur_t	*tcur;		/* temp cursor */
@@ -781,186 +812,140 @@ nextag:
 		goto error0;
 
 	/*
-	 * If in the same a.g. as the parent, try to get near the parent.
+	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
-		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+		int		doneleft;	/* done, to the left */
+		int		doneright;	/* done, to the right */
+
+		error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		error = xfs_inobt_get_rec(cur, &rec, &j);
+		if (error)
 			goto error0;
-		if (i != 0 &&
-		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
-		    j == 1 &&
-		    rec.ir_freecount > 0) {
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		if (rec.ir_freecount > 0) {
 			/*
 			 * Found a free inode in the same chunk
-			 * as parent, done.
+			 * as the parent, done.
 			 */
+			goto alloc_inode;
 		}
+
+
 		/*
-		 * In the same a.g. as parent, but parent's chunk is full.
+		 * In the same AG as parent, but parent's chunk is full.
 		 */
-		else {
-			int	doneleft;	/* done, to the left */
-			int	doneright;	/* done, to the right */
 
-			if (error)
-				goto error0;
-			ASSERT(i == 1);
-			ASSERT(j == 1);
-			/*
-			 * Duplicate the cursor, search left & right
-			 * simultaneously.
-			 */
-			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-				goto error0;
-			/*
-			 * Search left with tcur, back up 1 record.
-			 */
-			if ((error = xfs_btree_decrement(tcur, 0, &i)))
-				goto error1;
-			doneleft = !i;
-			if (!doneleft) {
-				error = xfs_inobt_get_rec(tcur, &trec, &i);
-				if (error)
-					goto error1;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+		/* duplicate the cursor, search left & right simultaneously */
+		error = xfs_btree_dup_cursor(cur, &tcur);
+		if (error)
+			goto error0;
+
+		/* search left with tcur, back up 1 record */
+		error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+		if (error)
+			goto error1;
+
+		/* search right with cur, go forward 1 record. */
+		error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+		if (error)
+			goto error1;
+
+		/*
+		 * Loop until we find an inode chunk with a free inode.
+		 */
+		while (!doneleft || !doneright) {
+			int	useleft;  /* using left inode chunk this time */
+
+			/* figure out the closer block if both are valid. */
+			if (!doneleft && !doneright) {
+				useleft = pagino -
+				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+				  rec.ir_startino - pagino;
+			} else {
+				useleft = !doneleft;
 			}
-			/*
-			 * Search right with cur, go forward 1 record.
-			 */
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error1;
-			doneright = !i;
-			if (!doneright) {
-				error = xfs_inobt_get_rec(cur, &rec, &i);
-				if (error)
-					goto error1;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+
+			/* free inodes to the left? */
+			if (useleft && trec.ir_freecount) {
+				rec = trec;
+				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+				cur = tcur;
+				goto alloc_inode;
 			}
-			/*
-			 * Loop until we find the closest inode chunk
-			 * with a free one.
-			 */
-			while (!doneleft || !doneright) {
-				int	useleft;  /* using left inode
-						     chunk this time */
 
-				/*
-				 * Figure out which block is closer,
-				 * if both are valid.
-				 */
-				if (!doneleft && !doneright)
-					useleft =
-						pagino -
-						(trec.ir_startino +
-						 XFS_INODES_PER_CHUNK - 1) <
-						 rec.ir_startino - pagino;
-				else
-					useleft = !doneleft;
-				/*
-				 * If checking the left, does it have
-				 * free inodes?
-				 */
-				if (useleft && trec.ir_freecount) {
-					/*
-					 * Yes, set it up as the chunk to use.
-					 */
-					rec = trec;
-					xfs_btree_del_cursor(cur,
-						XFS_BTREE_NOERROR);
-					cur = tcur;
-					break;
-				}
-				/*
-				 * If checking the right, does it have
-				 * free inodes?
-				 */
-				if (!useleft && rec.ir_freecount) {
-					/*
-					 * Yes, it's already set up.
-					 */
-					xfs_btree_del_cursor(tcur,
-						XFS_BTREE_NOERROR);
-					break;
-				}
-				/*
-				 * If used the left, get another one
-				 * further left.
-				 */
-				if (useleft) {
-					if ((error = xfs_btree_decrement(tcur, 0,
-							&i)))
-						goto error1;
-					doneleft = !i;
-					if (!doneleft) {
-						error = xfs_inobt_get_rec(
-							    tcur, &trec, &i);
-						if (error)
-							goto error1;
-						XFS_WANT_CORRUPTED_GOTO(i == 1,
-							error1);
-					}
-				}
-				/*
-				 * If used the right, get another one
-				 * further right.
-				 */
-				else {
-					if ((error = xfs_btree_increment(cur, 0,
-							&i)))
-						goto error1;
-					doneright = !i;
-					if (!doneright) {
-						error = xfs_inobt_get_rec(
-							    cur, &rec, &i);
-						if (error)
-							goto error1;
-						XFS_WANT_CORRUPTED_GOTO(i == 1,
-							error1);
-					}
-				}
+			/* free inodes to the right? */
+			if (!useleft && rec.ir_freecount) {
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				goto alloc_inode;
 			}
-			ASSERT(!doneleft || !doneright);
+
+			/* get next record to check */
+			if (useleft) {
+				error = xfs_ialloc_next_rec(tcur, &trec,
+								 &doneleft, 1);
+			} else {
+				error = xfs_ialloc_next_rec(cur, &rec,
+								 &doneright, 0);
+			}
+			if (error)
+				goto error1;
 		}
+		ASSERT(!doneleft || !doneright);
 	}
+
 	/*
-	 * In a different a.g. from the parent.
+	 * In a different AG from the parent.
 	 * See if the most recently allocated block has any free.
 	 */
 	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
-		if ((error = xfs_inobt_lookup_eq(cur,
-				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+		error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino),
+					    0, 0, &i);
+		if (error)
 			goto error0;
-		if (i == 1 &&
-		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
-		    j == 1 &&
-		    rec.ir_freecount > 0) {
-			/*
-			 * The last chunk allocated in the group still has
-			 * a free inode.
-			 */
+
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, &rec, &j);
+			if (error)
+				goto error0;
+
+			if (j == 1 && rec.ir_freecount > 0) {
+				/*
+				 * The last chunk allocated in the group
+				 * still has a free inode.
+				 */
+				goto alloc_inode;
+			}
 		}
+
 		/*
-		 * None left in the last group, search the whole a.g.
+		 * None left in the last group, search the whole AG
 		 */
-		else {
+		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		for (;;) {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
 			if (error)
 				goto error0;
-			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (rec.ir_freecount > 0)
+				break;
+			error = xfs_btree_increment(cur, 0, &i);
+			if (error)
 				goto error0;
-			ASSERT(i == 1);
-			for (;;) {
-				error = xfs_inobt_get_rec(cur, &rec, &i);
-				if (error)
-					goto error0;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-				if (rec.ir_freecount > 0)
-					break;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			}
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		}
 	}
+
+alloc_inode:
 	offset = xfs_ialloc_find_free(&rec.ir_free);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
-- 
cgit v1.2.2


From 2187550525d7bcb8c87689e4eca41b1955bf9ac3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:58:21 -0300
Subject: xfs: rationalize xfs_inobt_lookup*

Currenly we have a xfs_inobt_lookup* variant for each comparism direction,
and all these get all three fields of the inobt records passed, while the
common case is just looking for the inode number and we have only marginally
more callers than xfs_inobt_lookup* variants.

So opencode a direct call to xfs_btree_lookup for the single case where we
need all fields, and replace xfs_inobt_lookup* with a xfs_inobt_looku that
just takes the inode number and the direction for all other callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 77 +++++++++++++++--------------------------------------
 fs/xfs/xfs_ialloc.h | 14 +++-------
 fs/xfs/xfs_itable.c | 12 +++++----
 3 files changed, 32 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 7679c1633053..748637cd70ff 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,56 +57,19 @@ xfs_ialloc_cluster_alignment(
 }
 
 /*
- * Lookup the record equal to ino in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
  */
 STATIC int				/* error */
-xfs_inobt_lookup_eq(
+xfs_inobt_lookup(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
+	xfs_lookup_t		dir,	/* <=, >=, == */
 	int			*stat)	/* success/failure */
 {
 	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup_ge(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+	cur->bc_rec.i.ir_freecount = 0;
+	cur->bc_rec.i.ir_free = 0;
+	return xfs_btree_lookup(cur, dir, stat);
 }
 
 /*
@@ -162,7 +125,7 @@ xfs_check_agi_freecount(
 		int		error;
 		int		i;
 
-		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 		if (error)
 			return error;
 
@@ -431,13 +394,17 @@ xfs_ialloc_ag_alloc(
 	for (thisino = newino;
 	     thisino < newino + newlen;
 	     thisino += XFS_INODES_PER_CHUNK) {
-		if ((error = xfs_inobt_lookup_eq(cur, thisino,
-				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+		cur->bc_rec.i.ir_startino = thisino;
+		cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+		cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+		error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			return error;
 		}
 		ASSERT(i == 0);
-		if ((error = xfs_btree_insert(cur, &i))) {
+		error = xfs_btree_insert(cur, &i);
+		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			return error;
 		}
@@ -818,7 +785,7 @@ nextag:
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
 
-		error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -904,8 +871,8 @@ nextag:
 	 * See if the most recently allocated block has any free.
 	 */
 	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
-		error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino),
-					    0, 0, &i);
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
 		if (error)
 			goto error0;
 
@@ -926,7 +893,7 @@ nextag:
 		/*
 		 * None left in the last group, search the whole AG
 		 */
-		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1065,9 +1032,9 @@ xfs_difree(
 	/*
 	 * Look for the entry describing this inode.
 	 */
-	if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
 		cmn_err(CE_WARN,
-			"xfs_difree: xfs_inobt_lookup_le returned()  an error %d on %s.  Returning error.",
+			"xfs_difree: xfs_inobt_lookup returned()  an error %d on %s.  Returning error.",
 			error, mp->m_fsname);
 		goto error0;
 	}
@@ -1277,10 +1244,10 @@ xfs_imap(
 		}
 
 		cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-		error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 		if (error) {
 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_lookup_le() failed");
+					"xfs_inobt_lookup() failed");
 			goto error0;
 		}
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 52e72fe7e411..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,18 +150,10 @@ xfs_ialloc_pagi_init(
         xfs_agnumber_t  agno);		/* allocation group number */
 
 /*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
  */
-int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt,	xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt,	xfs_inofree_t free, int *stat);
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+		xfs_lookup_t dir, int *stat);
 
 /*
  * Get the data from the pointed-to record.
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3ec13e8a8c54..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -444,7 +444,8 @@ xfs_bulkstat(
 			/*
 			 * Lookup the inode chunk that this inode lives in.
 			 */
-			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+						 &tmp);
 			if (!error &&	/* no I/O error */
 			    tmp &&	/* lookup succeeded */
 					/* got the record, should always work */
@@ -492,7 +493,7 @@ xfs_bulkstat(
 			/*
 			 * Start of ag.  Lookup the first inode chunk.
 			 */
-			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
 			icount = 0;
 		}
 		/*
@@ -511,8 +512,8 @@ xfs_bulkstat(
 				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
 						be32_to_cpu(agi->agi_length))
 					break;
-				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
-							    &tmp);
+				error = xfs_inobt_lookup(cur, agino,
+							 XFS_LOOKUP_GE, &tmp);
 				cond_resched();
 			}
 			/*
@@ -858,7 +859,8 @@ xfs_inumbers(
 				continue;
 			}
 			cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
-			error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+						 &tmp);
 			if (error) {
 				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 				cur = NULL;
-- 
cgit v1.2.2


From bd169565993b39b9b4b102cdac8b13e0a259ce2f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dgc@sgi.com>
Date: Mon, 31 Aug 2009 20:58:28 -0300
Subject: xfs: speed up free inode search

Don't search too far - abort if it is outside a certain radius and simply do
a linear search for the first free inode.  In AGs with a million inodes this
can speed up allocation speed by 3-4x.

[hch: ported to the new xfs_ialloc.c world order]

Signed-off-by: Dave Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ag.h     |   9 ++++
 fs/xfs/xfs_ialloc.c | 133 +++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 115 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
 	int		pagb_count;	/* pagb slots in use */
 	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
+
+	/*
+	 * Inode allocation search lookup optimisation.
+	 * If the pagino matches, the search for new inodes
+	 * doesn't need to search the near ones again straight away
+	 */
+	xfs_agino_t	pagl_pagino;
+	xfs_agino_t	pagl_leftrec;
+	xfs_agino_t	pagl_rightrec;
 #ifdef __KERNEL__
 	spinlock_t	pagb_lock;	/* lock for pagb_list */
 
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 748637cd70ff..d12d805f20a5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -587,6 +587,30 @@ xfs_ialloc_next_rec(
 	return 0;
 }
 
+STATIC int
+xfs_ialloc_get_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_agino_t		agino,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
 
 /*
  * Visible inode allocation functions.
@@ -766,6 +790,8 @@ nextag:
 	 */
 	agno = tagno;
 	*IO_agbp = NULL;
+
+ restart_pagno:
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
 	/*
 	 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -782,8 +808,10 @@ nextag:
 	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
+		xfs_perag_t	*pag = &mp->m_perag[agno];
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
+		int		searchdistance = 10;
 
 		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 		if (error)
@@ -813,15 +841,33 @@ nextag:
 		if (error)
 			goto error0;
 
-		/* search left with tcur, back up 1 record */
-		error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
-		if (error)
-			goto error1;
+		/*
+		 * Skip to last blocks looked up if same parent inode.
+		 */
+		if (pagino != NULLAGINO &&
+		    pag->pagl_pagino == pagino &&
+		    pag->pagl_leftrec != NULLAGINO &&
+		    pag->pagl_rightrec != NULLAGINO) {
+			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+						   &trec, &doneleft, 1);
+			if (error)
+				goto error1;
 
-		/* search right with cur, go forward 1 record. */
-		error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
-		if (error)
-			goto error1;
+			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+						   &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		} else {
+			/* search left with tcur, back up 1 record */
+			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+			if (error)
+				goto error1;
+
+			/* search right with cur, go forward 1 record. */
+			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		}
 
 		/*
 		 * Loop until we find an inode chunk with a free inode.
@@ -829,6 +875,17 @@ nextag:
 		while (!doneleft || !doneright) {
 			int	useleft;  /* using left inode chunk this time */
 
+			if (!--searchdistance) {
+				/*
+				 * Not in range - save last search
+				 * location and allocate a new inode
+				 */
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto newino;
+			}
+
 			/* figure out the closer block if both are valid. */
 			if (!doneleft && !doneright) {
 				useleft = pagino -
@@ -843,12 +900,20 @@ nextag:
 				rec = trec;
 				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 				cur = tcur;
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
 				goto alloc_inode;
 			}
 
 			/* free inodes to the right? */
 			if (!useleft && rec.ir_freecount) {
 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
 				goto alloc_inode;
 			}
 
@@ -863,14 +928,28 @@ nextag:
 			if (error)
 				goto error1;
 		}
-		ASSERT(!doneleft || !doneright);
+
+		/*
+		 * We've reached the end of the btree. because
+		 * we are only searching a small chunk of the
+		 * btree each search, there is obviously free
+		 * inodes closer to the parent inode than we
+		 * are now. restart the search again.
+		 */
+		pag->pagl_pagino = NULLAGINO;
+		pag->pagl_leftrec = NULLAGINO;
+		pag->pagl_rightrec = NULLAGINO;
+		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		goto restart_pagno;
 	}
 
 	/*
 	 * In a different AG from the parent.
 	 * See if the most recently allocated block has any free.
 	 */
-	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+newino:
+	if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
 		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
 					 XFS_LOOKUP_EQ, &i);
 		if (error)
@@ -889,27 +968,27 @@ nextag:
 				goto alloc_inode;
 			}
 		}
+	}
 
-		/*
-		 * None left in the last group, search the whole AG
-		 */
-		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	/*
+	 * None left in the last group, search the whole AG
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+	for (;;) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (rec.ir_freecount > 0)
+			break;
+		error = xfs_btree_increment(cur, 0, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		for (;;) {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (rec.ir_freecount > 0)
-				break;
-			error = xfs_btree_increment(cur, 0, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		}
 	}
 
 alloc_inode:
-- 
cgit v1.2.2


From 13e6d5cdde0e785aa943810f08b801cadd0935df Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 21:00:31 -0300
Subject: xfs: merge fsync and O_SYNC handling

The guarantees for O_SYNC are exactly the same as the ones we need to
make for an fsync call (and given that Linux O_SYNC is O_DSYNC the
equivalent is fdadatasync, but we treat both the same in XFS), except
with a range data writeout.  Jan Kara has started unifying these two
path for filesystems using the generic helpers, and I've started to
look at XFS.

The actual transaction commited by xfs_fsync and xfs_write_sync_logforce
has a different transaction number, but actually is exactly the same.
We'll only use the fsync transaction going forward.  One major difference
is that xfs_write_sync_logforce never issues a cache flush unless we
commit a transaction causing that as a side-effect, which is an obvious
bug in the O_SYNC handling.  Second all the locking and i_update_size
vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934
never made it to xfs_write_sync_logforce, so we add them back.

To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait
call is moved up to xfs_file_fsync, so that we don't wait on the whole
file after we already waited for our portion in xfs_write.

We'll also use a plain call to filemap_write_and_wait_range instead
of the previous sync_page_rang which did it in two steps including
an half-hearted inode write out that doesn't help us.

Once we're done with this also remove the now useless i_update_size
tracking.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c |  1 -
 fs/xfs/linux-2.6/xfs_file.c | 19 +++++++---
 fs/xfs/linux-2.6/xfs_lrw.c  |  7 ++--
 fs/xfs/xfs_iget.c           |  1 -
 fs/xfs/xfs_inode.h          |  1 -
 fs/xfs/xfs_inode_item.c     |  8 -----
 fs/xfs/xfs_rw.c             | 84 ---------------------------------------------
 fs/xfs/xfs_rw.h             |  1 -
 fs/xfs/xfs_trans.h          |  2 +-
 fs/xfs/xfs_vnodeops.c       | 11 ++----
 10 files changed, 23 insertions(+), 112 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
 	if (ip->i_d.di_size < isize) {
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
-		ip->i_update_size = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
  */
 STATIC int
 xfs_file_fsync(
-	struct file	*filp,
-	struct dentry	*dentry,
-	int		datasync)
+	struct file		*file,
+	struct dentry		*dentry,
+	int			datasync)
 {
-	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-	return -xfs_fsync(XFS_I(dentry->d_inode));
+	struct inode		*inode = dentry->d_inode;
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	/* capture size updates in I/O completion before writing the inode. */
+	error = filemap_fdatawait(inode->i_mapping);
+	if (error)
+		return error;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	return -xfs_fsync(ip);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
 
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+		loff_t end = pos + ret - 1;
 		int error2;
 
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+
+		error2 = filemap_write_and_wait_range(mapping, pos, end);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_ilock(xip, iolock);
-		error2 = xfs_write_sync_logforce(mp, xip);
+
+		error2 = xfs_fsync(xip);
 		if (!error)
 			error = error2;
 	}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..3323826274b3 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_update_core = 0;
-	ip->i_update_size = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 	ip->i_size = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 00e8505bde2d..ed566c248ae4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
-	unsigned char		i_update_size;	/* di_size field is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..2e69412195e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -262,14 +262,6 @@ xfs_inode_item_format(
 		SYNCHRONIZE();
 	}
 
-	/*
-	 * We don't have to worry about re-ordering here because
-	 * the update_size field is protected by the inode lock
-	 * and we have that held in exclusive mode.
-	 */
-	if (ip->i_update_size)
-		ip->i_update_size = 0;
-
 	/*
 	 * Make sure to get the latest atime from the Linux inode.
 	 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -87,90 +87,6 @@ xfs_write_clear_setuid(
 	return 0;
 }
 
-/*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-						XFS_SWRITE_LOG_RES(mp),
-						0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
 /*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index ae65f0df87da..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define	XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define	XFS_TRANS_WRITEID		18
 #define	XFS_TRANS_ADDAFORK		19
 #define	XFS_TRANS_ATTRINVAL		20
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ceecafd1f9c1..03d3100559ac 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
 
 	xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
-- 
cgit v1.2.2


From aa72a5cf00001d0b952c7c755be404b9118ceb2e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 21:51:52 -0300
Subject: xfs: simplify xfs_trans_iget

xfs_trans_iget is a wrapper for xfs_iget that adds the inode to the
transaction after it is read.  Except when the inode already is in the
inode cache, in which case it returns the existing locked inode with
increment lock recursion counts.

Now, no one in the tree every decrements these lock recursion counts,
so any user of this gets a potential double unlock when both the original
owner of the inode and the xfs_trans_iget caller unlock it.  When looking
back in a git bisect in the historic XFS tree there was only one place
that decremented these counts, xfs_trans_iput.  Introduced in commit
ca25df7a840f426eb566d52667b6950b92bb84b5 by Adam Sweeney in 1993,
and removed in commit 19f899a3ab155ff6a49c0c79b06f2f61059afaf3 by
Steve Lord in 2003.  And as long as it didn't slip through git bisects
cracks never actually used in that time frame.

A quick audit of the callers of xfs_trans_iget shows that no caller
really relies on this behaviour fortunately - xfs_ialloc allows this
inode from disk so it must not be there before, and all the RT allocator
routines only every add each RT bitmap inode once.

In addition to removing lots of code and reducing the size of the inode
item this patch also avoids the double inode cache lookup in each
create/mkdir/mknod transaction.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_iget.c        | 26 ---------------
 fs/xfs/xfs_inode.h       |  2 --
 fs/xfs/xfs_inode_item.c  |  2 --
 fs/xfs/xfs_inode_item.h  |  2 --
 fs/xfs/xfs_trans_inode.c | 86 +++---------------------------------------------
 5 files changed, 5 insertions(+), 113 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3323826274b3..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -455,32 +455,6 @@ out_error_or_again:
 	return error;
 }
 
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t	*mp,
-		 xfs_ino_t	ino,
-		 xfs_trans_t	*tp)
-{
-	xfs_inode_t	*ip;
-	xfs_perag_t	*pag;
-
-	pag = xfs_get_perag(mp, ino);
-	read_lock(&pag->pag_ici_lock);
-	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-
-	/* the returned inode must match the transaction */
-	if (ip && (ip->i_transp != tp))
-		return NULL;
-	return ip;
-}
-
 /*
  * Decrement reference count of an inode structure and unlock it.
  *
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ed566c248ae4..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -467,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * xfs_iget.c prototypes.
  */
-xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
-				  struct xfs_trans *);
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			 uint, uint, xfs_inode_t **, xfs_daddr_t);
 void		xfs_iput(xfs_inode_t *, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2e69412195e6..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -704,8 +704,6 @@ xfs_inode_item_unlock(
 	 * Clear out the fields of the inode log item particular
 	 * to the current transaction.
 	 */
-	iip->ili_ilock_recur = 0;
-	iip->ili_iolock_recur = 0;
 	iip->ili_flags = 0;
 
 	/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_ilock_recur;   /* lock recursion count */
-	unsigned short		ili_iolock_recur;  /* lock recursion count */
 	unsigned short		ili_flags;	   /* misc flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
 
 
 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction.  If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively.  This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength.  That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa.  Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int			error;
-	xfs_inode_t		*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked.  In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller.  We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }
 
 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
-- 
cgit v1.2.2


From f4378b6eaf63492c0f9a397d52813e0ae6b49e7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 14:03:08 -0400
Subject: xfs: actually enable the swapext compat handler

Fix a small typo in the compat ioctl handler that cause the swapext
compat handler to never be called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
 		return xfs_file_ioctl(filp, cmd, p);
-	case XFS_IOC_SWAPEXT: {
+	case XFS_IOC_SWAPEXT_32: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;
 
-- 
cgit v1.2.2


From 3725867dccfb83e4b0cff64e916a04258f300591 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 14:03:08 -0400
Subject: xfs: actually enable the swapext compat handler

Fix a small typo in the compat ioctl handler that cause the swapext
compat handler to never be called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
 		return xfs_file_ioctl(filp, cmd, p);
-	case XFS_IOC_SWAPEXT: {
+	case XFS_IOC_SWAPEXT_32: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;
 
-- 
cgit v1.2.2


From 8c58b54574d18b3782b2a261b9dae3c4e90b6b04 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 17 Aug 2009 16:26:49 -0400
Subject: cifs: send IPv6 addr in upcall with colon delimiters

Make it easier on the upcall program by adding ':' delimiters between
each group of hex digits.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	if (server->addr.sockAddr.sin_family == AF_INET)
 		sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
 	else if (server->addr.sockAddr.sin_family == AF_INET6)
-		sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+		sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
 	else
 		goto out;
 
-- 
cgit v1.2.2


From 8e047d09ee1143058b71f40c5f4d028dde52d883 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Aug 2009 10:11:10 -0400
Subject: cifs: use tcon pointer in cifs_show_options

Minor nit: we already have a tcon pointer so we don't need to
dereference cifs_sb again.

Also initialize the vars in the declaration.

Reported-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..b750aa502ff7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *tcon;
-
-	cifs_sb = CIFS_SB(m->mnt_sb);
-	tcon = cifs_sb->tcon;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
 
-	seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+	seq_printf(s, ",unc=%s", tcon->treeName);
 	if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
 	if (tcon->ses->domainName)
-- 
cgit v1.2.2


From 1b49c5566136455764a8d17ead25784f534c202d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Aug 2009 10:11:11 -0400
Subject: cifs: protect GlobalOplock_Q with its own spinlock

Right now, the GlobalOplock_Q is protected by the GlobalMid_Lock. That
lock is also used for completely unrelated purposes (mostly for managing
the global mid queue). Give the list its own dedicated spinlock
(cifs_oplock_lock) and rename the list to cifs_oplock_list to
eliminate the camel-case.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    | 13 +++++++------
 fs/cifs/cifsglob.h  |  6 +++++-
 fs/cifs/transport.c | 17 ++++++++---------
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b750aa502ff7..3610e9958b4c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -986,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
 		if (try_to_freeze())
 			continue;
 
-		spin_lock(&GlobalMid_Lock);
-		if (list_empty(&GlobalOplock_Q)) {
-			spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		if (list_empty(&cifs_oplock_list)) {
+			spin_unlock(&cifs_oplock_lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(39*HZ);
 		} else {
-			oplock_item = list_entry(GlobalOplock_Q.next,
+			oplock_item = list_entry(cifs_oplock_list.next,
 						struct oplock_q_entry, qhead);
 			cFYI(1, ("found oplock item to write out"));
 			pTcon = oplock_item->tcon;
 			inode = oplock_item->pinode;
 			netfid = oplock_item->netfid;
-			spin_unlock(&GlobalMid_Lock);
+			spin_unlock(&cifs_oplock_lock);
 			DeleteOplockQEntry(oplock_item);
 			/* can not grab inode sem here since it would
 				deadlock when oplock received on delete
@@ -1055,7 +1055,7 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalOplock_Q);
+	INIT_LIST_HEAD(&cifs_oplock_list);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
 	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1084,6 +1084,7 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
+	spin_lock_init(&cifs_oplock_lock);
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..f100399ee05e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -656,7 +656,11 @@ GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
  */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
+/* Global list of oplocks */
+GLOBAL_EXTERN struct list_head cifs_oplock_list;
+
+/* Protects the cifs_oplock_list */
+GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
 
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
 		temp->pinode = pinode;
 		temp->tcon = tcon;
 		temp->netfid = fid;
-		spin_lock(&GlobalMid_Lock);
-		list_add_tail(&temp->qhead, &GlobalOplock_Q);
-		spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		list_add_tail(&temp->qhead, &cifs_oplock_list);
+		spin_unlock(&cifs_oplock_lock);
 	}
 	return temp;
-
 }
 
 void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
 {
-	spin_lock(&GlobalMid_Lock);
+	spin_lock(&cifs_oplock_lock);
     /* should we check if list empty first? */
 	list_del(&oplockEntry->qhead);
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 	kmem_cache_free(cifs_oplock_cachep, oplockEntry);
 }
 
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 	if (tcon == NULL)
 		return;
 
-	spin_lock(&GlobalMid_Lock);
-	list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
+	spin_lock(&cifs_oplock_lock);
+	list_for_each_entry(temp, &cifs_oplock_list, qhead) {
 		if ((temp->tcon) && (temp->tcon == tcon)) {
 			list_del(&temp->qhead);
 			kmem_cache_free(cifs_oplock_cachep, temp);
 		}
 	}
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 }
 
 static int
-- 
cgit v1.2.2


From 6ab409b53dcaf28f83d518a6702f904b7cee3f41 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Mon, 31 Aug 2009 11:07:12 -0400
Subject: cifs: Replace wrtPending with a real reference count

Currently, cifs_close() tries to wait until all I/O is complete and then
frees the file private data.  If I/O does not completely in a reasonable
amount of time it frees the structure anyway, leaving a potential use-
after-free situation.

This patch changes the wrtPending counter to a complete reference count and
lets the last user free the structure.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c  |  4 ++--
 fs/cifs/cifsglob.h | 15 ++++++++++++++-
 fs/cifs/dir.c      |  2 +-
 fs/cifs/file.c     | 43 +++++++++++--------------------------------
 fs/cifs/inode.c    |  6 +++---
 5 files changed, 31 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
 	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return pntsd;
 }
 
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
 	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return rc;
 }
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f100399ee05e..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
 	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool messageMode:1;	/* for pipes: message vs byte mode */
-	atomic_t wrtPending;   /* handle in use - defer close */
+	atomic_t count;		/* reference count */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
 };
 
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+	atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	if (atomic_dec_and_test(&cifs_file->count))
+		kfree(cifs_file);
+}
+
 /*
  * One of these for each file inode
  */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->wrtPending, 0);
+	atomic_set(&pCifsFile->count, 1);
 
 	/* set the following in open now
 			pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
 	private_data->pInode = inode;
 	private_data->invalidHandle = false;
 	private_data->closePend = false;
-	/* we have to track num writers to the inode, since writepages
-	does not tell us which handle the write is for so there can
-	be a close (overlapping with write) of the filehandle that
-	cifs_writepages chose to use */
-	atomic_set(&private_data->wrtPending, 0);
+	/* Initialize reference count to one.  The private data is
+	freed on the release of the last reference */
+	atomic_set(&private_data->count, 1);
 
 	return private_data;
 }
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
 			if (!pTcon->need_reconnect) {
 				write_unlock(&GlobalSMBSeslock);
 				timeout = 2;
-				while ((atomic_read(&pSMBFile->wrtPending) != 0)
+				while ((atomic_read(&pSMBFile->count) != 1)
 					&& (timeout <= 2048)) {
 					/* Give write a better chance to get to
 					server ahead of the close.  We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
 					msleep(timeout);
 					timeout *= 4;
 				}
-				if (atomic_read(&pSMBFile->wrtPending))
-					cERROR(1, ("close with pending write"));
 				if (!pTcon->need_reconnect &&
 				    !pSMBFile->invalidHandle)
 					rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		list_del(&pSMBFile->flist);
 		list_del(&pSMBFile->tlist);
 		write_unlock(&GlobalSMBSeslock);
-		timeout = 10;
-		/* We waited above to give the SMBWrite a chance to issue
-		   on the wire (so we do not get SMBWrite returning EBADF
-		   if writepages is racing with close.  Note that writepages
-		   does not specify a file handle, so it is possible for a file
-		   to be opened twice, and the application close the "wrong"
-		   file handle - in these cases we delay long enough to allow
-		   the SMBWrite to get on the wire before the SMB Close.
-		   We allow total wait here over 45 seconds, more than
-		   oplock break time, and more than enough to allow any write
-		   to complete on the server, or to time out on the client */
-		while ((atomic_read(&pSMBFile->wrtPending) != 0)
-				&& (timeout <= 50000)) {
-			cERROR(1, ("writes pending, delay free of handle"));
-			msleep(timeout);
-			timeout *= 8;
-		}
-		kfree(file->private_data);
+		cifsFileInfo_put(file->private_data);
 		file->private_data = NULL;
 	} else
 		rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 			if (!open_file->invalidHandle) {
 				/* found a good file */
 				/* lock it so it will not be closed on us */
-				atomic_inc(&open_file->wrtPending);
+				cifsFileInfo_get(open_file);
 				read_unlock(&GlobalSMBSeslock);
 				return open_file;
 			} /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
-			atomic_inc(&open_file->wrtPending);
+			cifsFileInfo_get(open_file);
 
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
 				else { /* start over in case this was deleted */
 				       /* since the list could be modified */
 					read_lock(&GlobalSMBSeslock);
-					atomic_dec(&open_file->wrtPending);
+					cifsFileInfo_put(open_file);
 					goto refind_writable;
 				}
 			}
@@ -1309,7 +1288,7 @@ refind_writable:
 			read_lock(&GlobalSMBSeslock);
 			/* can not use this handle, no write
 			   pending on this one after all */
-			atomic_dec(&open_file->wrtPending);
+			cifsFileInfo_put(open_file);
 
 			if (open_file->closePend) /* list could have changed */
 				goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	if (open_file) {
 		bytes_written = cifs_write(open_file->pfile, write_data,
 					   to-from, &offset);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
 		if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
 						   long_op);
-				atomic_dec(&open_file->wrtPending);
+				cifsFileInfo_put(open_file);
 				cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
 	if (open_file == NULL)
 		CIFSSMBClose(xid, pTcon, netfid);
 	else
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 out:
 	return rc;
 }
@@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		__u32 npid = open_file->pid;
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		cFYI(1, ("SetFSize for attrs rc = %d", rc));
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
@@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 	} else {
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
-- 
cgit v1.2.2


From 81e251766e8f8c9d7abb5db784e58c5c45f82797 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 19:56:55 -0400
Subject: xfs: un-static xfs_inobt_lookup

xfs_inobt_lookup is also used in xfs_itable.c, remove the STATIC modifier
from it's declaration to fix non-debug builds.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index d12d805f20a5..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -59,7 +59,7 @@ xfs_ialloc_cluster_alignment(
 /*
  * Lookup a record by ino in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_inobt_lookup(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agino_t		ino,	/* starting inode of chunk */
-- 
cgit v1.2.2


From e0e817392b9acf2c98d3be80c233dddb1b52003d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 2 Sep 2009 09:13:40 +0100
Subject: CRED: Add some configurable debugging [try #6]

Add a config option (CONFIG_DEBUG_CREDENTIALS) to turn on some debug checking
for credential management.  The additional code keeps track of the number of
pointers from task_structs to any given cred struct, and checks to see that
this number never exceeds the usage count of the cred struct (which includes
all references, not just those from task_structs).

Furthermore, if SELinux is enabled, the code also checks that the security
pointer in the cred struct is never seen to be invalid.

This attempts to catch the bug whereby inode_has_perm() faults in an nfsd
kernel thread on seeing cred->security be a NULL pointer (it appears that the
credential struct has been previously released):

	http://www.kerneloops.org/oops.php?number=252883

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/nfsd/auth.c   | 4 ++++
 fs/nfsd/nfssvc.c | 2 ++
 fs/nfsd/vfs.c    | 3 +++
 fs/open.c        | 2 ++
 4 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
+	validate_process_creds();
+
 	/* discard any old override before preparing the new set */
 	revert_creds(get_cred(current->real_cred));
 	new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
+	validate_process_creds();
 	put_cred(override_creds(new));
 	put_cred(new);
+	validate_process_creds();
 	return 0;
 
 oom:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..24d58adfe5fd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -496,7 +496,9 @@ nfsd(void *vrqstp)
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
+		validate_process_creds();
 		svc_process(rqstp);
+		validate_process_creds();
 
 		/* Unlock export hash tables */
 		exp_readunlock();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..8fa09bfbcba7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	__be32		err;
 	int		host_err;
 
+	validate_process_creds();
+
 	/*
 	 * If we get here, then the client has already done an "open",
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 out_nfserr:
 	err = nfserrno(host_err);
 out:
+	validate_process_creds();
 	return err;
 }
 
diff --git a/fs/open.c b/fs/open.c
index 40d1fa25f5aa..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -959,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
 	int error;
 	struct file *f;
 
+	validate_creds(cred);
+
 	/*
 	 * We must always pass in a valid mount pointer.   Historically
 	 * callers got away with not passing it, but we must enforce this at
-- 
cgit v1.2.2


From 988abe4075e5748d9f7c79d9dfffa0cf5291611b Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Wed, 2 Sep 2009 17:02:24 -0500
Subject: xfs: xfs_showargs() reports group *and* project quotas enabled

If you enable group or project quotas on an XFS file system, then the
mount table presented through /proc/self/mounts erroneously shows
that both options are in effect for the file system.  The root of
the problem is some bad logic in the xfs_showargs() function, which
is used to format the file system type-specific options in effect
for a file system.

The problem originated in this GIT commit:
    Move platform specific mount option parse out of core XFS code
    Date: 11/22/07
    Author: Dave Chinner
    SHA1 ID: a67d7c5f5d25d0b13a4dfb182697135b014fa478

For XFS quotas, project and group quota management are mutually
exclusive--only one can be in effect at a time.  There are two
parts to managing quotas:  aggregating usage information; and
enforcing limits.  It is possible to have a quota in effect
(aggregating usage) but not enforced.

These features are recorded on an XFS mount point using these flags:
    XFS_PQUOTA_ACCT - Project quotas are aggregated
    XFS_GQUOTA_ACCT - Group quotas are aggregated
    XFS_OQUOTA_ENFD - Project/group quotas are enforced

The code in error is in fs/xfs/linux-2.6/xfs_super.c:

        if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
                seq_puts(m, "," MNTOPT_PRJQUOTA);
        else if (mp->m_qflags & XFS_PQUOTA_ACCT)
                seq_puts(m, "," MNTOPT_PQUOTANOENF);

        if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
                seq_puts(m, "," MNTOPT_GRPQUOTA);
        else if (mp->m_qflags & XFS_GQUOTA_ACCT)
                seq_puts(m, "," MNTOPT_GQUOTANOENF);

The problem is that XFS_OQUOTA_ENFD will be set in mp->m_qflags
if either group or project quotas are enforced, and as a result
both MNTOPT_PRJQUOTA and MNTOPT_GRPQUOTA will be shown as mount
options.

Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c709ed61a53e..5d7c60ac77b4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -579,15 +579,19 @@ xfs_showargs(
 	else if (mp->m_qflags & XFS_UQUOTA_ACCT)
 		seq_puts(m, "," MNTOPT_UQUOTANOENF);
 
-	if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_PRJQUOTA);
-	else if (mp->m_qflags & XFS_PQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
-	if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_GRPQUOTA);
-	else if (mp->m_qflags & XFS_GQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	/* Either project or group quotas can be active, not both */
+
+	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_PRJQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_PQUOTANOENF);
+	} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_GRPQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	}
 
 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, "," MNTOPT_NOQUOTA);
-- 
cgit v1.2.2


From 6c1488fd581a447ec87c4b59f0d33f95f0aa441b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Wed, 2 Sep 2009 11:40:32 -0400
Subject: IMA: open new file for read

When creating a new file, ima_path_check() assumed the new file
was being opened for write. Call ima_path_check() with the
appropriate acc_mode so that the read/write counters are
incremented correctly.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..ee01308a01d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1533,9 +1533,11 @@ int may_open(struct path *path, int acc_mode, int flag)
 	if (error)
 		return error;
 
-	error = ima_path_check(path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+	error = ima_path_check(path, acc_mode ?
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+			       ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
 			       IMA_COUNT_UPDATE);
+
 	if (error)
 		return error;
 	/*
-- 
cgit v1.2.2


From 9162ab2000e08be076883b5a295a771223264ce8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 3 Sep 2009 12:07:17 -0400
Subject: cifs: consolidate reconnect logic in smb_init routines

There's a large cut and paste chunk of code in smb_init and
small_smb_init to handle reconnects. Break it out into a separate
function, clean it up and have both routines call it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 312 +++++++++++++++++++++---------------------------------
 1 file changed, 123 insertions(+), 189 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5f0b80d39923..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	   to this tcon */
 }
 
-/* Allocate and return pointer to an SMB request buffer, and set basic
-   SMB information in the SMB header.  If the return code is zero, this
-   function must have filled in request_buf pointer */
+/* reconnect the socket, tcon, and smb session if needed */
 static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-		void **request_buf)
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 {
 	int rc = 0;
+	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
+	struct nls_table *nls_codepage;
 
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			(and ulogoff which does not have tcon)
-			are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			   (smb_command != SMB_COM_OPEN_ANDX) &&
-			   (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
+	/*
+	 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+	 * tcp and smb session status done differently for those three - in the
+	 * calling routine
+	 */
+	if (!tcon)
+		return 0;
+
+	ses = tcon->ses;
+	server = ses->server;
+
+	/*
+	 * only tree disconnect, open, and write, (and ulogoff which does not
+	 * have tcon) are allowed as we start force umount
+	 */
+	if (tcon->tidStatus == CifsExiting) {
+		if (smb_command != SMB_COM_WRITE_ANDX &&
+		    smb_command != SMB_COM_OPEN_ANDX &&
+		    smb_command != SMB_COM_TREE_DISCONNECT) {
+			cFYI(1, ("can not send cmd %d while umounting",
+				smb_command));
+			return -ENODEV;
 		}
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-				  (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-				/* Give Demultiplex thread up to 10 seconds to
-				   reconnect, should be greater than cifs socket
-				   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-							 CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-							CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-							CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-						      "reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes back on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
+	}
 
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				   update due to negotiated smb buffer size
-				   shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* we do not know sb */,
-						NULL /* no vol info */);
-				}
+	if (ses->status == CifsExiting)
+		return -EIO;
 
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-					case SMB_COM_READ_ANDX:
-					case SMB_COM_WRITE_ANDX:
-					case SMB_COM_CLOSE:
-					case SMB_COM_FIND_CLOSE2:
-					case SMB_COM_LOCKING_ANDX: {
-						unload_nls(nls_codepage);
-						return -EAGAIN;
-					}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
+	/*
+	 * Give demultiplex thread up to 10 seconds to reconnect, should be
+	 * greater than cifs socket timeout which is 7 seconds
+	 */
+	while (server->tcpStatus == CifsNeedReconnect) {
+		wait_event_interruptible_timeout(server->response_q,
+			(server->tcpStatus == CifsGood), 10 * HZ);
 
-		} else {
-			return -EIO;
+		/* is TCP session is reestablished now ?*/
+		if (server->tcpStatus != CifsNeedReconnect)
+			break;
+
+		/*
+		 * on "soft" mounts we wait once. Hard mounts keep
+		 * retrying until process is killed or server comes
+		 * back on-line
+		 */
+		if (!tcon->retry || ses->status == CifsExiting) {
+			cFYI(1, ("gave up waiting on reconnect in smb_init"));
+			return -EHOSTDOWN;
 		}
 	}
+
+	if (!ses->need_reconnect && !tcon->need_reconnect)
+		return 0;
+
+	nls_codepage = load_nls_default();
+
+	/*
+	 * need to prevent multiple threads trying to simultaneously
+	 * reconnect the same SMB session
+	 */
+	down(&ses->sesSem);
+	if (ses->need_reconnect)
+		rc = cifs_setup_session(0, ses, nls_codepage);
+
+	/* do we need to reconnect tcon? */
+	if (rc || !tcon->need_reconnect) {
+		up(&ses->sesSem);
+		goto out;
+	}
+
+	mark_open_files_invalid(tcon);
+	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+	up(&ses->sesSem);
+	cFYI(1, ("reconnect tcon rc = %d", rc));
+
+	if (rc)
+		goto out;
+
+	/*
+	 * FIXME: check if wsize needs updated due to negotiated smb buffer
+	 * 	  size shrinking
+	 */
+	atomic_inc(&tconInfoReconnectCount);
+
+	/* tell server Unix caps we support */
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+	/*
+	 * Removed call to reopen open files here. It is safer (and faster) to
+	 * reopen files one at a time as needed in read and write.
+	 *
+	 * FIXME: what about file locks? don't we need to reclaim them ASAP?
+	 */
+
+out:
+	/*
+	 * Check if handle based operation so we know whether we can continue
+	 * or not without returning to caller to reset file handle
+	 */
+	switch (smb_command) {
+	case SMB_COM_READ_ANDX:
+	case SMB_COM_WRITE_ANDX:
+	case SMB_COM_CLOSE:
+	case SMB_COM_FIND_CLOSE2:
+	case SMB_COM_LOCKING_ANDX:
+		rc = -EAGAIN;
+	}
+
+	unload_nls(nls_codepage);
+	return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+   SMB information in the SMB header.  If the return code is zero, this
+   function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+		void **request_buf)
+{
+	int rc = 0;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 {
 	int rc = 0;
 
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			  (and ulogoff which does not have tcon)
-			  are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			   (smb_command != SMB_COM_OPEN_ANDX) &&
-			   (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
-		}
-
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-				  (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-				/* Give Demultiplex thread up to 10 seconds to
-				   reconnect, should be greater than cifs socket
-				   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-							CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-							CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-						CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-						      "reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				update due to negotiated smb buffer size
-				shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* do not know sb */,
-						NULL /* no vol info */);
-				}
-
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-					case SMB_COM_READ_ANDX:
-					case SMB_COM_WRITE_ANDX:
-					case SMB_COM_CLOSE:
-					case SMB_COM_FIND_CLOSE2:
-					case SMB_COM_LOCKING_ANDX: {
-						unload_nls(nls_codepage);
-						return -EAGAIN;
-					}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
-
-		} else {
-			return -EIO;
-		}
-	}
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
-- 
cgit v1.2.2


From 8379e7c46cc48f51197dd663fc6676f47f2a1e71 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 4 Sep 2009 11:12:01 -0700
Subject: ocfs2: ocfs2_write_begin_nolock() should handle len=0

Bug introduced by mainline commit e7432675f8ca868a4af365759a8d4c3779a3d922
The bug causes ocfs2_write_begin_nolock() to oops when len=0.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b401654011a2..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1747,8 +1747,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * we know zeros will only be needed in the first and/or last cluster.
 	 */
 	if (clusters_to_alloc || extents_to_split ||
-	    wc->w_desc[0].c_needs_zero ||
-	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
 		cluster_of_pages = 1;
 	else
 		cluster_of_pages = 0;
-- 
cgit v1.2.2


From 7f1346a9de6a689b03f2c1c3a387e49ec64da267 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Sat, 5 Sep 2009 09:28:54 -0400
Subject: ext4: Declare seq_operations and file_operations structures as const

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3086b3c65adc..5ef6daf0cdc6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2202,7 +2202,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations ext4_mb_seq_history_ops = {
+static const struct seq_operations ext4_mb_seq_history_ops = {
 	.start  = ext4_mb_seq_history_start,
 	.next   = ext4_mb_seq_history_next,
 	.stop   = ext4_mb_seq_history_stop,
@@ -2284,7 +2284,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
 	return count;
 }
 
-static struct file_operations ext4_mb_seq_history_fops = {
+static const struct file_operations ext4_mb_seq_history_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ext4_mb_seq_history_open,
 	.read		= seq_read,
@@ -2363,7 +2363,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
 	.start  = ext4_mb_seq_groups_start,
 	.next   = ext4_mb_seq_groups_next,
 	.stop   = ext4_mb_seq_groups_stop,
@@ -2384,7 +2384,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 
 }
 
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ext4_mb_seq_groups_open,
 	.read		= seq_read,
-- 
cgit v1.2.2


From d0646f7b636d067d715fab52a2ba9c6f0f46b0d7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 5 Sep 2009 12:50:43 -0400
Subject: ext4: Remove journal_checksum mount option and enable it by default

There's no real cost for the journal checksum feature, and we should
make sure it is enabled all the time.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 -
 fs/ext4/super.c | 20 ++++++--------------
 2 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 81014f4ed22d..4dc64ed58d26 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -711,7 +711,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4037fe0b5a5c..f1815d3bcfd5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1280,11 +1280,9 @@ static int parse_options(char *options, struct super_block *sb,
 			*journal_devnum = option;
 			break;
 		case Opt_journal_checksum:
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
-			break;
+			break;	/* Kept for backwards compatibility */
 		case Opt_journal_async_commit:
 			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
 			set_opt(sbi->s_mount_opt, NOLOAD);
@@ -2751,20 +2749,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	}
 
-	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		jbd2_journal_set_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+	jbd2_journal_set_features(sbi->s_journal,
+				  JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+		jbd2_journal_set_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
-		jbd2_journal_set_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+	else
 		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else {
-		jbd2_journal_clear_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	}
 
 	/* We have now updated the journal if required, so we can
 	 * validate the data journaling mode. */
-- 
cgit v1.2.2


From a2a8474c3fff88d8dd52d05cb450563fb26fd26c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 5 Sep 2009 11:17:13 -0700
Subject: exec: do not sleep in TASK_TRACED under ->cred_guard_mutex

Tom Horsley reports that his debugger hangs when it tries to read
/proc/pid_of_tracee/maps, this happens since

	"mm_for_maps: take ->cred_guard_mutex to fix the race with exec"
	04b836cbf19e885f8366bccb2e4b0474346c02d

commit in 2.6.31.

But the root of the problem lies in the fact that do_execve() path calls
tracehook_report_exec() which can stop if the tracer sets PT_TRACE_EXEC.

The tracee must not sleep in TASK_TRACED holding this mutex.  Even if we
remove ->cred_guard_mutex from mm_for_maps() and proc_pid_attr_write(),
another task doing PTRACE_ATTACH should not hang until it is killed or the
tracee resumes.

With this patch do_execve() does not use ->cred_guard_mutex directly and
we do not hold it throughout, instead:

	- introduce prepare_bprm_creds() helper, it locks the mutex
	  and calls prepare_exec_creds() to initialize bprm->cred.

	- install_exec_creds() drops the mutex after commit_creds(),
	  and thus before tracehook_report_exec()->ptrace_stop().

	  or, if exec fails,

	  free_bprm() drops this mutex when bprm->cred != NULL which
	  indicates install_exec_creds() was not called.

Reported-by: Tom Horsley <tom.horsley@att.net>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 17 ++++-------------
 fs/exec.c   | 63 +++++++++++++++++++++++++++++++++++++------------------------
 2 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	retval = prepare_bprm_creds(bprm);
+	if (retval)
 		goto out_free;
-	current->in_execve = 1;
-
-	retval = -ENOMEM;
-	bprm->cred = prepare_exec_creds();
-	if (!bprm->cred)
-		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
 	if (retval < 0)
-		goto out_unlock;
+		goto out_free;
 	clear_in_exec = retval;
+	current->in_execve = 1;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
 out_unmark:
 	if (clear_in_exec)
 		current->fs->in_exec = 0;
-
-out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/fs/exec.c b/fs/exec.c
index fb4f3cdda78c..172ceb6edde4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1015,6 +1015,35 @@ out:
 
 EXPORT_SYMBOL(flush_old_exec);
 
+/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * and unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	bprm->cred = prepare_exec_creds();
+	if (likely(bprm->cred))
+		return 0;
+
+	mutex_unlock(&current->cred_guard_mutex);
+	return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+	free_arg_pages(bprm);
+	if (bprm->cred) {
+		mutex_unlock(&current->cred_guard_mutex);
+		abort_creds(bprm->cred);
+	}
+	kfree(bprm);
+}
+
 /*
  * install the new credentials for this executable
  */
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
 
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
-
-	/* cred_guard_mutex must be held at least to this point to prevent
+	/*
+	 * cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
-	 * credentials; any time after this it may be unlocked */
-
+	 * credentials; any time after this it may be unlocked.
+	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 
 EXPORT_SYMBOL(search_binary_handler);
 
-void free_bprm(struct linux_binprm *bprm)
-{
-	free_arg_pages(bprm);
-	if (bprm->cred)
-		abort_creds(bprm->cred);
-	kfree(bprm);
-}
-
 /*
  * sys_execve() executes a new program.
  */
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	retval = prepare_bprm_creds(bprm);
+	if (retval)
 		goto out_free;
-	current->in_execve = 1;
-
-	retval = -ENOMEM;
-	bprm->cred = prepare_exec_creds();
-	if (!bprm->cred)
-		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
 	if (retval < 0)
-		goto out_unlock;
+		goto out_free;
 	clear_in_exec = retval;
+	current->in_execve = 1;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
 out_unmark:
 	if (clear_in_exec)
 		current->fs->in_exec = 0;
-
-out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
-- 
cgit v1.2.2


From 9de6886ec6e37f45807266a702bb7621498395ad Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 5 Sep 2009 00:25:37 -0400
Subject: ext2: fix unbalanced kmap()/kunmap()

In ext2_rename(), dir_page is acquired through ext2_dotdot().  It is
then released through ext2_set_link() but only if old_dir != new_dir.
Failing that, the pkmap reference count is never decremented and the
page remains pinned forever.  Repeat that a couple times with highmem
pages and all pkmap slots get exhausted, and every further kmap() calls
end up stalling on the pkmap_map_wait queue at which point the whole
system comes to a halt.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/namei.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..78d9b925fc94 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	if (dir_de) {
 		if (old_dir != new_dir)
 			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+		else {
+			kunmap(dir_page);
+			page_cache_release(dir_page);
+		}
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
-- 
cgit v1.2.2


From 5f3481e9a80c240f169b36ea886e2325b9aeb745 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 5 Sep 2009 21:42:42 -0400
Subject: ext4: fix cache flush in ext4_sync_file

We need to flush the write cache unconditionally in ->fsync, otherwise
writes into already allocated blocks can get lost.  Writes into fully
allocated files are very common when using disk images for
virtualization, and without this fix can easily lose data after
an fdatasync, which is the typical implementation for a cache flush on
the virtual drive.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..ab418c0f502d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -92,9 +92,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
-		if (journal && (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	}
 out:
+	if (journal && (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
-- 
cgit v1.2.2


From 70d5d3dcea47c16058d2b093c29e07fdf61b56ad Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:28:22 -0400
Subject: ext4: Fix wrong comparisons in mext_check_arguments()

The mext_check_arguments() function in move_extents.c has wrong
comparisons.  orig_start which is passed from user-space is block
unit, but i_size of inode is byte unit, therefore the checks do not
work fine.  This mis-check leads to the overflow of 'len' and then
hits BUG_ON() in ext4_move_extents().  The patch fixes this issue.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Reviewed-by: Greg Freemyer <greg.freemyer@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5821e0bee917..c593eb2b193a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -898,6 +898,10 @@ mext_check_arguments(struct inode *orig_inode,
 			  struct inode *donor_inode, __u64 orig_start,
 			  __u64 donor_start, __u64 *len, __u64 moved_len)
 {
+	ext4_lblk_t orig_blocks, donor_blocks;
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+
 	/* Regular file check */
 	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
 		ext4_debug("ext4 move extent: The argument files should be "
@@ -972,43 +976,47 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	if (orig_inode->i_size > donor_inode->i_size) {
-		if (orig_start >= donor_inode->i_size) {
+		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start >= donor_blocks) {
 			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file size "
-			"[%lld] [ino:orig %lu, donor_inode %lu]\n",
-			orig_start, donor_inode->i_size,
+			"[%llu] should be less than donor file blocks "
+			"[%u] [ino:orig %lu, donor %lu]\n",
+			orig_start, donor_blocks,
 			orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > donor_inode->i_size) {
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start + *len > donor_blocks) {
 			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file size [%lld]."
-				"So adjust length from %llu to %lld "
+				"be less than donor file blocks [%u]."
+				"So adjust length from %llu to %llu "
 				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_inode->i_size,
-				*len, donor_inode->i_size - orig_start,
+				orig_start + *len, donor_blocks,
+				*len, donor_blocks - orig_start,
 				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_inode->i_size - orig_start;
+			*len = donor_blocks - orig_start;
 		}
 	} else {
-		if (orig_start >= orig_inode->i_size) {
+		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= orig_blocks) {
 			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file size "
-				"[%lld] [inode:orig %lu, donor %lu]\n",
-				 orig_start, orig_inode->i_size,
+				"should be less than original file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				 orig_start, orig_blocks,
 				orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > orig_inode->i_size) {
+		if (orig_start + *len > orig_blocks) {
 			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %lld. Because it should be "
-				"less than original file size "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
 				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_inode->i_size - orig_start,
+				*len, orig_blocks - orig_start,
 				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_inode->i_size - orig_start;
+			*len = orig_blocks - orig_start;
 		}
 	}
 
-- 
cgit v1.2.2


From daea696dbac0e33af3cfe304efbfb8d74e0effe6 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 22:11:55 -0400
Subject: ext4: Remove unneeded BUG_ON() in ext4_move_extents()

The ext4_move_extents() functions checks with BUG_ON() whether the
exchanged blocks count accords with request blocks count.  But, if the
target range (orig_start + len) includes sparse block(s), 'moved_len'
(exchanged blocks count) does not agree with 'len' (request blocks
count), since sparse block is not counted in 'moved_len'.  This causes
us to hit the BUG_ON(), even though the function succeeded.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c593eb2b193a..c8c66b167cd7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1322,8 +1322,5 @@ out2:
 	if (ret)
 		return ret;
 
-	/* All of the specified blocks must be exchanged in succeed */
-	BUG_ON(*moved_len != len);
-
 	return 0;
 }
-- 
cgit v1.2.2


From 8d6669133d8cdbb7cbe0e1f0f3744e7802a84afe Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 22:46:29 -0400
Subject: ext4: Return exchanged blocks count to user space in failure

Return exchanged blocks count (moved_len) to user space,
if ext4_move_extents() failed on the way.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
 					me.donor_start, me.len, &me.moved_len);
 		fput(donor_filp);
 
-		if (!err)
-			if (copy_to_user((struct move_extent *)arg,
-				&me, sizeof(me)))
-				return -EFAULT;
+		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+			return -EFAULT;
+
 		return err;
 	}
 
-- 
cgit v1.2.2


From 44fc48f7048ab9657b524938a832fec4e0acea98 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 23:12:41 -0400
Subject: ext4: Fix small typo for move_extent_per_page()

This function means moving extents every page, so change its name from
move_exgtent_par_page().

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.co.jp>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c8c66b167cd7..4c4491cef357 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -740,7 +740,7 @@ out:
  * on success, or a negative error value on failure.
  */
 static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		  pgoff_t orig_page_offset, int data_offset_in_page,
 		  int block_len_in_page, int uninit)
 {
@@ -1267,7 +1267,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret = move_extent_par_page(o_filp, donor_inode,
+			ret = move_extent_per_page(o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
 						block_len_in_page, uninit);
-- 
cgit v1.2.2


From acd0c935178649f72c44ec49ca83bee35ce1f79e Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 4 Sep 2009 13:08:46 -0400
Subject: IMA: update ima_counts_put

- As ima_counts_put() may be called after the inode has been freed,
verify that the inode is not NULL, before dereferencing it.

- Maintain the IMA file counters in may_open() properly, decrementing
any counter increments on subsequent errors.

Reported-by: Ciprian Docan <docan@eden.rutgers.edu>
Reported-by: J.R. Okajima <hooanon05@yahoo.co.jp>
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..1f13751693a5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1542,28 +1542,31 @@ int may_open(struct path *path, int acc_mode, int flag)
 	 * An append-only file must be opened in append mode for writing.
 	 */
 	if (IS_APPEND(inode)) {
+		error = -EPERM;
 		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
-			return -EPERM;
+			goto err_out;
 		if (flag & O_TRUNC)
-			return -EPERM;
+			goto err_out;
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if (flag & O_NOATIME)
-		if (!is_owner_or_cap(inode))
-			return -EPERM;
+		if (!is_owner_or_cap(inode)) {
+			error = -EPERM;
+			goto err_out;
+		}
 
 	/*
 	 * Ensure there are no outstanding leases on the file.
 	 */
 	error = break_lease(inode, flag);
 	if (error)
-		return error;
+		goto err_out;
 
 	if (flag & O_TRUNC) {
 		error = get_write_access(inode);
 		if (error)
-			return error;
+			goto err_out;
 
 		/*
 		 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1584,17 @@ int may_open(struct path *path, int acc_mode, int flag)
 		}
 		put_write_access(inode);
 		if (error)
-			return error;
+			goto err_out;
 	} else
 		if (flag & FMODE_WRITE)
 			vfs_dq_init(inode);
 
 	return 0;
+err_out:
+	ima_counts_put(path, acc_mode ?
+		       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+		       ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
+	return error;
 }
 
 /*
-- 
cgit v1.2.2


From a827eaffff07c7d58a4cb32158cbeb4849f4e33a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 22:36:03 -0400
Subject: ext4: Take page lock before looking at attached buffer_heads flags

In order to check whether the buffer_heads are mapped we need to hold
page lock. Otherwise a reclaim can cleanup the attached buffer_heads.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17802a96af9f..5c0d17066f4a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5286,12 +5286,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		len = PAGE_CACHE_SIZE;
 
+	lock_page(page);
+	/*
+	 * return if we have all the buffers mapped. This avoid
+	 * the need to call write_begin/write_end which does a
+	 * journal_start/journal_stop which can block and take
+	 * long time
+	 */
 	if (page_has_buffers(page)) {
-		/* return if we have all the buffers mapped */
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-				       ext4_bh_unmapped))
+					ext4_bh_unmapped)) {
+			unlock_page(page);
 			goto out_unlock;
+		}
 	}
+	unlock_page(page);
 	/*
 	 * OK, we need to fill the hole... Do write_begin write_end
 	 * to do block allocation/reservation.We are not holding
-- 
cgit v1.2.2


From 80e42468d65475e92651e62175bb7807773321d0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 8 Sep 2009 08:21:26 -0400
Subject: ext4: print more sysadmin-friendly message in check_block_validity()

Drop the WARN_ON(1), as he stack trace is not appropriate, since it is
triggered by file system corruption, and it misleads users into
thinking there is a kernel bug.  In addition, change the message
displayed by ext4_error() to make it clear that this is a file system
corruption problem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5c0d17066f4a..6253ecdac67f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1121,16 +1121,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, sector_t logical,
-				sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+				sector_t logical, sector_t phys, int len)
 {
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		ext4_error(inode->i_sb, "check_block_validity",
+		ext4_error(inode->i_sb, msg,
 			   "inode #%lu logical block %llu mapped to %llu "
 			   "(size %d)", inode->i_ino,
 			   (unsigned long long) logical,
 			   (unsigned long long) phys, len);
-		WARN_ON(1);
 		return -EIO;
 	}
 	return 0;
@@ -1182,8 +1181,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system corruption",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
@@ -1264,8 +1263,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system "
+					       "corruption after allocation",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
-- 
cgit v1.2.2


From acf7e2444acfaf4c8540603b76d71010eea3fc24 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 8 Sep 2009 18:00:30 +0100
Subject: GFS2: Be extra careful about deallocating inodes

There is a potential race in the inode deallocation code if two
nodes try to deallocate the same inode at the same time. Most of
the issue is solved by the iopen locking. There is still a small
window which is not covered by the iopen lock. This patches fixes
that and also makes the deallocation code more robust in the face of
any errors in the rgrp bitmaps, or erroneous iopen callbacks from
other nodes.

This does introduce one extra disk read, but that is generally not
an issue since its the same block that must be written to later
in the deallocation process. The total disk accesses therefore stay
the same,

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/export.c | 36 ++++--------------------------------
 fs/gfs2/rgrp.c   | 42 +++++++++++++++++++++++++++++++++++++++++-
 fs/gfs2/rgrp.h   |  4 ++--
 fs/gfs2/super.c  |  4 ++++
 4 files changed, 51 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
 }
 
 static struct dentry *gfs2_get_dentry(struct super_block *sb,
-		struct gfs2_inum_host *inum)
+				      struct gfs2_inum_host *inum)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_holder i_gh, ri_gh, rgd_gh;
-	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder i_gh;
 	struct inode *inode;
 	struct dentry *dentry;
 	int error;
 
-	/* System files? */
-
 	inode = gfs2_ilookup(sb, inum->no_addr);
 	if (inode) {
 		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	if (error)
 		return ERR_PTR(error);
 
-	error = gfs2_rindex_hold(sdp, &ri_gh);
+	error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
 	if (error)
 		goto fail;
 
-	error = -EINVAL;
-	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
-	if (!rgd)
-		goto fail_rindex;
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
-	if (error)
-		goto fail_rindex;
-
-	error = -ESTALE;
-	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
-		goto fail_rgd;
-
-	gfs2_glock_dq_uninit(&rgd_gh);
-	gfs2_glock_dq_uninit(&ri_gh);
-
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
-					inum->no_addr,
-					0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
@@ -224,13 +203,6 @@ out_inode:
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
-
-fail_rgd:
-	gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
-	gfs2_glock_dq_uninit(&ri_gh);
-
 fail:
 	gfs2_glock_dq_uninit(&i_gh);
 	return ERR_PTR(error);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 388a61d12fc8..caaa665d4033 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1256,7 +1256,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
  * Returns: The block type (GFS2_BLKST_*)
  */
 
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 {
 	struct gfs2_bitmap *bi = NULL;
 	u32 length, rgrp_block, buf_block;
@@ -1693,6 +1693,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
 }
 
+/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ *          -ESTALE if it doesn't match
+ *          or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh, rgd_gh;
+	int error;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto fail;
+
+	error = -EINVAL;
+	rgd = gfs2_blk2rgrpd(sdp, no_addr);
+	if (!rgd)
+		goto fail_rindex;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail_rindex;
+
+	if (gfs2_get_block_type(rgd, no_addr) != type)
+		error = -ESTALE;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+	gfs2_glock_dq_uninit(&ri_gh);
+fail:
+	return error;
+}
+
 /**
  * gfs2_rlist_add - add a RG to a list of RGs
  * @sdp: the filesystem
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a8dedd78245b..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,8 +44,6 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
@@ -53,6 +51,8 @@ extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+			       unsigned int type);
 
 struct gfs2_rgrp_list {
 	unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d95cf777d244..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1286,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
 		goto out;
 	}
 
+	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+	if (error)
+		goto out_truncate;
+
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
-- 
cgit v1.2.2


From e8e66ed25b5cbeebed69c475f6c108e52078a3b3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:05:33 -0700
Subject: Do not call 'ima_path_check()' for each path component

Not only is that a supremely timing-critical path, but it's hopefully
some day going to be lockless for the common case, and ima can't do
that.

Plus the integrity code doesn't even care about non-regular files, so it
was always a total waste of time and effort.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1f13751693a5..a005d8b7afad 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -856,9 +856,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		if (err == -EAGAIN)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
-		if (!err)
-			err = ima_path_check(&nd->path, MAY_EXEC,
-				             IMA_COUNT_UPDATE);
  		if (err)
 			break;
 
-- 
cgit v1.2.2


From b7a437b08a44a3ed7e3a052eb39d2c5f618b603b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:50:37 -0700
Subject: Simplify exec_permission_lite() logic

Instead of returning EAGAIN and having the caller do something
special for that case,  just do the special case directly.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a005d8b7afad..8c3580610eec 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -435,7 +435,7 @@ static int exec_permission_lite(struct inode *inode)
 	umode_t	mode = inode->i_mode;
 
 	if (inode->i_op->permission)
-		return -EAGAIN;
+		return inode_permission(inode, MAY_EXEC);
 
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
@@ -853,9 +853,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
-		if (err == -EAGAIN)
-			err = inode_permission(nd->path.dentry->d_inode,
-					       MAY_EXEC);
  		if (err)
 			break;
 
-- 
cgit v1.2.2


From f1ac9f6bfea6f21e8ab6dbbe46879d62a6fba8c0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:53:56 -0700
Subject: Simplify exec_permission_lite() further

This function is only called for path components that are already known
to be directories (they have a '->lookup' method).  So don't bother
doing that whole S_ISDIR() testing, the whole point of the 'lite()'
version is that we know that we are looking at a directory component,
and that we're only checking name lookup permission.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8c3580610eec..929f535fb225 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -445,13 +445,7 @@ static int exec_permission_lite(struct inode *inode)
 	if (mode & MAY_EXEC)
 		goto ok;
 
-	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
-		goto ok;
-
-	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
-		goto ok;
-
-	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
+	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
 		goto ok;
 
 	return -EACCES;
-- 
cgit v1.2.2


From cb9179ead0aa0e3b7b4087cdba59baf16bbeef6d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 11:08:31 -0700
Subject: Simplify exec_permission_lite(), part 3

Don't call down to the generic inode_permission() function just to
call the inode-specific permission function - just do it directly.

The generic inode_permission() code does things like checking MAY_WRITE
and devcgroup_inode_permission(), neither of which are relevant for the
light pathname walk permission checks (we always do just MAY_EXEC, and
the inode is never a special device).

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 929f535fb225..e645e3070360 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -434,8 +434,12 @@ static int exec_permission_lite(struct inode *inode)
 {
 	umode_t	mode = inode->i_mode;
 
-	if (inode->i_op->permission)
-		return inode_permission(inode, MAY_EXEC);
+	if (inode->i_op->permission) {
+		int ret = inode->i_op->permission(inode, MAY_EXEC);
+		if (!ret)
+			goto ok;
+		return ret;
+	}
 
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
-- 
cgit v1.2.2


From 5909ccaa300a4a834ffa275327af4df0b9cb5295 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 11:51:25 -0700
Subject: Make 'check_acl()' a first-class filesystem op

This is stage one in flattening out the callchains for the common
permission testing.  Rather than have most filesystem implement their
own inode->i_op->permission function that just calls back down to the
VFS layers 'generic_permission()' with the per-filesystem ACL checking
function, the filesystem can just expose its 'check_acl' function
directly, and let the VFS layer do everything for it.

This is all just preparatory - no filesystem actually enables this yet.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 62 +++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e645e3070360..ed27bb205b7e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
 EXPORT_SYMBOL(putname);
 #endif
 
-
-/**
- * generic_permission  -  check for access rights on a Posix-like filesystem
- * @inode:	inode to check access rights for
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl:	optional callback to check for Posix ACLs
- *
- * Used to check for read/write/execute permissions on a file.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things..
+/*
+ * This does basic POSIX ACL permission checking
  */
-int generic_permission(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask,
 		int (*check_acl)(struct inode *inode, int mask))
 {
 	umode_t			mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
 	else {
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
 			int error = check_acl(inode, mask);
-			if (error == -EACCES)
-				goto check_capabilities;
-			else if (error != -EAGAIN)
+			if (error != -EAGAIN)
 				return error;
 		}
 
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
 	 */
 	if ((mask & ~mode) == 0)
 		return 0;
+	return -EACCES;
+}
+
+/**
+ * generic_permission  -  check for access rights on a Posix-like filesystem
+ * @inode:	inode to check access rights for
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl:	optional callback to check for Posix ACLs
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+int generic_permission(struct inode *inode, int mask,
+		int (*check_acl)(struct inode *inode, int mask))
+{
+	int ret;
+
+	/*
+	 * Do the basic POSIX ACL permission checks.
+	 */
+	ret = acl_permission_check(inode, mask, check_acl);
+	if (ret != -EACCES)
+		return ret;
 
- check_capabilities:
 	/*
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
-		retval = generic_permission(inode, mask, NULL);
+		retval = generic_permission(inode, mask, inode->i_op->check_acl);
 
 	if (retval)
 		return retval;
@@ -432,27 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
  */
 static int exec_permission_lite(struct inode *inode)
 {
-	umode_t	mode = inode->i_mode;
+	int ret;
 
 	if (inode->i_op->permission) {
-		int ret = inode->i_op->permission(inode, MAY_EXEC);
+		ret = inode->i_op->permission(inode, MAY_EXEC);
 		if (!ret)
 			goto ok;
 		return ret;
 	}
-
-	if (current_fsuid() == inode->i_uid)
-		mode >>= 6;
-	else if (in_group_p(inode->i_gid))
-		mode >>= 3;
-
-	if (mode & MAY_EXEC)
+	ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+	if (!ret)
 		goto ok;
 
 	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
 		goto ok;
 
-	return -EACCES;
+	return ret;
 ok:
 	return security_inode_permission(inode, MAY_EXEC);
 }
-- 
cgit v1.2.2


From 1d5ccd1c422d7d292a9e45248aa36771900c6331 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 12:12:24 -0700
Subject: ext[234]: move over to 'check_acl' permission model

Don't implement per-filesystem 'extX_permission()' functions that have
to be called for every path component operation, and instead just expose
the actual ACL checking so that the VFS layer can now do it for us.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/acl.c   | 8 +-------
 fs/ext2/acl.h   | 4 ++--
 fs/ext2/file.c  | 2 +-
 fs/ext2/namei.c | 4 ++--
 fs/ext3/acl.c   | 8 +-------
 fs/ext3/acl.h   | 4 ++--
 fs/ext3/file.c  | 2 +-
 fs/ext3/namei.c | 4 ++--
 fs/ext4/acl.c   | 8 +-------
 fs/ext4/acl.h   | 4 ++--
 fs/ext4/file.c  | 2 +-
 fs/ext4/namei.c | 4 ++--
 12 files changed, 18 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return error;
 }
 
-static int
+int
 ext2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext2_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext2_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext2_new_inode.
  *
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext2_permission (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
 #else
 #include <linux/sched.h>
-#define ext2_permission NULL
+#define ext2_check_acl	NULL
 #define ext2_get_acl	NULL
 #define ext2_set_acl	NULL
 
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 78d9b925fc94..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -400,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
@@ -411,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 };
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext3_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext3_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext3_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext3_new_inode.
  *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT3_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
 
 static inline int
 ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..299253214789 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -137,7 +137,7 @@ const struct inode_operations ext3_file_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext4_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext4_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext4_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext4_new_inode.
  *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
 
 static inline int
 ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..27f3c5354c0e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 	.fallocate	= ext4_fallocate,
 	.fiemap		= ext4_fiemap,
 };
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..114abe5d2c1d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 	.fiemap         = ext4_fiemap,
 };
 
@@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 };
-- 
cgit v1.2.2


From 18f4c644773bc8de1fd9c5182b30c231aafb94ef Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 12:29:03 -0700
Subject: jffs2/jfs/xfs: switch over to 'check_acl' rather than 'permission()'

This avoids an indirect call in the VFS for each path component lookup.

Well, at least as long as you own the directory in question, and the ACL
check is unnecessary.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/acl.c              |  7 +------
 fs/jffs2/acl.h              |  4 ++--
 fs/jffs2/dir.c              |  2 +-
 fs/jffs2/file.c             |  2 +-
 fs/jffs2/symlink.c          |  2 +-
 fs/jfs/acl.c                |  7 +------
 fs/jfs/file.c               |  2 +-
 fs/jfs/jfs_acl.h            |  2 +-
 fs/jfs/namei.c              |  2 +-
 fs/xfs/linux-2.6/xfs_iops.c | 16 ++++------------
 10 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return rc;
 }
 
-static int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jffs2_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, jffs2_check_acl);
-}
-
 int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 {
 	struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-extern int jffs2_permission(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 
 #else
 
-#define jffs2_permission			(NULL)
+#define jffs2_check_acl				(NULL)
 #define jffs2_acl_chmod(inode)			(0)
 #define jffs2_init_acl_pre(dir_i,inode,mode)	(0)
 #define jffs2_init_acl_post(inode)		(0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
 	.rmdir =	jffs2_rmdir,
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
 
 const struct inode_operations jffs2_file_inode_operations =
 {
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
 	.follow_link =	jffs2_follow_link,
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
 	return rc;
 }
 
-static int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
 
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jfs_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, jfs_check_acl);
-}
-
 int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 {
 	struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
 	.removexattr	= jfs_removexattr,
 #ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
-	.permission	= jfs_permission,
+	.check_acl	= jfs_check_acl,
 #endif
 };
 
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-int jfs_permission(struct inode *, int);
+int jfs_check_acl(struct inode *, int);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 	.removexattr	= jfs_removexattr,
 #ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
-	.permission	= jfs_permission,
+	.check_acl	= jfs_check_acl,
 #endif
 };
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..6c32f1d63d8c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -484,14 +484,6 @@ xfs_vn_put_link(
 		kfree(s);
 }
 
-STATIC int
-xfs_vn_permission(
-	struct inode		*inode,
-	int			mask)
-{
-	return generic_permission(inode, mask, xfs_check_acl);
-}
-
 STATIC int
 xfs_vn_getattr(
 	struct vfsmount		*mnt,
@@ -696,7 +688,7 @@ xfs_vn_fiemap(
 }
 
 static const struct inode_operations xfs_inode_operations = {
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
@@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
 	.readlink		= generic_readlink,
 	.follow_link		= xfs_vn_follow_link,
 	.put_link		= xfs_vn_put_link,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
-- 
cgit v1.2.2


From dbab8360ed8abca38121109feab47c4bea895994 Mon Sep 17 00:00:00 2001
From: Harshula Jayasuriya <harshula@redhat.com>
Date: Tue, 8 Sep 2009 19:49:33 -0400
Subject: NFS: out of date comment regarding O_EXCL above nfs3_proc_create()

Hi Trond,

Recently we were observing the behaviour difference between a 2.4.x and
2.6.x kernel with respect to O_EXCL. A comment from 2.4.x era, "For now,
we don't implement O_EXCL." seems inaccurate in TOT.

If so, here's a patch to remove the comment.

This patch is against:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

Signed-off-by: Harshula Jayasuriya <harshula@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..ee6a13f05443 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
 
 /*
  * Create a regular file.
- * For now, we don't implement O_EXCL.
  */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-- 
cgit v1.2.2


From 4cfd74fc99a41fdc161f243e1c16199656d33ab1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:49:47 -0400
Subject: NFS: Mount option parser should detect missing "port="

The meaning of not specifying the "port=" mount option is different
for "-t nfs" and "-t nfs4" mounts.  The default port value for
NFSv2/v3 mounts is 0, but the default for NFSv4 mounts is 2049.

To support "-t nfs -o vers=4", the mount option parser must detect
when "port=" is missing so that the correct default port value can be
set depending on which NFS version is requested.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  9 +++++++--
 fs/nfs/super.c    | 55 +++++++++++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2e485677019c..5e686a4d744e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -48,6 +48,11 @@ struct nfs_clone_mount {
  */
 #define NFS_MAX_SECFLAVORS	(12)
 
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT		(-1)
+
 /*
  * In-kernel mount arguments
  */
@@ -71,7 +76,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		u32			version;
-		unsigned short		port;
+		int			port;
 		unsigned short		protocol;
 	} mount_server;
 
@@ -80,7 +85,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		char			*export_path;
-		unsigned short		port;
+		int			port;
 		unsigned short		protocol;
 	} nfs_server;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f3a95df4b95f..05544336f7de 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -746,6 +746,21 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 	return 0;
 }
 
+/*
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
+ */
+static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+				 const unsigned short default_port)
+{
+	unsigned short port = default_port;
+
+	if (parsed_port != NFS_UNSPEC_PORT)
+		port = parsed_port;
+
+	rpc_set_port(sap, port);
+}
+
 /*
  * Sanity check the NFS transport protocol.
  *
@@ -1415,11 +1430,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		args->mount_server.addrlen = args->nfs_server.addrlen;
 	}
 	request.salen = args->mount_server.addrlen;
-
-	/*
-	 * autobind will be used if mount_server.port == 0
-	 */
-	rpc_set_port(request.sap, args->mount_server.port);
+	nfs_set_default_port(request.sap, args->mount_server.port, 0);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1597,6 +1608,7 @@ static int nfs_validate_mount_data(void *options,
 				   const char *dev_name)
 {
 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
 
 	if (data == NULL)
 		goto out_no_data;
@@ -1608,8 +1620,8 @@ static int nfs_validate_mount_data(void *options,
 	args->acregmax		= NFS_DEF_ACREGMAX;
 	args->acdirmin		= NFS_DEF_ACDIRMIN;
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->mount_server.port	= 0;	/* autobind unless user sets port */
-	args->nfs_server.port	= 0;	/* autobind unless user sets port */
+	args->mount_server.port	= NFS_UNSPEC_PORT;
+	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
@@ -1657,11 +1669,9 @@ static int nfs_validate_mount_data(void *options,
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
 
-		memcpy(&args->nfs_server.address, &data->addr,
-		       sizeof(data->addr));
+		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (!(data->flags & NFS_MOUNT_TCP))
@@ -1709,12 +1719,10 @@ static int nfs_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
-		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
-				args->nfs_server.port);
+		nfs_set_default_port(sap, args->nfs_server.port, 0);
 
 		nfs_set_mount_transport_protocol(args);
 
@@ -2261,7 +2269,7 @@ static int nfs4_validate_mount_data(void *options,
 				    struct nfs_parsed_mount_data *args,
 				    const char *dev_name)
 {
-	struct sockaddr_in *ap;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
 	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
@@ -2274,23 +2282,21 @@ static int nfs4_validate_mount_data(void *options,
 	args->acregmax		= NFS_DEF_ACREGMAX;
 	args->acdirmin		= NFS_DEF_ACDIRMIN;
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->nfs_server.port	= NFS_PORT; /* 2049 unless user set port= */
+	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
 	args->minorversion	= 0;
 
 	switch (data->version) {
 	case 1:
-		ap = (struct sockaddr_in *)&args->nfs_server.address;
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
 			goto out_no_address;
 		if (data->host_addrlen == 0)
 			goto out_no_address;
 		args->nfs_server.addrlen = data->host_addrlen;
-		if (copy_from_user(ap, data->host_addr, data->host_addrlen))
+		if (copy_from_user(sap, data->host_addr, data->host_addrlen))
 			return -EFAULT;
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (data->auth_flavourlen) {
@@ -2342,12 +2348,9 @@ static int nfs4_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			return -EINVAL;
-
-		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
-				args->nfs_server.port);
+		nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
 
 		nfs_validate_transport_protocol(args);
 
-- 
cgit v1.2.2


From 7630c852e19c7fffb85b50d98eeb5516fec7c088 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:49:57 -0400
Subject: NFS: Refactor NFSv4 text-based mount option validation

Clean up: Refactor the part of nfs4_validate_mount_options() that
handles text-based options, so we can call it from the NFSv2/v3
option validation function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 63 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 05544336f7de..599e8c5fe6ce 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -272,6 +272,8 @@ static const struct super_operations nfs_sops = {
 };
 
 #ifdef CONFIG_NFS_V4
+static int nfs4_validate_text_mount_data(void *options,
+	struct nfs_parsed_mount_data *args, const char *dev_name);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -2262,6 +2264,37 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
 }
 
+static int nfs4_validate_text_mount_data(void *options,
+					 struct nfs_parsed_mount_data *args,
+					 const char *dev_name)
+{
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+	nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+
+	nfs_validate_transport_protocol(args);
+
+	nfs4_validate_mount_flags(args);
+
+	if (args->auth_flavor_len > 1) {
+		dfprintk(MOUNT,
+			 "NFS4: Too many RPC auth flavours specified\n");
+		return -EINVAL;
+	}
+
+	if (args->client_address == NULL) {
+		dfprintk(MOUNT,
+			 "NFS4: mount program didn't pass callback address\n");
+		return -EINVAL;
+	}
+
+	return nfs_parse_devname(dev_name,
+				   &args->nfs_server.hostname,
+				   NFS4_MAXNAMLEN,
+				   &args->nfs_server.export_path,
+				   NFS4_MAXPATHLEN);
+}
+
 /*
  * Validate NFSv4 mount options
  */
@@ -2342,36 +2375,14 @@ static int nfs4_validate_mount_data(void *options,
 		nfs_validate_transport_protocol(args);
 
 		break;
-	default: {
-		int status;
-
+	default:
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
 		if (!nfs_verify_server_address(sap))
 			return -EINVAL;
-		nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
-
-		nfs_validate_transport_protocol(args);
-
-		nfs4_validate_mount_flags(args);
-
-		if (args->auth_flavor_len > 1)
-			goto out_inval_auth;
-
-		if (args->client_address == NULL)
-			goto out_no_client_address;
-
-		status = nfs_parse_devname(dev_name,
-					   &args->nfs_server.hostname,
-					   NFS4_MAXNAMLEN,
-					   &args->nfs_server.export_path,
-					   NFS4_MAXPATHLEN);
-		if (status < 0)
-			return status;
 
-		break;
-		}
+		return nfs4_validate_text_mount_data(options, args, dev_name);
 	}
 
 	return 0;
@@ -2388,10 +2399,6 @@ out_inval_auth:
 out_no_address:
 	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_no_client_address:
-	dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
-	return -EINVAL;
 }
 
 /*
-- 
cgit v1.2.2


From a6fe23be90aa78783523a25330e09bfaa43a1581 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:50:00 -0400
Subject: NFS: Move details of nfs4_get_sb() to a helper

Clean up: Refactor nfs4_get_sb() to allow its guts to be invoked by
nfs_get_sb().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 44 +++++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 599e8c5fe6ce..c105e12197c2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -274,6 +274,8 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_validate_text_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
+static int nfs4_try_mount(int flags, const char *dev_name,
+	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -2565,6 +2567,34 @@ out_err:
 	return ret;
 }
 
+static int nfs4_try_mount(int flags, const char *dev_name,
+			 struct nfs_parsed_mount_data *data,
+			 struct vfsmount *mnt)
+{
+	char *export_path;
+	struct vfsmount *root_mnt;
+	int error;
+
+	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+
+	export_path = data->nfs_server.export_path;
+	data->nfs_server.export_path = "/";
+	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+			data->nfs_server.hostname);
+	data->nfs_server.export_path = export_path;
+
+	error = PTR_ERR(root_mnt);
+	if (IS_ERR(root_mnt))
+		goto out;
+
+	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+
+out:
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
+			error != 0 ? " [error]" : "");
+	return error;
+}
+
 /*
  * Get the superblock for an NFS4 mountpoint
  */
@@ -2572,8 +2602,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	struct nfs_parsed_mount_data *data;
-	char *export_path;
-	struct vfsmount *root_mnt;
 	int error = -ENOMEM;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2585,17 +2613,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	if (error < 0)
 		goto out;
 
-	export_path = data->nfs_server.export_path;
-	data->nfs_server.export_path = "/";
-	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
-			data->nfs_server.hostname);
-	data->nfs_server.export_path = export_path;
-
-	error = PTR_ERR(root_mnt);
-	if (IS_ERR(root_mnt))
-		goto out;
-
-	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+	error = nfs4_try_mount(flags, dev_name, data, mnt);
 
 out:
 	kfree(data->client_address);
-- 
cgit v1.2.2


From 764302ccb88dd0df062eccd507b6c6de24f1c560 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:50:03 -0400
Subject: NFS: Allow the "nfs" file system type to support NFSv4

When mounting an "nfs" type file system, recognize "v4," "vers=4," or
"nfsvers=4" mount options, and convert the file system to "nfs4" under
the covers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[trondmy: fixed up binary mount code so it sets the 'version' field too]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  1 +
 fs/nfs/super.c    | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e686a4d744e..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -68,6 +68,7 @@ struct nfs_parsed_mount_data {
 	unsigned int		auth_flavor_len;
 	rpc_authflavor_t	auth_flavors[1];
 	char			*client_address;
+	unsigned int		version;
 	unsigned int		minorversion;
 	char			*fscache_uniq;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c105e12197c2..34b1ccf51adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
 	Opt_cto, Opt_nocto,
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
-	Opt_v2, Opt_v3,
+	Opt_v2, Opt_v3, Opt_v4,
 	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_nolock, "nolock" },
 	{ Opt_v2, "v2" },
 	{ Opt_v3, "v3" },
+	{ Opt_v4, "v4" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
 	{ Opt_rdma, "rdma" },
@@ -934,10 +935,18 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_v2:
 			mnt->flags &= ~NFS_MOUNT_VER3;
+			mnt->version = 2;
 			break;
 		case Opt_v3:
 			mnt->flags |= NFS_MOUNT_VER3;
+			mnt->version = 3;
 			break;
+#ifdef CONFIG_NFS_V4
+		case Opt_v4:
+			mnt->flags &= ~NFS_MOUNT_VER3;
+			mnt->version = 4;
+			break;
+#endif
 		case Opt_udp:
 			mnt->flags &= ~NFS_MOUNT_TCP;
 			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1151,10 +1160,18 @@ static int nfs_parse_mount_options(char *raw,
 			switch (option) {
 			case NFS2_VERSION:
 				mnt->flags &= ~NFS_MOUNT_VER3;
+				mnt->version = 2;
 				break;
 			case NFS3_VERSION:
 				mnt->flags |= NFS_MOUNT_VER3;
+				mnt->version = 3;
+				break;
+#ifdef CONFIG_NFS_V4
+			case NFS4_VERSION:
+				mnt->flags &= ~NFS_MOUNT_VER3;
+				mnt->version = 4;
 				break;
+#endif
 			default:
 				goto out_invalid_value;
 			}
@@ -1629,6 +1646,7 @@ static int nfs_validate_mount_data(void *options,
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
+	args->minorversion	= 0;
 
 	switch (data->version) {
 	case 1:
@@ -1650,8 +1668,11 @@ static int nfs_validate_mount_data(void *options,
 			if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
 				goto out_invalid_fh;
 			mntfh->size = data->root.size;
-		} else
+			args->version = 3;
+		} else {
 			mntfh->size = NFS2_FHSIZE;
+			args->version = 2;
+		}
 
 
 		memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1726,6 +1747,14 @@ static int nfs_validate_mount_data(void *options,
 		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
+		if (args->version == 4)
+#ifdef CONFIG_NFS_V4
+			return nfs4_validate_text_mount_data(options,
+							     args, dev_name);
+#else
+			goto out_v4_not_compiled;
+#endif
+
 		nfs_set_default_port(sap, args->nfs_server.port, 0);
 
 		nfs_set_mount_transport_protocol(args);
@@ -1774,6 +1803,12 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+	return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
@@ -2069,6 +2104,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (error < 0)
 		goto out;
 
+#ifdef CONFIG_NFS_V4
+	if (data->version == 4) {
+		error = nfs4_try_mount(flags, dev_name, data, mnt);
+		kfree(data->client_address);
+		goto out;
+	}
+#endif	/* CONFIG_NFS_V4 */
+
 	/* Get a volume representation */
 	server = nfs_create_server(data, mntfh);
 	if (IS_ERR(server)) {
@@ -2320,6 +2363,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
+	args->version		= 4;
 	args->minorversion	= 0;
 
 	switch (data->version) {
-- 
cgit v1.2.2


From 2ecda72b49a0849ce41e7fa1fa974a245b9119f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 8 Sep 2009 19:50:07 -0400
Subject: NFSv4: Disallow 'mount -t nfs4 -overs=2' and 'mount -t nfs4 -overs=3'

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 34b1ccf51adf..867f70504531 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2321,6 +2321,12 @@ static int nfs4_validate_text_mount_data(void *options,
 
 	nfs4_validate_mount_flags(args);
 
+	if (args->version != 4) {
+		dfprintk(MOUNT,
+			 "NFS4: Illegal mount version\n");
+		return -EINVAL;
+	}
+
 	if (args->auth_flavor_len > 1) {
 		dfprintk(MOUNT,
 			 "NFS4: Too many RPC auth flavours specified\n");
-- 
cgit v1.2.2


From 2b88f7c535a8125213def012a67c1b0a667ceb2e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 9 Sep 2009 15:59:35 +0100
Subject: GFS2: Remove unused sysfs file

The /sys/fs/gfs2/<fsname>/lock_module/id file has been unused for
some time now, so we can remove it. We still accept the mount option
though, as userspace still sends that.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     | 1 -
 fs/gfs2/ops_fstype.c | 6 +-----
 fs/gfs2/sys.c        | 8 --------
 3 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 52436abc3b4d..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -496,7 +496,6 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	u32 ls_id;
 	unsigned int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8e3f00e1430d..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1028,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 
 	ls->ls_ops = lm;
 	ls->ls_first = 1;
-	ls->ls_id = 0;
 
 	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
 		substring_t tmp[MAX_OPT_ARGS];
@@ -1046,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ls->ls_jid = option;
 			break;
 		case Opt_id:
-			ret = match_int(&tmp[0], &option);
-			if (ret)
-				goto hostdata_error;
-			ls->ls_id = option;
+			/* Obsolete, but left for backward compat purposes */
 			break;
 		case Opt_first:
 			ret = match_int(&tmp[0], &option);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0d4f7e974e9f..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -320,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	return ret;
 }
 
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%u\n", ls->ls_id);
-}
-
 static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -390,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
 GDLM_ATTR(block,		0644, block_show,		block_store);
 GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
-GDLM_ATTR(id,			0444, lkid_show,		NULL);
 GDLM_ATTR(jid,			0444, jid_show,			NULL);
 GDLM_ATTR(first,		0444, lkfirst_show,		NULL);
 GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
@@ -402,7 +395,6 @@ static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
-	&gdlm_attr_id.attr,
 	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
-- 
cgit v1.2.2


From 4734d401d43c6469d568caf223d37aa0fc1bf4dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 9 Sep 2009 18:19:02 -0500
Subject: xfs: use correct log reservation when handling ENOSPC in xfs_create

We added the ENOSPC handling patch in xfs_create just after it got mered
with xfs_mkdir.  Change the log reservation to the variable for either
the create or mkdir value so it does the right thing if get here for creating
a directory.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 03d3100559ac..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1471,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
-- 
cgit v1.2.2


From b1ab7e4b2a88d3ac13771463be8f302ce1616cfc Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Thu, 3 Sep 2009 14:25:56 -0400
Subject: VFS: Factor out part of vfs_setxattr so it can be called from the
 SELinux hook for inode_setsecctx.

This factors out the part of the vfs_setxattr function that performs the
setting of the xattr and its notification. This is needed so the SELinux
implementation of inode_setsecctx can handle the setting of the xattr while
maintaining the proper separation of layers.

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/xattr.c | 55 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	return inode_permission(inode, mask);
 }
 
-int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
+/**
+ *  __vfs_setxattr_noperm - perform setxattr operation without performing
+ *  permission checks.
+ *
+ *  @dentry - object to perform setxattr on
+ *  @name - xattr name to set
+ *  @value - value to set @name to
+ *  @size - size of @value
+ *  @flags - flags to pass into filesystem operations
+ *
+ *  returns the result of the internal setxattr or setsecurity operations.
+ *
+ *  This function requires the caller to lock the inode's i_mutex before it
+ *  is executed. It also assumes that the caller will make the appropriate
+ *  permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	int error;
-
-	error = xattr_permission(inode, name, MAY_WRITE);
-	if (error)
-		return error;
+	int error = -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
-	error = security_inode_setxattr(dentry, name, value, size, flags);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
 	if (inode->i_op->setxattr) {
 		error = inode->i_op->setxattr(dentry, name, value, size, flags);
 		if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		if (!error)
 			fsnotify_xattr(dentry);
 	}
+
+	return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_setxattr(dentry, name, value, size, flags);
+	if (error)
+		goto out;
+
+	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
 out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
-- 
cgit v1.2.2


From ddd29ec6597125c830f7badb608a86c98b936b64 Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Wed, 9 Sep 2009 14:25:37 -0400
Subject: sysfs: Add labeling support for sysfs

This patch adds a setxattr handler to the file, directory, and symlink
inode_operations structures for sysfs. The patch uses hooks introduced in the
previous patch to handle the getting and setting of security information for
the sysfs inodes. As was suggested by Eric Biederman the struct iattr in the
sysfs_dirent structure has been replaced by a structure which contains the
iattr, secdata and secdata length to allow the changes to persist in the event
that the inode representing the sysfs_dirent is evicted. Because sysfs only
stores this information when a change is made all the optional data is moved
into one dynamically allocated field.

This patch addresses an issue where SELinux was denying virtd access to the PCI
configuration entries in sysfs. The lack of setxattr handlers for sysfs
required that a single label be assigned to all entries in sysfs. Granting virtd
access to every entry in sysfs is not an acceptable solution so fine grained
labeling of sysfs is required such that individual entries can be labeled
appropriately.

[sds:  Fixed compile-time warnings, coding style, and setting of inode security init flags.]

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Signed-off-by: Stephen D. Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/sysfs/dir.c     |   1 +
 fs/sysfs/inode.c   | 134 +++++++++++++++++++++++++++++++++++++++--------------
 fs/sysfs/symlink.c |   2 +
 fs/sysfs/sysfs.h   |  12 ++++-
 4 files changed, 112 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 const struct inode_operations sysfs_dir_inode_operations = {
 	.lookup		= sysfs_lookup,
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..2b6a8d9de73d 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
@@ -35,6 +37,7 @@ static struct backing_dev_info sysfs_backing_dev_info = {
 
 static const struct inode_operations sysfs_inode_operations ={
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 int __init sysfs_inode_init(void)
@@ -42,18 +45,37 @@ int __init sysfs_inode_init(void)
 	return bdi_init(&sysfs_backing_dev_info);
 }
 
+struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+{
+	struct sysfs_inode_attrs *attrs;
+	struct iattr *iattrs;
+
+	attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+	iattrs = &attrs->ia_iattr;
+
+	/* assign default attributes */
+	iattrs->ia_mode = sd->s_mode;
+	iattrs->ia_uid = 0;
+	iattrs->ia_gid = 0;
+	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+	return attrs;
+}
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
 	struct sysfs_dirent * sd = dentry->d_fsdata;
-	struct iattr * sd_iattr;
+	struct sysfs_inode_attrs *sd_attrs;
+	struct iattr *iattrs;
 	unsigned int ia_valid = iattr->ia_valid;
 	int error;
 
 	if (!sd)
 		return -EINVAL;
 
-	sd_iattr = sd->s_iattr;
+	sd_attrs = sd->s_iattr;
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
@@ -65,42 +87,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	if (error)
 		return error;
 
-	if (!sd_iattr) {
+	if (!sd_attrs) {
 		/* setting attributes for the first time, allocate now */
-		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
-		if (!sd_iattr)
+		sd_attrs = sysfs_init_inode_attrs(sd);
+		if (!sd_attrs)
 			return -ENOMEM;
-		/* assign default attributes */
-		sd_iattr->ia_mode = sd->s_mode;
-		sd_iattr->ia_uid = 0;
-		sd_iattr->ia_gid = 0;
-		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
-		sd->s_iattr = sd_iattr;
+		sd->s_iattr = sd_attrs;
+	} else {
+		/* attributes were changed at least once in past */
+		iattrs = &sd_attrs->ia_iattr;
+
+		if (ia_valid & ATTR_UID)
+			iattrs->ia_uid = iattr->ia_uid;
+		if (ia_valid & ATTR_GID)
+			iattrs->ia_gid = iattr->ia_gid;
+		if (ia_valid & ATTR_ATIME)
+			iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MTIME)
+			iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_CTIME)
+			iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MODE) {
+			umode_t mode = iattr->ia_mode;
+
+			if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+				mode &= ~S_ISGID;
+			iattrs->ia_mode = sd->s_mode = mode;
+		}
 	}
+	return error;
+}
 
-	/* attributes were changed atleast once in past */
-
-	if (ia_valid & ATTR_UID)
-		sd_iattr->ia_uid = iattr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		sd_iattr->ia_gid = iattr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			mode &= ~S_ISGID;
-		sd_iattr->ia_mode = sd->s_mode = mode;
-	}
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct sysfs_inode_attrs *iattrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	if (!sd)
+		return -EINVAL;
+	if (!sd->s_iattr)
+		sd->s_iattr = sysfs_init_inode_attrs(sd);
+	if (!sd->s_iattr)
+		return -ENOMEM;
+
+	iattrs = sd->s_iattr;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(dentry->d_inode, suffix,
+						value, size, flags);
+		if (error)
+			goto out;
+		error = security_inode_getsecctx(dentry->d_inode,
+						&secdata, &secdata_len);
+		if (error)
+			goto out;
+		if (iattrs->ia_secdata)
+			security_release_secctx(iattrs->ia_secdata,
+						iattrs->ia_secdata_len);
+		iattrs->ia_secdata = secdata;
+		iattrs->ia_secdata_len = secdata_len;
 
+	} else
+		return -EINVAL;
+out:
 	return error;
 }
 
@@ -146,6 +203,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
+	struct sysfs_inode_attrs *iattrs;
 
 	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +212,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 	inode->i_ino = sd->s_ino;
 	lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
 
-	if (sd->s_iattr) {
+	iattrs = sd->s_iattr;
+	if (iattrs) {
 		/* sysfs_dirent has non-default attributes
 		 * get them for the new inode from persistent copy
 		 * in sysfs_dirent
 		 */
-		set_inode_attr(inode, sd->s_iattr);
+		set_inode_attr(inode, &iattrs->ia_iattr);
+		if (iattrs->ia_secdata)
+			security_inode_notifysecctx(inode,
+						iattrs->ia_secdata,
+						iattrs->ia_secdata_len);
 	} else
 		set_default_inode_attr(inode, sd->s_mode);
 
-
 	/* initialize inode according to type */
 	switch (sysfs_type(sd)) {
 	case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <linux/mutex.h>
+#include <linux/security.h>
 
 #include "sysfs.h"
 
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 }
 
 const struct inode_operations sysfs_symlink_inode_operations = {
+	.setxattr = sysfs_setxattr,
 	.readlink = generic_readlink,
 	.follow_link = sysfs_follow_link,
 	.put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
  * This file is released under the GPLv2.
  */
 
+#include <linux/fs.h>
+
 struct sysfs_open_dirent;
 
 /* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
 	struct hlist_head	buffers;
 };
 
+struct sysfs_inode_attrs {
+	struct iattr	ia_iattr;
+	void		*ia_secdata;
+	u32		ia_secdata_len;
+};
+
 /*
  * sysfs_dirent - the building block of sysfs hierarchy.  Each and
  * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
 	unsigned int		s_flags;
 	ino_t			s_ino;
 	umode_t			s_mode;
-	struct iattr		*s_iattr;
+	struct sysfs_inode_attrs *s_iattr;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
 
-- 
cgit v1.2.2


From c7acb4c16646943180bd221c167a077e0a084f9c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 9 Sep 2009 21:32:41 -0400
Subject: ext4: Use bforget() in no journal mode for
 ext4_journal_{forget,revoke}()

When ext4 is using a journal, a metadata block which is deallocated
must be passed into the journal layer so it can be dropped from the
current transaction and/or revoked.  This is done by calling the
functions ext4_journal_forget() and ext4_journal_revoke(), which call
jbd2_journal_forget(), and jbd2_journal_revoke(), respectively.

Since the jbd2_journal_forget() and jbd2_journal_revoke() call
bforget(), if ext4 is not using a journal, ext4_journal_forget() and
ext4_journal_revoke() must call bforget() to avoid a dirty metadata
block overwriting a block after it has been reallocated and reused for
another inode's data block.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..ecb9ca455fd5 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }
 
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }
 
-- 
cgit v1.2.2


From fe188c0e084bdf3038dc0ac963c21d764f53f7da Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 12 Sep 2009 13:41:55 -0400
Subject: ext4: Assure that metadata blocks are written during fsync in no
 journal mode

When there is no journal present, we must attach buffer heads
associated with extent tree and indirect blocks to the inode's
mapping->private_list via mark_buffer_dirty_inode() so that
ext4_sync_file() --- which is called to service fsync() and
fdatasync() system calls --- can write out the inode's metadata blocks
by calling sync_mapping_buffers().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 5 ++++-
 fs/ext4/fsync.c     | 9 +++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ecb9ca455fd5..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	} else {
-		mark_buffer_dirty(bh);
+		if (inode && bh)
+			mark_buffer_dirty_inode(bh, inode);
+		else
+			mark_buffer_dirty(bh);
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ab418c0f502d..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int err, ret = 0;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
+	if (!journal)
+		ret = sync_mapping_buffers(inode->i_mapping);
+
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
 
@@ -91,7 +94,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
-		ret = sync_inode(inode, &wbc);
+		err = sync_inode(inode, &wbc);
+		if (ret == 0)
+			ret = err;
 	}
 out:
 	if (journal && (journal->j_flags & JBD2_BARRIER))
-- 
cgit v1.2.2


From 91ac6f43317c0bf99969665f98016548011dfa38 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Wed, 9 Sep 2009 22:33:47 -0400
Subject: ext4: Make non-journal fsync work properly

Teach ext4_write_inode() and ext4_do_update_inode() about non-journal
mode:  If we're not using a journal, ext4_write_inode() now calls
ext4_do_update_inode() (after getting the iloc via ext4_get_inode_loc())
with a new "do_sync" parameter.  If that parameter is nonzero _and_ we're
not using a journal, ext4_do_update_inode() calls sync_dirty_buffer()
instead of ext4_handle_dirty_metadata().

This problem was found in power-fail testing, checking the amount of
loss of files and blocks after a power failure when using fsync() and
when not using fsync().  It turned out that using fsync() was actually
worse than not doing so, possibly because it increased the likelihood
that the inodes would remain unflushed and would therefore be lost at
the power failure.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6253ecdac67f..d04c8428bde2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4538,7 +4538,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
  */
 static int ext4_do_update_inode(handle_t *handle,
 				struct inode *inode,
-				struct ext4_iloc *iloc)
+				struct ext4_iloc *iloc,
+				int do_sync)
 {
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4640,10 +4641,22 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 
-	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, inode, bh);
-	if (!err)
-		err = rc;
+	/*
+	 * If we're not using a journal and we were called from
+	 * ext4_write_inode() to sync the inode (making do_sync true),
+	 * we can just use sync_dirty_buffer() directly to do our dirty
+	 * work.  Testing s_journal here is a bit redundant but it's
+	 * worth it to avoid potential future trouble.
+	 */
+	if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+		BUFFER_TRACE(bh, "call sync_dirty_buffer");
+		sync_dirty_buffer(bh);
+	} else {
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		rc = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (!err)
+			err = rc;
+	}
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
@@ -4689,19 +4702,32 @@ out_brelse:
  */
 int ext4_write_inode(struct inode *inode, int wait)
 {
+	int err;
+
 	if (current->flags & PF_MEMALLOC)
 		return 0;
 
-	if (ext4_journal_current_handle()) {
-		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
-		dump_stack();
-		return -EIO;
-	}
+	if (EXT4_SB(inode->i_sb)->s_journal) {
+		if (ext4_journal_current_handle()) {
+			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+			dump_stack();
+			return -EIO;
+		}
 
-	if (!wait)
-		return 0;
+		if (!wait)
+			return 0;
+
+		err = ext4_force_commit(inode->i_sb);
+	} else {
+		struct ext4_iloc iloc;
 
-	return ext4_force_commit(inode->i_sb);
+		err = ext4_get_inode_loc(inode, &iloc);
+		if (err)
+			return err;
+		err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+					   inode, &iloc, wait);
+	}
+	return err;
 }
 
 /*
@@ -4995,7 +5021,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 	get_bh(iloc->bh);
 
 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
-	err = ext4_do_update_inode(handle, inode, iloc);
+	err = ext4_do_update_inode(handle, inode, iloc, 0);
 	put_bh(iloc->bh);
 	return err;
 }
-- 
cgit v1.2.2


From 752015d1b0683a8c623ebfe4c62893413e9b30d3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Sep 2009 19:49:40 -0700
Subject: binfmt_elf: fix PT_INTERP bss handling

In fs/binfmt_elf.c, load_elf_interp() calls padzero() for .bss even if
the PT_LOAD has no PROT_WRITE and no .bss.  This generates EFAULT.

Here is a small test case.  (Yes, there are other, useful PT_INTERP
which have only .text and no .data/.bss.)

	----- ptinterp.S
	_start: .globl _start
		 nop
		 int3
	-----
	$ gcc -m32 -nostartfiles -nostdlib -o ptinterp ptinterp.S
	$ gcc -m32 -Wl,--dynamic-linker=ptinterp -o hello hello.c
	$ ./hello
	Segmentation fault  # during execve() itself

	After applying the patch:
	$ ./hello
	Trace trap  # user-mode execution after execve() finishes

If the ELF headers are actually self-inconsistent, then dying is fine.
But having no PROT_WRITE segment is perfectly normal and correct if
there is no segment with p_memsz > p_filesz (i.e. bss).  John Reiser
suggested checking for PROT_WRITE in the bss logic.  I think it makes
most sense to simply apply the bss logic only when there is bss.

This patch looks less trivial than it is due to some reindentation.
It just moves the "if (last_bss > elf_bss) {" test up to include the
partial-page bss logic as well as the more-pages bss logic.

Reported-by: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the 
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
-- 
cgit v1.2.2


From b6a758ec3af3ec236dbfdcf6a06b84ac8f94957e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:47:46 -0400
Subject: ext4: move ext4_mb_init_group() function earlier in the mballoc.c

This moves the function around so that it can be called from
ext4_mb_load_buddy().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 182 +++++++++++++++++++++++++++---------------------------
 1 file changed, 91 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5ef6daf0cdc6..fed5ac699141 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -910,6 +910,97 @@ out:
 	return err;
 }
 
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+	int ret = 0;
+	void *bitmap;
+	int blocks_per_page;
+	int block, pnum, poff;
+	int num_grp_locked = 0;
+	struct ext4_group_info *this_grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	struct page *page = NULL, *bitmap_page = NULL;
+
+	mb_debug(1, "init group %u\n", group);
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	this_grp = ext4_get_group_info(sb, group);
+	/*
+	 * This ensures we don't add group
+	 * to this buddy cache via resize
+	 */
+	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+		/*
+		 * somebody initialized the group
+		 * return without doing anything
+		 */
+		ret = 0;
+		goto err;
+	}
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, NULL);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+	bitmap_page = page;
+	bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	/* init buddy cache */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page == bitmap_page) {
+		/*
+		 * If both the bitmap and buddy are in
+		 * the same page we don't need to force
+		 * init the buddy
+		 */
+		unlock_page(page);
+	} else if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, bitmap);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+err:
+	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+	if (bitmap_page)
+		page_cache_release(bitmap_page);
+	if (page)
+		page_cache_release(page);
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1839,97 +1930,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
 
 }
 
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
-	int ret;
-	void *bitmap;
-	int blocks_per_page;
-	int block, pnum, poff;
-	int num_grp_locked = 0;
-	struct ext4_group_info *this_grp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	struct page *page = NULL, *bitmap_page = NULL;
-
-	mb_debug(1, "init group %u\n", group);
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	this_grp = ext4_get_group_info(sb, group);
-	/*
-	 * This ensures we don't add group
-	 * to this buddy cache via resize
-	 */
-	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
-		/*
-		 * somebody initialized the group
-		 * return without doing anything
-		 */
-		ret = 0;
-		goto err;
-	}
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, NULL);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
-		ret = -EIO;
-		goto err;
-	}
-	mark_page_accessed(page);
-	bitmap_page = page;
-	bitmap = page_address(page) + (poff * sb->s_blocksize);
-
-	/* init buddy cache */
-	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page == bitmap_page) {
-		/*
-		 * If both the bitmap and buddy are in
-		 * the same page we don't need to force
-		 * init the buddy
-		 */
-		unlock_page(page);
-	} else if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, bitmap);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
-		ret = -EIO;
-		goto err;
-	}
-	mark_page_accessed(page);
-err:
-	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-	if (bitmap_page)
-		page_cache_release(bitmap_page);
-	if (page)
-		page_cache_release(page);
-	return ret;
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-- 
cgit v1.2.2


From f41c0750538667b87a19c93952e5d42fcc069bd7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:34:50 -0400
Subject: ext4: check for need init flag in ext4_mb_load_buddy

We should check for need init flag with the group's alloc_sem held, to
make sure while we are loading the buddy cache and holding a reference
to it, a file system resize can't add new blocks to same group.

The patch also drops the need init flag check in
ext4_mb_regular_allocator() because doing the check without holding
alloc_sem is racy.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/mballoc.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fed5ac699141..484c9d10c7fa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1034,8 +1034,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	 * groups mapped by the page is blocked
 	 * till we are done with allocation
 	 */
+repeat_load_buddy:
 	down_read(e4b->alloc_semp);
 
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		/* we need to check for group need init flag
+		 * with alloc_semp held so that we can be sure
+		 * that new blocks didn't get added to the group
+		 * when we are loading the buddy cache
+		 */
+		up_read(e4b->alloc_semp);
+		/*
+		 * we need full data about the group
+		 * to make a good selection
+		 */
+		ret = ext4_mb_init_group(sb, group);
+		if (ret)
+			return ret;
+		goto repeat_load_buddy;
+	}
+
 	/*
 	 * the buddy cache inode stores the block bitmap
 	 * and buddy information in consecutive blocks.
@@ -2012,27 +2030,6 @@ repeat:
 			if (grp->bb_free == 0)
 				continue;
 
-			/*
-			 * if the group is already init we check whether it is
-			 * a good group and if not we don't load the buddy
-			 */
-			if (EXT4_MB_GRP_NEED_INIT(grp)) {
-				/*
-				 * we need full data about the group
-				 * to make a good selection
-				 */
-				err = ext4_mb_init_group(sb, group);
-				if (err)
-					goto out;
-			}
-
-			/*
-			 * If the particular group doesn't satisfy our
-			 * criteria we continue with the next group
-			 */
-			if (!ext4_mb_good_group(ac, group, cr))
-				continue;
-
 			err = ext4_mb_load_buddy(sb, group, &e4b);
 			if (err)
 				goto out;
-- 
cgit v1.2.2


From 08c3a8133810d955d97f7146c50c43e4073f2148 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:50:17 -0400
Subject: ext4: Clarify the locking details in mballoc

We don't need to take the alloc_sem lock when we are adding new
groups, since mballoc won't see the new group added until we bump
sbi->s_groups_count.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/mballoc.c | 7 +++++--
 fs/ext4/resize.c  | 7 +------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 484c9d10c7fa..d23056d375b3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -928,8 +928,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
-	 * This ensures we don't add group
-	 * to this buddy cache via resize
+	 * This ensures that we don't reinit the buddy cache
+	 * page which map to the group from which we are already
+	 * allocating. If we are looking at the buddy cache we would
+	 * have taken a reference using ext4_mb_load_buddy and that
+	 * would have taken the alloc_sem lock.
 	 */
 	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
 	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
-	int num_grp_locked = 0;
 	int err, err2;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
          * using the new disk blocks.
          */
 
-	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * descriptor
 	 */
 	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-	if (err) {
-		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+	if (err)
 		goto exit_journal;
-	}
 
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
-	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
 	ext4_handle_dirty_metadata(handle, NULL, primary);
 
-- 
cgit v1.2.2


From 9f0ab4a3f0fdb1ff404d150618ace2fa069bb2e1 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Sep 2009 19:49:40 -0700
Subject: binfmt_elf: fix PT_INTERP bss handling

In fs/binfmt_elf.c, load_elf_interp() calls padzero() for .bss even if
the PT_LOAD has no PROT_WRITE and no .bss.  This generates EFAULT.

Here is a small test case.  (Yes, there are other, useful PT_INTERP
which have only .text and no .data/.bss.)

	----- ptinterp.S
	_start: .globl _start
		 nop
		 int3
	-----
	$ gcc -m32 -nostartfiles -nostdlib -o ptinterp ptinterp.S
	$ gcc -m32 -Wl,--dynamic-linker=ptinterp -o hello hello.c
	$ ./hello
	Segmentation fault  # during execve() itself

	After applying the patch:
	$ ./hello
	Trace trap  # user-mode execution after execve() finishes

If the ELF headers are actually self-inconsistent, then dying is fine.
But having no PROT_WRITE segment is perfectly normal and correct if
there is no segment with p_memsz > p_filesz (i.e. bss).  John Reiser
suggested checking for PROT_WRITE in the bss logic.  I think it makes
most sense to simply apply the bss logic only when there is bss.

This patch looks less trivial than it is due to some reindentation.
It just moves the "if (last_bss > elf_bss) {" test up to include the
partial-page bss logic as well as the more-pages bss logic.

Reported-by: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/binfmt_elf.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the 
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
-- 
cgit v1.2.2


From 71290b368ad5e1e0b0b300c9d5638490a9fd1a2d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 10 Sep 2009 17:31:04 -0400
Subject: ext4: Don't update superblock write time when filesystem is read-only

This avoids updating the superblock write time when we are mounting
the root file system read/only but we need to replay the journal; at
that point, for people who are east of GMT and who make their clock
tick in localtime for Windows bug-for-bug compatibility, and this will
cause e2fsck to complain and force a full file system check.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f1815d3bcfd5..9f6fa3f74629 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3222,7 +3222,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
-	es->s_wtime = cpu_to_le32(get_seconds());
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
 		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-- 
cgit v1.2.2


From 0e3d2a6313d03413d93327202a60256d1d726fdc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 11 Sep 2009 09:30:12 -0400
Subject: ext4: Fix async commit mode to be safe by using a barrier

Previously the journal_async_commit mount option was equivalent to
using barrier=0 (and just as unsafe).  This patch fixes it so that we
eliminate the barrier before the commit block (by not using ordered
mode), and explicitly issuing an empty barrier bio after writing the
commit block.  Because of the journal checksum, it is safe to do this;
if the journal blocks are not all written before a power failure, the
checksum in the commit block will prevent the last transaction from
being replayed.

Using the fs_mark benchmark, using journal_async_commit shows a 50%
improvement:

FSUse%        Count         Size    Files/sec     App Overhead
     8         1000        10240         30.5            28242

vs.

FSUse%        Count         Size    Files/sec     App Overhead
     8         1000        10240         45.8            28620


Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0df600e9162d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
 
 /*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-		!JBD2_HAS_INCOMPAT_FEATURE(journal,
-					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
@@ -706,11 +707,13 @@ start_journal_io:
 	/* Done it all: now write the commit record asynchronously. */
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
+		if (journal->j_flags & JBD2_BARRIER)
+			blkdev_issue_flush(journal->j_dev, NULL);
 	}
 
 	/*
@@ -833,7 +836,7 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
-- 
cgit v1.2.2


From 1f7bebb9e911d870fa8f997ddff838e82b5715ea Mon Sep 17 00:00:00 2001
From: Andreas Schlick <schlick@lavabit.com>
Date: Thu, 10 Sep 2009 23:16:07 -0400
Subject: ext4: Always set dx_node's fake_dirent explicitly.

When ext4_dx_add_entry() has to split an index node, it has to ensure that
name_len of dx_node's fake_dirent is also zero, because otherwise e2fsck
won't recognise it as an intermediate htree node and consider the htree to
be corrupted.

Signed-off-by: Andreas Schlick <schlick@lavabit.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f27c8164c1fe..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1597,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
 		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
 							   sb->s_blocksize);
-		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
 		if (err)
-- 
cgit v1.2.2


From d8a8559cd7a9ccac98d5f6f13297a2ff68a43627 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 2 Sep 2009 12:34:32 +0200
Subject: writeback: get rid of generic_sync_sb_inodes() export

This adds two new exported functions:

- writeback_inodes_sb(), which only attempts to writeback dirty inodes on
  this super_block, for WB_SYNC_NONE writeout.
- sync_inodes_sb(), which writes out all dirty inodes on this super_block
  and also waits for the IO to complete.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 70 +++++++++++++++++++++++++++++++++----------------------
 fs/sync.c         | 18 +++++++-------
 fs/ubifs/budget.c | 16 ++-----------
 fs/ubifs/super.c  |  8 +------
 4 files changed, 55 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..271e5f44e871 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -458,8 +458,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-void generic_sync_sb_inodes(struct super_block *sb,
-				struct writeback_control *wbc)
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 	int sync = wbc->sync_mode == WB_SYNC_ALL;
@@ -593,13 +593,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
 
 	return;		/* Leave any unwritten inodes on s_io */
 }
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
-
-static void sync_sb_inodes(struct super_block *sb,
-				struct writeback_control *wbc)
-{
-	generic_sync_sb_inodes(sb, wbc);
-}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -640,7 +633,7 @@ restart:
 			 */
 			if (down_read_trylock(&sb->s_umount)) {
 				if (sb->s_root)
-					sync_sb_inodes(sb, wbc);
+					generic_sync_sb_inodes(sb, wbc);
 				up_read(&sb->s_umount);
 			}
 			spin_lock(&sb_lock);
@@ -653,35 +646,56 @@ restart:
 	spin_unlock(&sb_lock);
 }
 
-/*
- * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.
- *
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
  *
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
  */
-void sync_inodes_sb(struct super_block *sb, int wait)
+long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
 
-	if (!wait) {
-		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-
-		wbc.nr_to_write = nr_dirty + nr_unstable +
+	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	} else
-		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
 
-	sync_sb_inodes(sb, &wbc);
+	wbc.nr_to_write = nr_to_write;
+	generic_sync_sb_inodes(sb, &wbc);
+	return nr_to_write - wbc.nr_to_write;
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb	-	sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+long sync_inodes_sb(struct super_block *sb)
+{
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_ALL,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
+	};
+	long nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+	wbc.nr_to_write = nr_to_write;
+	generic_sync_sb_inodes(sb, &wbc);
+	return nr_to_write - wbc.nr_to_write;
 }
+EXPORT_SYMBOL(sync_inodes_sb);
 
 /**
  * write_inode_now	-	write an inode to disk
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..66f210476f40 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,22 @@
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
 	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait)
+	if (!wait) {
 		writeout_quota_sb(sb, -1);
-	else
+		writeback_inodes_sb(sb);
+	} else {
 		sync_quota_sb(sb, -1);
-	sync_inodes_sb(sb, wait);
+		sync_inodes_sb(sb);
+	}
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..1c8991b0db13 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,26 +65,14 @@
 static int shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
 	int nr_written;
-	struct writeback_control wbc = {
-		.sync_mode   = WB_SYNC_NONE,
-		.range_end   = LLONG_MAX,
-		.nr_to_write = nr_to_write,
-	};
-
-	generic_sync_sb_inodes(c->vfs_sb, &wbc);
-	nr_written = nr_to_write - wbc.nr_to_write;
 
+	nr_written = writeback_inodes_sb(c->vfs_sb);
 	if (!nr_written) {
 		/*
 		 * Re-try again but wait on pages/inodes which are being
 		 * written-back concurrently (e.g., by pdflush).
 		 */
-		memset(&wbc, 0, sizeof(struct writeback_control));
-		wbc.sync_mode   = WB_SYNC_ALL;
-		wbc.range_end   = LLONG_MAX;
-		wbc.nr_to_write = nr_to_write;
-		generic_sync_sb_inodes(c->vfs_sb, &wbc);
-		nr_written = nr_to_write - wbc.nr_to_write;
+		nr_written = sync_inodes_sb(c->vfs_sb);
 	}
 
 	dbg_budg("%d pages were written back", nr_written);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..8d6050a5966c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
-	struct writeback_control wbc = {
-		.sync_mode   = WB_SYNC_ALL,
-		.range_start = 0,
-		.range_end   = LLONG_MAX,
-		.nr_to_write = LONG_MAX,
-	};
 
 	/*
 	 * Zero @wait is just an advisory thing to help the file system shove
@@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 * the user be able to get more accurate results of 'statfs()' after
 	 * they synchronize the file system.
 	 */
-	generic_sync_sb_inodes(sb, &wbc);
+	sync_inodes_sb(sb);
 
 	/*
 	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
-- 
cgit v1.2.2


From 66f3b8e2e103a0b93b945764d98e9ba46cb926dd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 2 Sep 2009 09:19:46 +0200
Subject: writeback: move dirty inodes from super_block to backing_dev_info

This is a first step at introducing per-bdi flusher threads. We should
have no change in behaviour, although sb_has_dirty_inodes() is now
ridiculously expensive, as there's no easy way to answer that question.
Not a huge problem, since it'll be deleted in subsequent patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 197 +++++++++++++++++++++++++++++++++++-------------------
 fs/super.c        |   3 -
 2 files changed, 127 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 271e5f44e871..45ad4bb700e6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include "internal.h"
 
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
 /**
  * writeback_acquire - attempt to get exclusive writeback access to a device
@@ -165,12 +166,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			goto out;
 
 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
+			list_move(&inode->i_list,
+					&inode_to_bdi(inode)->b_dirty);
 		}
 	}
 out:
@@ -191,31 +193,30 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list.  If that is
+ * already the most-recently-dirtied inode on the b_dirty list.  If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&bdi->b_dirty)) {
+		struct inode *tail;
 
-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &bdi->b_dirty);
 }
 
 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -262,18 +263,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
-				unsigned long *older_than_this)
+static void queue_io(struct backing_dev_info *bdi,
+		     unsigned long *older_than_this)
+{
+	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
+	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+}
+
+static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	struct inode *inode;
+	int ret = 0;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, list, i_list) {
+		if (inode->i_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode_lock);
+	return ret;
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	struct backing_dev_info *bdi;
+	int ret = 0;
+
+	/*
+	 * This is REALLY expensive right now, but it'll go away
+	 * when the bdi writeback is introduced
+	 */
+	mutex_lock(&bdi_lock);
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
+		    sb_on_inode_list(sb, &bdi->b_io) ||
+		    sb_on_inode_list(sb, &bdi->b_more_io)) {
+			ret = 1;
+			break;
+		}
+	}
+	mutex_unlock(&bdi_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
@@ -322,11 +355,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +404,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from s_io onto s_more_io/s_dirty.
+			 * the inode; Move it from b_io onto b_more_io/b_dirty.
 			 */
 			/*
 			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of s_dirty so it gets first
+			 * this inode at the head of b_dirty so it gets first
 			 * consideration.  Otherwise, move it to the tail, for
 			 * the reasons described there.  I'm not really sure
 			 * how much sense this makes.  Presumably I had a good
@@ -385,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			if (wbc->for_kupdate) {
 				/*
 				 * For the kupdate function we move the inode
-				 * to s_more_io so it will get more writeout as
+				 * to b_more_io so it will get more writeout as
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +466,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io.  They are moved back onto
- * sb->s_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
-static void generic_sync_sb_inodes(struct super_block *sb,
-				   struct writeback_control *wbc)
+static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
+				    struct writeback_control *wbc,
+				    struct super_block *sb)
 {
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);
 
-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
+		queue_io(bdi, wbc->older_than_this);
+
+	while (!list_empty(&bdi->b_io)) {
+		struct inode *inode = list_entry(bdi->b_io.prev,
 						struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;
 
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
 		if (!bdi_cap_writeback_dirty(bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this.  Skip just this inode
@@ -499,14 +515,14 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* fs has the wrong queue */
 			requeue_io(inode);
 			continue;		/* blockdev has wrong queue */
@@ -544,13 +560,57 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&bdi->b_more_io))
 			wbc->more_io = 1;
 	}
 
-	if (sync) {
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+/*
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * FIXME: this linear search could get expensive with many fileystems.  But
+ * how to fix?  We need to go from an address_space to all inodes which share
+ * a queue with that address_space.  (Easy: have a global "dirty superblocks"
+ * list).
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi;
+
+	if (!wbc->bdi) {
+		mutex_lock(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			generic_sync_bdi_inodes(bdi, wbc, sb);
+		mutex_unlock(&bdi_lock);
+	} else
+		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		struct inode *inode, *old_inode = NULL;
 
+		spin_lock(&inode_lock);
+
 		/*
 		 * Data integrity sync. Must wait for all pages under writeback,
 		 * because there may have been pages dirtied before our sync
@@ -588,10 +648,7 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 		}
 		spin_unlock(&inode_lock);
 		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
-
-	return;		/* Leave any unwritten inodes on s_io */
+	}
 }
 
 /*
@@ -599,8 +656,8 @@ static void generic_sync_sb_inodes(struct super_block *sb,
  *
  * Note:
  * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
+ * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
  * empty. Since __sync_single_inode() regains inode_lock before it finally moves
  * inode from superblock lists we are OK.
  *
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..0d22ce3be4aa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 			s = NULL;
 			goto out;
 		}
-		INIT_LIST_HEAD(&s->s_dirty);
-		INIT_LIST_HEAD(&s->s_io);
-		INIT_LIST_HEAD(&s->s_more_io);
 		INIT_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
-- 
cgit v1.2.2


From 03ba3782e8dcc5b0e1efe440d33084f066e38cae Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 9 Sep 2009 09:08:54 +0200
Subject: writeback: switch to per-bdi threads for flushing data

This gets rid of pdflush for bdi writeout and kupdated style cleaning.
pdflush writeout suffers from lack of locality and also requires more
threads to handle the same workload, since it has to work in a
non-blocking fashion against each queue. This also introduces lumpy
behaviour and potential request starvation, since pdflush can be starved
for queue access if others are accessing it. A sample ffsb workload that
does random writes to files is about 8% faster here on a simple SATA drive
during the benchmark phase. File layout also seems a LOT more smooth in
vmstat:

 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 0  1      0 608848   2652 375372    0    0     0 71024  604    24  1 10 48 42
 0  1      0 549644   2712 433736    0    0     0 60692  505    27  1  8 48 44
 1  0      0 476928   2784 505192    0    0     4 29540  553    24  0  9 53 37
 0  1      0 457972   2808 524008    0    0     0 54876  331    16  0  4 38 58
 0  1      0 366128   2928 614284    0    0     4 92168  710    58  0 13 53 34
 0  1      0 295092   3000 684140    0    0     0 62924  572    23  0  9 53 37
 0  1      0 236592   3064 741704    0    0     4 58256  523    17  0  8 48 44
 0  1      0 165608   3132 811464    0    0     0 57460  560    21  0  8 54 38
 0  1      0 102952   3200 873164    0    0     4 74748  540    29  1 10 48 41
 0  1      0  48604   3252 926472    0    0     0 53248  469    29  0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 1  1      0 678716   5792 303380    0    0     0 74064  565    50  1 11 52 36
 1  0      0 662488   5864 319396    0    0     4   352  302   329  0  2 47 51
 0  1      0 599312   5924 381468    0    0     0 78164  516    55  0  9 51 40
 0  1      0 519952   6008 459516    0    0     4 78156  622    56  1 11 52 37
 1  1      0 436640   6092 541632    0    0     0 82244  622    54  0 11 48 41
 0  1      0 436640   6092 541660    0    0     0     8  152    39  0  0 51 49
 0  1      0 332224   6200 644252    0    0     4 102800  728    46  1 13 49 36
 1  0      0 274492   6260 701056    0    0     4 12328  459    49  0  7 50 43
 0  1      0 211220   6324 763356    0    0     0 106940  515    37  1 10 51 39
 1  0      0 160412   6376 813468    0    0     0  8224  415    43  0  6 49 45
 1  1      0  85980   6452 886556    0    0     4 113516  575    39  1 11 54 34
 0  2      0  85968   6452 886620    0    0     0  1640  158   211  0  0 46 54

A 10 disk test with btrfs performs 26% faster with per-bdi flushing. A
SSD based writeback test on XFS performs over 20% better as well, with
the throughput being very stable around 1GB/sec, where pdflush only
manages 750MB/sec and fluctuates wildly while doing so. Random buffered
writes to many files behave a lot better as well, as does random mmap'ed
writes.

A separate thread is added to sync the super blocks. In the long term,
adding sync_supers_bdi() functionality could get rid of this thread again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/buffer.c       |   2 +-
 fs/fs-writeback.c | 999 ++++++++++++++++++++++++++++++++++++++----------------
 fs/super.c        |   2 +-
 fs/sync.c         |   2 +-
 4 files changed, 713 insertions(+), 292 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
 	struct zone *zone;
 	int nid;
 
-	wakeup_pdflush(1024);
+	wakeup_flusher_threads(1024);
 	yield();
 
 	for_each_online_node(nid) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 45ad4bb700e6..7f6dae8aa47f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,6 +19,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -27,165 +29,208 @@
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue.  Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own.  Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * Work items for the bdi_writeback threads
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+struct bdi_work {
+	struct list_head list;
+	struct list_head wait_list;
+	struct rcu_head rcu_head;
+
+	unsigned long seen;
+	atomic_t pending;
+
+	struct super_block *sb;
+	unsigned long nr_pages;
+	enum writeback_sync_modes sync_mode;
+
+	unsigned long state;
+};
+
+enum {
+	WS_USED_B = 0,
+	WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
+{
+	return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+				 struct writeback_control *wbc)
+{
+	INIT_RCU_HEAD(&work->rcu_head);
+	work->sb = wbc->sb;
+	work->nr_pages = wbc->nr_to_write;
+	work->sync_mode = wbc->sync_mode;
+	work->state = WS_USED;
+}
+
+static inline void bdi_work_init_on_stack(struct bdi_work *work,
+					  struct writeback_control *wbc)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	bdi_work_init(work, wbc);
+	work->state |= WS_ONSTACK;
 }
 
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
  *
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return !list_empty(&bdi->work_list);
 }
 
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
 {
-	BUG_ON(!writeback_in_progress(bdi));
-	clear_bit(BDI_pdflush, &bdi->state);
+	clear_bit(WS_USED_B, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_USED_B);
 }
 
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
 {
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
+	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
+	if (!bdi_work_on_stack(work))
+		kfree(work);
+	else
+		bdi_work_clear(work);
 }
 
-/**
- *	__mark_inode_dirty -	internal function
- *	@inode: inode to mark
- *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- *	Mark an inode as dirty. Callers should use mark_inode_dirty or
- *  	mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
 {
-	struct super_block *sb = inode->i_sb;
+	const enum writeback_sync_modes sync_mode = work->sync_mode;
 
 	/*
-	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
-	 * dirty the inode itself
+	 * For allocated work, we can clear the done/seen bit right here.
+	 * For on-stack work, we need to postpone both the clear and free
+	 * to after the RCU grace period, since the stack could be invalidated
+	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		if (sb->s_op->dirty_inode)
-			sb->s_op->dirty_inode(inode);
-	}
+	if (!bdi_work_on_stack(work))
+		bdi_work_clear(work);
+	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+		call_rcu(&work->rcu_head, bdi_work_free);
+}
 
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
 	/*
-	 * make sure that changes are seen by all cpus before we test i_state
-	 * -- mikulas
+	 * The caller has retrieved the work arguments from this work,
+	 * drop our reference. If this is the last ref, delete and free it
 	 */
-	smp_mb();
+	if (atomic_dec_and_test(&work->pending)) {
+		struct backing_dev_info *bdi = wb->bdi;
 
-	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
-		return;
-
-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
+		spin_lock(&bdi->wb_lock);
+		list_del_rcu(&work->list);
+		spin_unlock(&bdi->wb_lock);
 
-	spin_lock(&inode_lock);
-	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		wb_work_complete(work);
+	}
+}
 
-		inode->i_state |= flags;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+	if (work) {
+		work->seen = bdi->wb_mask;
+		BUG_ON(!work->seen);
+		atomic_set(&work->pending, bdi->wb_cnt);
+		BUG_ON(!bdi->wb_cnt);
 
 		/*
-		 * If the inode is being synced, just update its dirty state.
-		 * The unlocker will place the inode on the appropriate
-		 * superblock list, based upon its state.
+		 * Make sure stores are seen before it appears on the list
 		 */
-		if (inode->i_state & I_SYNC)
-			goto out;
+		smp_mb();
 
-		/*
-		 * Only add valid (hashed) inodes to the superblock's
-		 * dirty list.  Add blockdev inodes as well.
-		 */
-		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
-				goto out;
-		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
-			goto out;
+		spin_lock(&bdi->wb_lock);
+		list_add_tail_rcu(&work->list, &bdi->work_list);
+		spin_unlock(&bdi->wb_lock);
+	}
+
+	/*
+	 * If the default thread isn't there, make sure we add it. When
+	 * it gets created and wakes up, we'll run this work.
+	 */
+	if (unlikely(list_empty_careful(&bdi->wb_list)))
+		wake_up_process(default_backing_dev_info.wb.task);
+	else {
+		struct bdi_writeback *wb = &bdi->wb;
 
 		/*
-		 * If the inode was already on b_dirty/b_io/b_more_io, don't
-		 * reposition it (that would break b_dirty time-ordering).
+		 * If we failed allocating the bdi work item, wake up the wb
+		 * thread always. As a safety precaution, it'll flush out
+		 * everything
 		 */
-		if (!was_dirty) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list,
-					&inode_to_bdi(inode)->b_dirty);
-		}
+		if (!wb_has_dirty_io(wb)) {
+			if (work)
+				wb_clear_pending(wb, work);
+		} else if (wb->task)
+			wake_up_process(wb->task);
 	}
-out:
-	spin_unlock(&inode_lock);
 }
 
-EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
 
-static int write_inode(struct inode *inode, int sync)
+static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
-	return 0;
+	struct bdi_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work)
+		bdi_work_init(work, wbc);
+
+	return work;
+}
+
+void bdi_start_writeback(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct bdi_work work_stack, *work = NULL;
+
+	if (!must_wait)
+		work = bdi_alloc_work(wbc);
+
+	if (!work) {
+		work = &work_stack;
+		bdi_work_init_on_stack(work, wbc);
+	}
+
+	bdi_queue_work(wbc->bdi, work);
+
+	/*
+	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
+	 * complete. If not, we only need to wait for the work to be started,
+	 * if we allocated it on-stack. We use the same mechanism, if the
+	 * wait bit is set in the bdi_work struct, then threads will not
+	 * clear pending until after they are done.
+	 *
+	 * Note that work == &work_stack if must_wait is true, so we don't
+	 * need to do call_rcu() here ever, since the completion path will
+	 * have done that for us.
+	 */
+	if (must_wait || work == &work_stack) {
+		bdi_wait_on_work_clear(work);
+		if (work != &work_stack)
+			call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
 
 /*
@@ -199,16 +244,16 @@ static int write_inode(struct inode *inode, int sync)
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	if (!list_empty(&bdi->b_dirty)) {
+	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &bdi->b_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }
 
 /*
@@ -216,7 +261,9 @@ static void redirty_tail(struct inode *inode)
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -263,52 +310,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct backing_dev_info *bdi,
-		     unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
-	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
-{
-	struct inode *inode;
-	int ret = 0;
-
-	spin_lock(&inode_lock);
-	list_for_each_entry(inode, list, i_list) {
-		if (inode->i_sb == sb) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&inode_lock);
-	return ret;
-}
-
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
 {
-	struct backing_dev_info *bdi;
-	int ret = 0;
-
-	/*
-	 * This is REALLY expensive right now, but it'll go away
-	 * when the bdi writeback is introduced
-	 */
-	mutex_lock(&bdi_lock);
-	list_for_each_entry(bdi, &bdi_list, bdi_list) {
-		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
-		    sb_on_inode_list(sb, &bdi->b_io) ||
-		    sb_on_inode_list(sb, &bdi->b_more_io)) {
-			ret = 1;
-			break;
-		}
-	}
-	mutex_unlock(&bdi_lock);
-
-	return ret;
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+		return inode->i_sb->s_op->write_inode(inode, sync);
+	return 0;
 }
-EXPORT_SYMBOL(sb_has_dirty_inodes);
 
 /*
  * Wait for writeback on an inode to complete.
@@ -466,20 +479,71 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
-				    struct writeback_control *wbc,
-				    struct super_block *sb)
+/*
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
+ *
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
+ */
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	/*
+	 * Caller must already hold the ref for this
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		WARN_ON(!rwsem_is_locked(&sb->s_umount));
+		return 0;
+	}
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	if (down_read_trylock(&sb->s_umount)) {
+		if (sb->s_root) {
+			spin_unlock(&sb_lock);
+			return 0;
+		}
+		/*
+		 * umounted, drop rwsem again and fall through to failure
+		 */
+		up_read(&sb->s_umount);
+	}
+
+	sb->s_count--;
+	spin_unlock(&sb_lock);
+	return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return;
+
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
+				struct writeback_control *wbc)
 {
+	struct super_block *sb = wbc->sb;
 	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	spin_lock(&inode_lock);
 
-	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
-		queue_io(bdi, wbc->older_than_this);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
 
-	while (!list_empty(&bdi->b_io)) {
-		struct inode *inode = list_entry(bdi->b_io.prev,
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
 						struct inode, i_list);
 		long pages_skipped;
 
@@ -491,7 +555,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;
 		}
 
-		if (!bdi_cap_writeback_dirty(bdi)) {
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
 			redirty_tail(inode);
 			if (is_blkdev_sb) {
 				/*
@@ -513,7 +577,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;
 		}
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
@@ -521,13 +585,6 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;		/* Skip a congested blockdev */
 		}
 
-		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!is_blkdev_sb)
-				break;		/* fs has the wrong queue */
-			requeue_io(inode);
-			continue;		/* blockdev has wrong queue */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -535,16 +592,16 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 		if (inode_dirtied_after(inode, start))
 			break;
 
-		/* Is another pdflush already flushing this queue? */
-		if (current_is_pdflush() && !writeback_acquire(bdi))
-			break;
+		if (pin_sb_for_writeback(wbc, inode)) {
+			requeue_io(inode);
+			continue;
+		}
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		if (current_is_pdflush())
-			writeback_release(bdi);
+		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -560,7 +617,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&bdi->b_more_io))
+		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
 
@@ -568,139 +625,500 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 	/* Leave any unwritten inodes on b_io */
 }
 
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;
+
+	writeback_inodes_wb(&bdi->wb, wbc);
+}
+
 /*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024
+
+static inline bool over_bground_thresh(void)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+	return (global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
  *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
  *
- * FIXME: this linear search could get expensive with many fileystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
  *
- * The inodes to be written are parked on bdi->b_io.  They are moved back onto
- * bdi->b_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
  */
-static void generic_sync_sb_inodes(struct super_block *sb,
-				   struct writeback_control *wbc)
+static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
+			 struct super_block *sb,
+			 enum writeback_sync_modes sync_mode, int for_kupdate)
 {
-	struct backing_dev_info *bdi;
-
-	if (!wbc->bdi) {
-		mutex_lock(&bdi_lock);
-		list_for_each_entry(bdi, &bdi_list, bdi_list)
-			generic_sync_bdi_inodes(bdi, wbc, sb);
-		mutex_unlock(&bdi_lock);
-	} else
-		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+	struct writeback_control wbc = {
+		.bdi			= wb->bdi,
+		.sb			= sb,
+		.sync_mode		= sync_mode,
+		.older_than_this	= NULL,
+		.for_kupdate		= for_kupdate,
+		.range_cyclic		= 1,
+	};
+	unsigned long oldest_jif;
+	long wrote = 0;
 
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		struct inode *inode, *old_inode = NULL;
+	if (wbc.for_kupdate) {
+		wbc.older_than_this = &oldest_jif;
+		oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+	}
 
-		spin_lock(&inode_lock);
+	for (;;) {
+		/*
+		 * Don't flush anything for non-integrity writeback where
+		 * no nr_pages was given
+		 */
+		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+			break;
 
 		/*
-		 * Data integrity sync. Must wait for all pages under writeback,
-		 * because there may have been pages dirtied before our sync
-		 * call, but which had writeout started before we write it out.
-		 * In which case, the inode may not be on the dirty list, but
-		 * we still have to wait for that writeout.
+		 * If no specific pages were given and this is just a
+		 * periodic background writeout and we are below the
+		 * background dirty threshold, don't do anything
 		 */
-		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-			struct address_space *mapping;
+		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+			break;
 
-			if (inode->i_state &
-					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
-				continue;
-			mapping = inode->i_mapping;
-			if (mapping->nrpages == 0)
+		wbc.more_io = 0;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes_wb(wb, &wbc);
+		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/*
+		 * If we ran out of stuff to write, bail unless more_io got set
+		 */
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+			if (wbc.more_io && !wbc.for_kupdate)
 				continue;
-			__iget(inode);
-			spin_unlock(&inode_lock);
+			break;
+		}
+	}
+
+	return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+					   struct bdi_writeback *wb)
+{
+	struct bdi_work *work, *ret = NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(work, &bdi->work_list, list) {
+		if (!test_and_clear_bit(wb->nr, &work->seen))
+			continue;
+
+		ret = work;
+		break;
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+	unsigned long expired;
+	long nr_pages;
+
+	expired = wb->last_old_flush +
+			msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (time_before(jiffies, expired))
+		return 0;
+
+	wb->last_old_flush = jiffies;
+	nr_pages = global_page_state(NR_FILE_DIRTY) +
+			global_page_state(NR_UNSTABLE_NFS) +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	if (nr_pages)
+		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+
+	return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+	struct bdi_work *work;
+	long nr_pages, wrote = 0;
+
+	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+		enum writeback_sync_modes sync_mode;
+
+		nr_pages = work->nr_pages;
+
+		/*
+		 * Override sync mode, in case we must wait for completion
+		 */
+		if (force_wait)
+			work->sync_mode = sync_mode = WB_SYNC_ALL;
+		else
+			sync_mode = work->sync_mode;
+
+		/*
+		 * If this isn't a data integrity operation, just notify
+		 * that we have seen this work and we are now starting it.
+		 */
+		if (sync_mode == WB_SYNC_NONE)
+			wb_clear_pending(wb, work);
+
+		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+
+		/*
+		 * This is a data integrity writeback, so only do the
+		 * notification when we have completed the work.
+		 */
+		if (sync_mode == WB_SYNC_ALL)
+			wb_clear_pending(wb, work);
+	}
+
+	/*
+	 * Check for periodic writeback, kupdated() style
+	 */
+	wrote += wb_check_old_data_flush(wb);
+
+	return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+	unsigned long last_active = jiffies;
+	unsigned long wait_jiffies = -1UL;
+	long pages_written;
+
+	while (!kthread_should_stop()) {
+		pages_written = wb_do_writeback(wb, 0);
+
+		if (pages_written)
+			last_active = jiffies;
+		else if (wait_jiffies != -1UL) {
+			unsigned long max_idle;
+
 			/*
-			 * We hold a reference to 'inode' so it couldn't have
-			 * been removed from s_inodes list while we dropped the
-			 * inode_lock.  We cannot iput the inode now as we can
-			 * be holding the last reference and we cannot iput it
-			 * under inode_lock. So we keep the reference and iput
-			 * it later.
+			 * Longest period of inactivity that we tolerate. If we
+			 * see dirty data again later, the task will get
+			 * recreated automatically.
 			 */
-			iput(old_inode);
-			old_inode = inode;
+			max_idle = max(5UL * 60 * HZ, wait_jiffies);
+			if (time_after(jiffies, max_idle + last_active))
+				break;
+		}
+
+		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_jiffies);
+		try_to_freeze();
+	}
+
+	return 0;
+}
+
+/*
+ * Schedule writeback for all backing devices. Expensive! If this is a data
+ * integrity operation, writeback will be complete when this returns. If
+ * we are simply called for WB_SYNC_NONE, then writeback will merely be
+ * scheduled to run.
+ */
+static void bdi_writeback_all(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct backing_dev_info *bdi;
+	struct bdi_work *work;
+	LIST_HEAD(list);
+
+restart:
+	spin_lock(&bdi_lock);
+
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		struct bdi_work *work;
+
+		if (!bdi_has_dirty_io(bdi))
+			continue;
 
-			filemap_fdatawait(mapping);
+		/*
+		 * If work allocation fails, do the writes inline. We drop
+		 * the lock and restart the list writeout. This should be OK,
+		 * since this happens rarely and because the writeout should
+		 * eventually make more free memory available.
+		 */
+		work = bdi_alloc_work(wbc);
+		if (!work) {
+			struct writeback_control __wbc;
 
-			cond_resched();
+			/*
+			 * Not a data integrity writeout, just continue
+			 */
+			if (!must_wait)
+				continue;
 
-			spin_lock(&inode_lock);
+			spin_unlock(&bdi_lock);
+			__wbc = *wbc;
+			__wbc.bdi = bdi;
+			writeback_inodes_wbc(&__wbc);
+			goto restart;
 		}
-		spin_unlock(&inode_lock);
-		iput(old_inode);
+		if (must_wait)
+			list_add_tail(&work->wait_list, &list);
+
+		bdi_queue_work(bdi, work);
+	}
+
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * If this is for WB_SYNC_ALL, wait for pending work to complete
+	 * before returning.
+	 */
+	while (!list_empty(&list)) {
+		work = list_entry(list.next, struct bdi_work, wait_list);
+		list_del(&work->wait_list);
+		bdi_wait_on_work_clear(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
 	}
 }
 
 /*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.range_cyclic	= 1,
+	};
+
+	if (nr_pages == 0)
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	wbc.nr_to_write = nr_pages;
+	bdi_writeback_all(&wbc);
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ *	__mark_inode_dirty -	internal function
+ *	@inode: inode to mark
+ *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ *	Mark an inode as dirty. Callers should use mark_inode_dirty or
+ *  	mark_inode_dirty_sync.
  *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
  *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
  *
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones.  One group will be the dirty
- * inodes against a filesystem.  Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'.  Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
  */
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
 
-	might_sleep();
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
-		if (sb_has_dirty_inodes(sb)) {
-			/* we're making our own get_super here */
-			sb->s_count++;
-			spin_unlock(&sb_lock);
-			/*
-			 * If we can't get the readlock, there's no sense in
-			 * waiting around, most of the time the FS is going to
-			 * be unmounted by the time it is released.
-			 */
-			if (down_read_trylock(&sb->s_umount)) {
-				if (sb->s_root)
-					generic_sync_sb_inodes(sb, wbc);
-				up_read(&sb->s_umount);
-			}
-			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+	/*
+	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
+	 * dirty the inode itself
+	 */
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		if (sb->s_op->dirty_inode)
+			sb->s_op->dirty_inode(inode);
+	}
+
+	/*
+	 * make sure that changes are seen by all cpus before we test i_state
+	 * -- mikulas
+	 */
+	smp_mb();
+
+	/* avoid the locking if we can */
+	if ((inode->i_state & flags) == flags)
+		return;
+
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
+
+	spin_lock(&inode_lock);
+	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+
+		inode->i_state |= flags;
+
+		/*
+		 * If the inode is being synced, just update its dirty state.
+		 * The unlocker will place the inode on the appropriate
+		 * superblock list, based upon its state.
+		 */
+		if (inode->i_state & I_SYNC)
+			goto out;
+
+		/*
+		 * Only add valid (hashed) inodes to the superblock's
+		 * dirty list.  Add blockdev inodes as well.
+		 */
+		if (!S_ISBLK(inode->i_mode)) {
+			if (hlist_unhashed(&inode->i_hash))
+				goto out;
+		}
+		if (inode->i_state & (I_FREEING|I_CLEAR))
+			goto out;
+
+		/*
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+			inode->dirtied_when = jiffies;
+			list_move(&inode->i_list, &wb->b_dirty);
 		}
-		if (wbc->nr_to_write <= 0)
-			break;
 	}
-	spin_unlock(&sb_lock);
+out:
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__mark_inode_dirty);
+
+/*
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void wait_sb_inodes(struct writeback_control *wbc)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+
+	spin_lock(&inode_lock);
+
+	/*
+	 * Data integrity sync. Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+		struct address_space *mapping;
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		mapping = inode->i_mapping;
+		if (mapping->nrpages == 0)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		/*
+		 * We hold a reference to 'inode' so it couldn't have
+		 * been removed from s_inodes list while we dropped the
+		 * inode_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it
+		 * under inode_lock. So we keep the reference and iput
+		 * it later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
 }
 
 /**
@@ -715,6 +1133,7 @@ restart:
 long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
+		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -727,7 +1146,7 @@ long writeback_inodes_sb(struct super_block *sb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(sb, &wbc);
+	bdi_writeback_all(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
@@ -742,6 +1161,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 long sync_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
+		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -749,7 +1169,8 @@ long sync_inodes_sb(struct super_block *sb)
 	long nr_to_write = LONG_MAX; /* doesn't actually matter */
 
 	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(sb, &wbc);
+	bdi_writeback_all(&wbc);
+	wait_sb_inodes(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/super.c b/fs/super.c
index 0d22ce3be4aa..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -168,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 66f210476f40..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -120,7 +120,7 @@ restart:
  */
 SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
+	wakeup_flusher_threads(0);
 	sync_filesystems(0);
 	sync_filesystems(1);
 	if (unlikely(laptop_mode))
-- 
cgit v1.2.2


From d0bceac747b547c0b4769b91fec7d3c15600153f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 18 May 2009 08:20:32 +0200
Subject: writeback: get rid of pdflush completely

It is now unused, so kill it off.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7f6dae8aa47f..2e601ce581c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,6 +29,11 @@
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
 /*
  * Work items for the bdi_writeback threads
  */
-- 
cgit v1.2.2


From d993831fa7ffeb89e994f046f93eeb09ec91df08 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 12 Jun 2009 14:45:52 +0200
Subject: writeback: add name to backing_dev_info

This enables us to track who does what and print info. Its main use
is catching dirty inodes on the default_backing_dev_info, so we can
fix that up.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/disk-io.c   | 1 +
 fs/char_dev.c        | 1 +
 fs/configfs/inode.c  | 1 +
 fs/fuse/inode.c      | 1 +
 fs/hugetlbfs/inode.c | 1 +
 fs/nfs/client.c      | 1 +
 fs/ocfs2/dlm/dlmfs.c | 1 +
 fs/ramfs/inode.c     | 1 +
 fs/sysfs/inode.c     | 1 +
 fs/ubifs/super.c     | 1 +
 10 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..15831d5c7367 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..7c27a8ebef6a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
  * - no readahead or I/O queue unplugging required
  */
 struct backing_dev_info directly_mappable_cdev_bdi = {
+	.name = "char",
 	.capabilities	= (
 #ifdef CONFIG_MMU
 		/* permit private copies of the data to be taken */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
 };
 
 static struct backing_dev_info configfs_backing_dev_info = {
+	.name		= "configfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..4567db6f9430 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
 	int err;
 
+	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
 	/* fuse does it's own writeback accounting */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
 static struct backing_dev_info hugetlbfs_backing_dev_info = {
+	.name		= "hugetlbfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..c6be84a161f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -879,6 +879,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
+	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
 	if (server->wsize > max_rpc_payload)
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
 }
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
+	.name		= "ocfs2-dlmfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
+	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
 			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e57f98e54fce 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -29,6 +29,7 @@ static const struct address_space_operations sysfs_aops = {
 };
 
 static struct backing_dev_info sysfs_backing_dev_info = {
+	.name		= "sysfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8d6050a5966c..51763aa8f4de 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1965,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 *
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
+	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
 	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err  = bdi_init(&c->bdi);
-- 
cgit v1.2.2


From 500b067c5e6ceea49cf280a02597b1169320e08c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 9 Sep 2009 09:10:25 +0200
Subject: writeback: check for registered bdi in flusher add and inode dirty

Also a debugging aid. We want to catch dirty inodes being added to
backing devices that don't do writeback.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2e601ce581c9..da86ef58e427 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1046,6 +1046,14 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!was_dirty) {
 			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+			struct backing_dev_info *bdi = wb->bdi;
+
+			if (bdi_cap_writeback_dirty(bdi) &&
+			    !test_bit(BDI_registered, &bdi->state)) {
+				WARN_ON(1);
+				printk(KERN_ERR "bdi-%s not registered\n",
+								bdi->name);
+			}
 
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_list, &wb->b_dirty);
-- 
cgit v1.2.2


From 1f98a13f623e0ef666690a18c1250335fc6d7ef1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 11 Sep 2009 14:32:04 +0200
Subject: bio: first step in sanitizing the bio->bi_rw flag testing

Get rid of any functions that test for these bits and make callers
use bio_rw_flagged() directly. Then it is at least directly apparent
what variable and flag they check.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
 		num_run++;
 		batch_run++;
 
-		if (bio_sync(cur))
+		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
 		if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
-- 
cgit v1.2.2


From 723590ed52d244b025f10a8e2f54c2f57ed02b4d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Sat, 15 Aug 2009 08:43:22 +0200
Subject: splice: update mtime and atime on files

Splice should update the modification and access times on regular
files just like read and write. Not updating mtime will confuse
backup tools, etc...

This patch only adds the time updates for regular files.  For pipes
and other special files that splice touches the need for updating the
times is less clear.  Let's discuss and fix that separately.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..aec4014291be 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 		len = left;
 
 	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-	if (ret > 0)
+	if (ret > 0) {
 		*ppos += ret;
+		file_accessed(in);
+	}
 
 	return ret;
 }
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 		ret = file_remove_suid(out);
-		if (!ret)
+		if (!ret) {
+			file_update_time(out);
 			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+		}
 		mutex_unlock(&inode->i_mutex);
 	} while (ret > 0);
 	splice_from_pipe_end(pipe, &sd);
-- 
cgit v1.2.2


From 7ad9bb651fc2036ea94bed94da76a4b08959a911 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 11 Sep 2009 16:51:28 -0400
Subject: ext4: Fix initalization of s_flex_groups

The s_flex_groups array should have been initialized using atomic_add
to sum up the free counts from the block groups that make up a
flex_bg.  By using atomic_set, the value of the s_flex_groups array
was set to the values of the last block group in the flex_bg.

The impact of this bug is that the block and inode allocation
algorithms might not pick the best flex_bg for new allocation.

Thanks to Damien Guibouret for pointing out this problem!

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9f6fa3f74629..04c693357336 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1694,12 +1694,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
-		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-			   ext4_free_inodes_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
-			   ext4_free_blks_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
-			   ext4_used_dirs_count(sb, gdp));
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_add(ext4_free_blks_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}
 
 	return 1;
-- 
cgit v1.2.2


From a9327cac440be4d8333bba975cbbf76045096275 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Fri, 11 Sep 2009 09:18:54 +0200
Subject: Seperate read and write statistics of in_flight requests

Currently, there is a single in_flight counter measuring the number of
requests in the request_queue. But some monitoring tools would like to
know how many read requests and write requests are in progress. Split the
current in_flight counter into two seperate counters for read and write.

This information is exported as a sysfs attribute, as changing the
currently available stat files would break the existing tools.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..619ba99dfe39 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
-- 
cgit v1.2.2


From 746cd1e7e4a555ddaee53b19a46e05c9c61eaf09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Sep 2009 07:35:43 +0200
Subject: block: use blkdev_issue_discard in blk_ioctl_discard

blk_ioctl_discard duplicates large amounts of code from blkdev_issue_discard,
the only difference between the two is that blkdev_issue_discard needs to
send a barrier discard request and blk_ioctl_discard a non-barrier one,
and blk_ioctl_discard needs to wait on the request.  To facilitates this
add a flags argument to blkdev_issue_discard to control both aspects of the
behaviour.  This will be very useful later on for using the waiting
funcitonality for other callers.

Based on an earlier patch from Matthew Wilcox <matthew@wil.cx>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 fs/gfs2/rgrp.c         | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+			     DISCARD_FL_BARRIER);
 }
 #endif
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..fbc43241f2ef 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 					goto start_new_extent;
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
-							    nr_sects, GFP_NOFS);
+							    nr_sects, GFP_NOFS,
+							    DISCARD_FL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+					 DISCARD_FL_BARRIER);
 		if (rv)
 			goto fail;
 	}
-- 
cgit v1.2.2


From 86d006365610fe6cda243d89b67d5047dca44656 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 14 Sep 2009 09:50:57 +0100
Subject: GFS2: Whitespace fixes

Reported-by: Daniel Walker <dwalker@fifo99.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c  | 2 +-
 fs/gfs2/xattr.c | 4 ++--
 fs/gfs2/xattr.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index caaa665d4033..18d3a28554ac 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1463,7 +1463,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
-	        (unsigned long long)rgd->rd_addr);
+		(unsigned long long)rgd->rd_addr);
 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
 	rgd->rd_flags |= GFS2_RDF_ERROR;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 34ae9ba4c9fe..8a0f8ef6ee27 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -562,7 +562,7 @@ int gfs2_xattr_get(struct inode *inode, int type, const char *name,
 		return error;
 	if (!el.el_ea)
 		return -ENODATA;
-	if (size) 
+	if (size)
 		error = gfs2_ea_get_copy(ip, &el, buffer, size);
 	else
 		error = GFS2_EA_DATA_LEN(el.el_ea);
@@ -1046,7 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
 				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
 	}
 
-	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
 }
 
 static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 4040a188f63b..cbdfd7743733 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
 #define GFS2_EA_SIZE(ea) \
 ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
       ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
-                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+				  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
 
 #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
 #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
-- 
cgit v1.2.2


From 1b2f5a641bca91966fd91dc489238068a0ab4b69 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 22 Aug 2009 19:10:07 +0900
Subject: nilfs2: fix ignored error code in __nilfs_read_inode()

The __nilfs_read_inode function is ignoring the error code returned
from nilfs_read_inode_common(), and wrongly delivers a success code
(zero) when it escapes from the function in erroneous cases.

This adds the missing error handling.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..807e584b163d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
 
-	if (nilfs_read_inode_common(inode, raw_inode))
+	err = nilfs_read_inode_common(inode, raw_inode);
+	if (err)
 		goto failed_unmap;
 
 	if (S_ISREG(inode->i_mode)) {
-- 
cgit v1.2.2


From b5696e5e0dbfb6323277d51d67d230317c18aba9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 3 Sep 2009 17:42:48 +0200
Subject: nilfs2: fix format string compile warning (ino_t)

Unlike on most other architectures ino_t is an unsigned int on s390.
So add an explicit cast to avoid this compile warning:

fs/nilfs2/recovery.c: In function 'recover_dsync_blocks':
fs/nilfs2/recovery.c:555: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		printk(KERN_WARNING
 		       "NILFS warning: error recovering data block "
 		       "(err=%d, ino=%lu, block-offset=%llu)\n",
-		       err, rb->ino, (unsigned long long)rb->blkoff);
+		       err, (unsigned long)rb->ino,
+		       (unsigned long long)rb->blkoff);
 		if (!err2)
 			err2 = err;
  next:
-- 
cgit v1.2.2


From 027d6404eb4327878454db72a006adfcb8001bb8 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 2 Aug 2009 22:45:33 +0900
Subject: nilfs2: use semaphore to protect pointer to a writable FS-instance

will get rid of nilfs_get_writer() and nilfs_put_writer() pair used to
retain a writable FS-instance for a period.

The pair functions were making up some kind of recursive lock with a
mutex, but they became overkill since the commit
201913ed746c7724a40d33ee5a0b6a1fd2ef3193.  Furthermore, they caused
the following lockdep warning because the mutex can be released by a
task which didn't lock it:

 =====================================
 [ BUG: bad unlock balance detected! ]
 -------------------------------------
 kswapd0/422 is trying to release lock (&nilfs->ns_writer_mutex) at:
 [<c1359ff5>] mutex_unlock+0x8/0xa
 but there are no more locks to release!

 other info that might help us debug this:
 no locks held by kswapd0/422.

 stack backtrace:
 Pid: 422, comm: kswapd0 Not tainted 2.6.31-rc4-nilfs #51
 Call Trace:
  [<c1358f97>] ? printk+0xf/0x18
  [<c104fea7>] print_unlock_inbalance_bug+0xcc/0xd7
  [<c11578de>] ? prop_put_global+0x3/0x35
  [<c1050195>] lock_release+0xed/0x1dc
  [<c1359ff5>] ? mutex_unlock+0x8/0xa
  [<c1359f83>] __mutex_unlock_slowpath+0xaf/0x119
  [<c1359ff5>] mutex_unlock+0x8/0xa
  [<d1284add>] nilfs_mdt_write_page+0xd8/0xe1 [nilfs2]
  [<c1092653>] shrink_page_list+0x379/0x68d
  [<c109171b>] ? isolate_pages_global+0xb4/0x18c
  [<c1092bd2>] shrink_list+0x26b/0x54b
  [<c10930be>] shrink_zone+0x20c/0x2a2
  [<c10936b7>] kswapd+0x407/0x591
  [<c1091667>] ? isolate_pages_global+0x0/0x18c
  [<c1040603>] ? autoremove_wake_function+0x0/0x33
  [<c10932b0>] ? kswapd+0x0/0x591
  [<c104033b>] kthread+0x69/0x6e
  [<c10402d2>] ? kthread+0x0/0x6e
  [<c1003e33>] kernel_thread_helper+0x7/0x1a

This patch uses a reader/writer semaphore instead of the own lock and
kills this warning.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c       |  8 +++++---
 fs/nilfs2/the_nilfs.c |  3 +--
 fs/nilfs2/the_nilfs.h | 27 ++++++---------------------
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..a5cd80162dc5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -402,6 +402,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = container_of(page->mapping,
 					   struct inode, i_data);
 	struct super_block *sb = inode->i_sb;
+	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
 	struct nilfs_sb_info *writer = NULL;
 	int err = 0;
 
@@ -411,9 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 	if (page->mapping->assoc_mapping)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
-		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
+		down_read(&nilfs->ns_writer_sem);
+		writer = nilfs->ns_writer;
 		if (!writer) {
-			nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+			up_read(&nilfs->ns_writer_sem);
 			return -EROFS;
 		}
 		sb = writer->s_super;
@@ -425,7 +427,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		nilfs_flush_segment(sb, inode->i_ino);
 
 	if (writer)
-		nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+		up_read(&nilfs->ns_writer_sem);
 	return err;
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..0c4573653b87 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 
 	nilfs->ns_bdev = bdev;
 	atomic_set(&nilfs->ns_count, 1);
-	atomic_set(&nilfs->ns_writer_refcount, -1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	init_rwsem(&nilfs->ns_super_sem);
 	mutex_init(&nilfs->ns_mount_mutex);
-	mutex_init(&nilfs->ns_writer_mutex);
+	init_rwsem(&nilfs->ns_writer_sem);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..fa3a1dfe4053 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -50,8 +50,7 @@ enum {
  * @ns_sem: semaphore for shared states
  * @ns_super_sem: semaphore for global operations across super block instances
  * @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_mutex: mutex protecting ns_writer attach/detach
- * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_writer_sem: semaphore protecting ns_writer attach/detach
  * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
@@ -100,8 +99,7 @@ struct the_nilfs {
 	struct rw_semaphore	ns_sem;
 	struct rw_semaphore	ns_super_sem;
 	struct mutex		ns_mount_mutex;
-	struct mutex		ns_writer_mutex;
-	atomic_t		ns_writer_refcount;
+	struct rw_semaphore	ns_writer_sem;
 
 	/*
 	 * components protected by ns_super_sem
@@ -221,34 +219,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
 	atomic_inc(&nilfs->ns_count);
 }
 
-static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
-{
-	if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
-		mutex_lock(&nilfs->ns_writer_mutex);
-	return nilfs->ns_writer;
-}
-
-static inline void nilfs_put_writer(struct the_nilfs *nilfs)
-{
-	if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
-		mutex_unlock(&nilfs->ns_writer_mutex);
-}
-
 static inline void
 nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 {
-	mutex_lock(&nilfs->ns_writer_mutex);
+	down_write(&nilfs->ns_writer_sem);
 	nilfs->ns_writer = sbi;
-	mutex_unlock(&nilfs->ns_writer_mutex);
+	up_write(&nilfs->ns_writer_sem);
 }
 
 static inline void
 nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 {
-	mutex_lock(&nilfs->ns_writer_mutex);
+	down_write(&nilfs->ns_writer_sem);
 	if (sbi == nilfs->ns_writer)
 		nilfs->ns_writer = NULL;
-	mutex_unlock(&nilfs->ns_writer_mutex);
+	up_write(&nilfs->ns_writer_sem);
 }
 
 static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
-- 
cgit v1.2.2


From 143511046765504d2d1be633efd710f8d84e0407 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 6 Sep 2009 17:49:49 +0900
Subject: nilfs2: always lookup disk block address before reading metadata
 block

The current metadata file code skips disk address lookup for its data
block if the buffer has a mapped flag.

This has a potential risk to cause read request to be performed
against the stale block address that GC moved, and it may lead to meta
data corruption.  The mapped flag is safe if the buffer has an
uptodate flag, otherwise it may prevent necessary update of disk
address in the next read.

This will avoid the potential problem by ensuring disk address lookup
before reading metadata block even for buffers with the mapped flag.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a5cd80162dc5..1ae8d56052c3 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 		goto failed_unlock;
 
 	err = -EEXIST;
-	if (buffer_uptodate(bh) || buffer_mapped(bh))
+	if (buffer_uptodate(bh))
 		goto failed_bh;
-#if 0
-	/* The uptodate flag is not protected by the page lock, but
-	   the mapped flag is.  Thus, we don't have to wait the buffer. */
+
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		goto failed_bh;
-#endif
 
 	bh->b_bdev = nilfs->ns_bdev;
 	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		unlock_buffer(bh);
 		goto out;
 	}
-	if (!buffer_mapped(bh)) { /* unused buffer */
-		ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
-					&blknum);
-		if (unlikely(ret)) {
-			unlock_buffer(bh);
-			goto failed_bh;
-		}
-		bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-		bh->b_blocknr = blknum;
-		set_buffer_mapped(bh);
+
+	ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
+	if (unlikely(ret)) {
+		unlock_buffer(bh);
+		goto failed_bh;
 	}
+	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+	bh->b_blocknr = blknum;
+	set_buffer_mapped(bh);
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-- 
cgit v1.2.2


From b58a285ba40866e22e5876969a8f796a74311d9e Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Wed, 24 Jun 2009 20:06:34 +0900
Subject: nilfs2: implement nilfs_show_options to display mount options in
 /proc/mounts

This is a patch to display mount options in procfs.
Mount options will show up in the /proc/mounts as other fs does.

...
/dev/sda6 /mnt nilfs2 ro,relatime,barrier=off,cp=3,order=strict 0 0
...

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..ebbefb9d26c9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
 #include <linux/writeback.h>
 #include <linux/kobject.h>
 #include <linux/exportfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "alloc.h"
@@ -529,6 +531,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct super_block *sb = vfs->mnt_sb;
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+
+	if (!nilfs_test_opt(sbi, BARRIER))
+		seq_printf(seq, ",barrier=off");
+	if (nilfs_test_opt(sbi, SNAPSHOT))
+		seq_printf(seq, ",cp=%llu",
+			   (unsigned long long int)sbi->s_snapshot_cno);
+	if (nilfs_test_opt(sbi, ERRORS_RO))
+		seq_printf(seq, ",errors=remount-ro");
+	if (nilfs_test_opt(sbi, ERRORS_PANIC))
+		seq_printf(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, STRICT_ORDER))
+		seq_printf(seq, ",order=strict");
+
+	return 0;
+}
+
 static struct super_operations nilfs_sops = {
 	.alloc_inode    = nilfs_alloc_inode,
 	.destroy_inode  = nilfs_destroy_inode,
@@ -546,7 +568,7 @@ static struct super_operations nilfs_sops = {
 	.remount_fs     = nilfs_remount,
 	.clear_inode    = nilfs_clear_inode,
 	/* .umount_begin */
-	/* .show_options */
+	.show_options = nilfs_show_options
 };
 
 static struct inode *
-- 
cgit v1.2.2


From ec5d66abdb0caf8b753a138568f20770a3d64c8c Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Fri, 10 Jul 2009 19:57:28 +0900
Subject: nilfs2: remove redundant super block commit

This removes redundant super block commit.

nilfs_write_super will call nilfs_commit_super to store super block
into block device.  However, nilfs_put_super will call
nilfs_commit_super right after calling nilfs_write_super.  So calling
nilfs_write_super in nilfs_put_super would be redundant.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ebbefb9d26c9..e543eda0d939 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -313,9 +313,6 @@ static void nilfs_put_super(struct super_block *sb)
 
 	lock_kernel();
 
-	if (sb->s_dirt)
-		nilfs_write_super(sb);
-
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-- 
cgit v1.2.2


From 6233caa9d5b153c2190d6c1c35c1dd1010104fc1 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:26:33 +0900
Subject: nilfs2: fix disorder of nilfs_write_super in nilfs_sync_fs

This fixes disorder of nilfs_write_super in nilfs_sync_fs.  Commiting
super block must be the end of the function so that every changes are
reflected.

->sync_fs() is not called frequently so this makes nilfs_sync_fs call
nilfs_commit_super instead of nilfs_write_super.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e543eda0d939..9926a1d6d225 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -385,13 +385,19 @@ static void nilfs_write_super(struct super_block *sb)
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err = 0;
 
-	nilfs_write_super(sb);
-
 	/* This function is called when super block should be written back */
 	if (wait)
 		err = nilfs_construct_segment(sb);
+
+	down_write(&nilfs->ns_sem);
+	if (sb->s_dirt)
+		nilfs_commit_super(sbi, 1);
+	up_write(&nilfs->ns_sem);
+
 	return err;
 }
 
-- 
cgit v1.2.2


From 79efdd94111f30c373fce05e4e5822d8ff671c2a Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:26:34 +0900
Subject: nilfs2: clean up nilfs_write_super

Separate conditions that check if syncing super block and alternative
super block are required as inline functions to reuse the conditions.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c     | 11 +++--------
 fs/nilfs2/the_nilfs.h | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 9926a1d6d225..70e8613a1954 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -367,17 +367,12 @@ static void nilfs_write_super(struct super_block *sb)
 
 	down_write(&nilfs->ns_sem);
 	if (!(sb->s_flags & MS_RDONLY)) {
-		struct nilfs_super_block **sbp = nilfs->ns_sbp;
-		u64 t = get_seconds();
-		int dupsb;
-
-		if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
-		    t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
+		if (!nilfs_discontinued(nilfs) &&
+		    !nilfs_sb_need_update(nilfs)) {
 			up_write(&nilfs->ns_sem);
 			return;
 		}
-		dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
-		nilfs_commit_super(sbi, dupsb);
+		nilfs_commit_super(sbi, nilfs_altsb_need_update(nilfs));
 	}
 	sb->s_dirt = 0;
 	up_write(&nilfs->ns_sem);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fa3a1dfe4053..68e9626e3d44 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -200,6 +200,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 #define NILFS_SB_FREQ		10
 #define NILFS_ALTSB_FREQ	60  /* spare superblock */
 
+static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
+{
+	u64 t = get_seconds();
+	return t < nilfs->ns_sbwtime[0] ||
+		 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+}
+
+static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+{
+	u64 t = get_seconds();
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+}
+
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
 struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
-- 
cgit v1.2.2


From 1dfa27105a6cf4171c439fd40a004a0a17838115 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:33:49 +0900
Subject: nilfs2: stop using periodic write_super callback

This removes nilfs_write_super and commit super block in nilfs
internal thread, instead of periodic write_super callback.

VFS layer calls ->write_super callback periodically.  However,
it looks like that calling back is ommited when disk I/O is busy.
And when cleanerd (nilfs GC) is runnig, disk I/O tend to be busy thus
nilfs superblock is not synchronized as nilfs designed.

To avoid it, syncing superblock by nilfs thread instead of pdflush.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c |  7 ++++++-
 fs/nilfs2/super.c   | 46 +---------------------------------------------
 2 files changed, 7 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
 		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
-			req->sb_err = nilfs_commit_super(sbi, 0);
+			req->sb_err = nilfs_commit_super(sbi,
+					nilfs_altsb_need_update(nilfs));
 			up_write(&nilfs->ns_sem);
 		}
 	}
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
 	} else {
 		DEFINE_WAIT(wait);
 		int should_sleep = 1;
+		struct the_nilfs *nilfs;
 
 		prepare_to_wait(&sci->sc_wait_daemon, &wait,
 				TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
 			   time_after_eq(jiffies, sci->sc_timer->expires));
+		nilfs = sci->sc_sbi->s_nilfs;
+		if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+			set_nilfs_discontinued(nilfs);
 	}
 	goto loop;
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 70e8613a1954..ed8ec5718ae6 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
-static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 /**
@@ -335,49 +334,6 @@ static void nilfs_put_super(struct super_block *sb)
 	unlock_kernel();
 }
 
-/**
- * nilfs_write_super - write super block(s) of NILFS
- * @sb: super_block
- *
- * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
- * clears s_dirt.  This function is called in the section protected by
- * lock_super().
- *
- * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
- * of the struct the_nilfs.  Lock order must be as follows:
- *
- *   1. lock_super()
- *   2.    down_write(&nilfs->ns_sem)
- *
- * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
- * of the super block (nilfs->ns_sbp[]).
- *
- * In most cases, VFS functions call lock_super() before calling these
- * methods.  So we must be careful not to bring on deadlocks when using
- * lock_super();  see generic_shutdown_super(), write_super(), and so on.
- *
- * Note that order of lock_kernel() and lock_super() depends on contexts
- * of VFS.  We should also note that lock_kernel() can be used in its
- * protective section and only the outermost one has an effect.
- */
-static void nilfs_write_super(struct super_block *sb)
-{
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-
-	down_write(&nilfs->ns_sem);
-	if (!(sb->s_flags & MS_RDONLY)) {
-		if (!nilfs_discontinued(nilfs) &&
-		    !nilfs_sb_need_update(nilfs)) {
-			up_write(&nilfs->ns_sem);
-			return;
-		}
-		nilfs_commit_super(sbi, nilfs_altsb_need_update(nilfs));
-	}
-	sb->s_dirt = 0;
-	up_write(&nilfs->ns_sem);
-}
-
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -558,7 +514,7 @@ static struct super_operations nilfs_sops = {
 	/* .drop_inode	  = nilfs_drop_inode, */
 	.delete_inode   = nilfs_delete_inode,
 	.put_super      = nilfs_put_super,
-	.write_super    = nilfs_write_super,
+	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
 	/* .write_super_lockfs */
 	/* .unlockfs */
-- 
cgit v1.2.2


From c1b353f04a42f1b531be920149b85343d56a6b3b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 19 Jun 2009 15:25:42 +0900
Subject: nilfs2: use GFP_NOIO for bio_alloc instead of GFP_NOWAIT

Alberto Bertogli advised me about bio_alloc() use in nilfs:
On Sat, 13 Jun 2009 22:52:40 -0300, Alberto Bertogli wrote:
> By the way, those bio_alloc()s are using GFP_NOWAIT but it looks
> like they could use at least GFP_NOIO or GFP_NOFS, since the caller
> can (and sometimes do) sleep. The only caller is nilfs_submit_bh(),
> which calls nilfs_submit_seg_bio() which can sleep calling
> wait_for_completion().

This takes in the comment and replaces the use of GFP_NOWAIT flag with
GFP_NOIO.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+	bio = bio_alloc(GFP_NOIO, nr_vecs);
 	if (bio == NULL) {
 		while (!bio && (nr_vecs >>= 1))
-			bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+			bio = bio_alloc(GFP_NOIO, nr_vecs);
 	}
 	if (likely(bio)) {
 		bio->bi_bdev = sb->s_bdev;
-- 
cgit v1.2.2


From a4f0b9c5b4ae83636dafde8f3a0e04b5e411a0f3 Mon Sep 17 00:00:00 2001
From: Zhang Qiang <zhangqiang.buaa@gmail.com>
Date: Sun, 9 Aug 2009 19:13:10 +0800
Subject: nilfs2: An unassigned variable is assigned to a never used structure
 member

nilfs2: In procedure 'nilfs_get_sb()', when a nilfs filesysttem is
mounted for the first time, local variable 'nilfs->ns_last_cno' is
used before loading the latest checkpoint number from disk (in
'nilfs_fill_super'). 'nilfs->ns_last_cno' is assigned to 'sd.cno', but
'sd.cno' has never been used in the procedure.

Signed-off-by: Zhang Qiang <zhangqiang.buaa@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ed8ec5718ae6..019752f7d2da 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1103,10 +1103,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 */
 	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
 
-	if (!sd.cno)
-		/* trying to get the latest checkpoint.  */
-		sd.cno = nilfs_last_cno(nilfs);
-
 	/*
 	 * Get super block instance holding the nilfs_sb_info struct.
 	 * A new instance is allocated if no existing mount is present or
-- 
cgit v1.2.2


From 43be0ec0387a5ccce2e064cb78502e7b2b4dd590 Mon Sep 17 00:00:00 2001
From: Zhu Yanhai <zhu.yanhai@gmail.com>
Date: Wed, 12 Aug 2009 14:17:59 +0800
Subject: nilfs2: add more check routines in mount process

nilfs2: Add more safeguard routines and protections in mount process,
which also makes nilfs2 report consistency error messages when
checkpoint number is invalid.

Signed-off-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.c | 11 ++++++++---
 fs/nilfs2/super.c  |  7 ++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 	void *kaddr;
 	int ret;
 
-	if (cno == 0)
-		return -ENOENT; /* checkpoint number 0 is invalid */
+	/* CP number is invalid if it's zero or larger than the
+	largest	exist one.*/
+	if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
+		return -ENOENT;
 	down_read(&NILFS_MDT(cpfile)->mi_sem);
 
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 		goto out;
 	kaddr = kmap_atomic(bh->b_page, KM_USER0);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
-	ret = nilfs_checkpoint_snapshot(cp);
+	if (nilfs_checkpoint_invalid(cp))
+		ret = -ENOENT;
+	else
+		ret = nilfs_checkpoint_snapshot(cp);
 	kunmap_atomic(kaddr, KM_USER0);
 	brelse(bh);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 019752f7d2da..50284add7880 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -792,10 +792,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
 	if (sb->s_flags & MS_RDONLY) {
 		if (nilfs_test_opt(sbi, SNAPSHOT)) {
+			down_read(&nilfs->ns_segctor_sem);
 			err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
 						       sbi->s_snapshot_cno);
-			if (err < 0)
+			up_read(&nilfs->ns_segctor_sem);
+			if (err < 0) {
+				if (err == -ENOENT)
+					err = -EINVAL;
 				goto failed_sbi;
+			}
 			if (!err) {
 				printk(KERN_ERR
 				       "NILFS: The specified checkpoint is "
-- 
cgit v1.2.2


From 1cf58fa840472ec7df6bf2312885949ebb308853 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 3 Sep 2009 22:24:17 +0900
Subject: nilfs2: shorten freeze period due to GC in write operation v3

This is a re-revised patch to shorten freeze period.
This version include a fix of the bug Konishi-san mentioned last time.

When GC is runnning, GC moves live block to difference segments.
Copying live blocks into memory is done in a transaction,
however it is not necessarily to be in the transaction.
This patch will get the nilfs_ioctl_move_blocks() out from
transaction lock and put it before the transaction.

I ran sysbench fileio test against nilfs partition.
I copied some DVD/CD images and created snapshot to create live blocks
before starting the benchmark.

Followings are summary of rc8 and rc8 w/ the patch of per-request
statistics, which is min/max and avg.  I ran each test three times and
bellow is average of those numers.

According to this benchmark result, average time is slightly degrated.
However, worstcase (max) result is significantly improved.
This can address a few seconds write freeze.

- random write per-request performance of rc8
 min   0.843ms
 max 680.406ms
 avg   3.050ms
- random write per-request performance of rc8 w/ this patch
 min   0.843ms -> 100.00%
 max 380.490ms ->  55.90%
 avg   3.233ms -> 106.00%

- sequential write per-request performance of rc8
 min   0.736ms
 max 774.343ms
 avg   2.883ms
- sequential write per-request performance of rc8 w/ this patch
 min   0.720ms ->  97.80%
 max  644.280ms->  83.20%
 avg   3.130ms -> 108.50%

-----8<-----8<-----nilfs_cleanerd.conf-----8<-----8<-----
protection_period       150
selection_policy        timestamp       # timestamp in ascend order
nsegments_per_clean     2
cleaning_interval       2
retry_interval          60
use_mmap
log_priority            info
-----8<-----8<-----nilfs_cleanerd.conf-----8<-----8<-----

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c     | 26 +++++++++++++++++++-------
 fs/nilfs2/the_nilfs.h |  2 ++
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 	const char *msg;
 	int ret;
 
-	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
-	if (ret < 0) {
-		msg = "cannot read source blocks";
-		goto failed;
-	}
-
 	ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
 	if (ret < 0) {
 		/*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		}
 	}
 
-	ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+	/*
+	 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+	 * which will operates an inode list without blocking.
+	 * To protect the list from concurrent operations,
+	 * nilfs_ioctl_move_blocks should be atomic operation.
+	 */
+	if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
+		ret = -EBUSY;
+		goto out_free;
+	}
+
+	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+	if (ret < 0)
+		printk(KERN_ERR "NILFS: GC failed during preparation: "
+			"cannot read source blocks: err=%d\n", ret);
+	else
+		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+
+	clear_nilfs_gc_running(nilfs);
 
  out_free:
 	while (--n >= 0)
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 68e9626e3d44..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
 	THE_NILFS_LOADED,       /* Roll-back/roll-forward has done and
 				   the latest checkpoint was loaded */
 	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
+	THE_NILFS_GC_RUNNING,	/* gc process is running */
 };
 
 /**
@@ -195,6 +196,7 @@ static inline int nilfs_##name(struct the_nilfs *nilfs)			\
 THE_NILFS_FNS(INIT, init)
 THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
+THE_NILFS_FNS(GC_RUNNING, gc_running)
 
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
-- 
cgit v1.2.2


From 9ead98637300dc7caefd904bbe1e092bf4d21f87 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 14 Aug 2009 23:39:45 +0900
Subject: nilfs2: remove nilfs_dat_abort_start and nilfs_dat_abort_free

These functions are not called from any functions.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dat.c | 11 -----------
 fs/nilfs2/dat.h |  1 -
 2 files changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1cfcc1a0fdf1 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_commit_free_entry(dat, req);
 }
 
-void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	nilfs_dat_abort_entry(dat, req);
-	nilfs_palloc_abort_free_entry(dat, req);
-}
-
 int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	nilfs_dat_commit_entry(dat, req);
 }
 
-void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	nilfs_dat_abort_entry(dat, req);
-}
-
 int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..4b0b47ea85ed 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -39,7 +39,6 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
 			    sector_t);
-void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
 void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
-- 
cgit v1.2.2


From 6d28f7ea43856449ed2f344cb209af3ba1c6b757 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 01:14:10 +0900
Subject: nilfs2: remove unused btree argument from btree functions

Even though many btree functions take a btree object as their first
argument, most of them are not used in their functions.

This sticky use of the btree argument is hurting code readability and
giving the possibility of inefficient code generation.

So, this removes the unnecessary btree arguments.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.c | 456 +++++++++++++++++++++++++-----------------------------
 1 file changed, 208 insertions(+), 248 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..21ed8ccea4b9 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
 	kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	return (struct nilfs_btree_path *)
-		kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
 }
 
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
-					 struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
 	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
-				  struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -101,8 +97,7 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
-				   struct nilfs_btree_path *path)
+static void nilfs_btree_clear_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -148,129 +143,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 }
 
 static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
 {
 	return node->bn_flags;
 }
 
 static inline void
-nilfs_btree_node_set_flags(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int flags)
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
 {
 	node->bn_flags = flags;
 }
 
-static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
-					const struct nilfs_btree_node *node)
+static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
 {
-	return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
+	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
 }
 
 static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
 {
 	return node->bn_level;
 }
 
 static inline void
-nilfs_btree_node_set_level(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int level)
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
 {
 	node->bn_level = level;
 }
 
 static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
 {
 	return le16_to_cpu(node->bn_nchildren);
 }
 
 static inline void
-nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
-			       struct nilfs_btree_node *node,
-			       int nchildren)
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 {
 	node->bn_nchildren = cpu_to_le16(nchildren);
 }
 
-static inline int
-nilfs_btree_node_size(const struct nilfs_btree *btree)
+static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
 {
 	return 1 << btree->bt_bmap.b_inode->i_blkbits;
 }
 
 static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MIN :
 		NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
 }
 
 static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MAX :
 		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
 }
 
 static inline __le64 *
-nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 {
 	return (__le64 *)((char *)(node + 1) +
-			  (nilfs_btree_node_root(btree, node) ?
+			  (nilfs_btree_node_root(node) ?
 			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
 }
 
 static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+		       const struct nilfs_btree *btree)
 {
-	return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
-			  nilfs_btree_node_nchildren_max(btree, node));
+	return (__le64 *)(nilfs_btree_node_dkeys(node) +
+			  nilfs_btree_node_nchildren_max(node, btree));
 }
 
 static inline __u64
-nilfs_btree_node_get_key(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node, int index)
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
-					index));
+	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
 }
 
 static inline void
-nilfs_btree_node_set_key(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node, int index, __u64 key)
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
 {
-	*(nilfs_btree_node_dkeys(btree, node) + index) =
-		nilfs_bmap_key_to_dkey(key);
+	*(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
 }
 
 static inline __u64
 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node,
-			 int index)
+			 const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
+	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
 					index));
 }
 
 static inline void
 nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node,
-			 int index,
-			 __u64 ptr)
+			 struct nilfs_btree_node *node, int index, __u64 ptr)
 {
-	*(nilfs_btree_node_dptrs(btree, node) + index) =
+	*(nilfs_btree_node_dptrs(node, btree) + index) =
 		nilfs_bmap_ptr_to_dptr(ptr);
 }
 
@@ -283,12 +259,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int i;
 
-	nilfs_btree_node_set_flags(btree, node, flags);
-	nilfs_btree_node_set_level(btree, node, level);
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_flags(node, flags);
+	nilfs_btree_node_set_level(node, level);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nchildren; i++) {
 		dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
 		dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +281,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
 	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +296,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 
 	lnchildren += n;
 	rnchildren -= n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +310,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
 	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +325,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 
 	lnchildren -= n;
 	rnchildren += n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +338,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (index < nchildren) {
 		memmove(dkeys + index + 1, dkeys + index,
 			(nchildren - index) * sizeof(*dkeys));
@@ -374,7 +350,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	dkeys[index] = nilfs_bmap_key_to_dkey(key);
 	dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
 	nchildren++;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +364,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	key = nilfs_bmap_dkey_to_key(dkeys[index]);
 	ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (keyp != NULL)
 		*keyp = key;
 	if (ptrp != NULL)
@@ -405,11 +381,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 			(nchildren - index - 1) * sizeof(*dptrs));
 	}
 	nchildren--;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
-static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
-				   const struct nilfs_btree_node *node,
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
 				   __u64 key, int *indexp)
 {
 	__u64 nkey;
@@ -417,12 +392,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 
 	/* binary search */
 	low = 0;
-	high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	high = nilfs_btree_node_get_nchildren(node) - 1;
 	index = 0;
 	s = 0;
 	while (low <= high) {
 		index = (low + high) / 2;
-		nkey = nilfs_btree_node_get_key(btree, node, index);
+		nkey = nilfs_btree_node_get_key(node, index);
 		if (nkey == key) {
 			s = 0;
 			goto out;
@@ -436,9 +411,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 	}
 
 	/* adjust index */
-	if (nilfs_btree_node_get_level(btree, node) >
-	    NILFS_BTREE_LEVEL_NODE_MIN) {
-		if ((s > 0) && (index > 0))
+	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+		if (s > 0 && index > 0)
 			index--;
 	} else if (s < 0)
 		index++;
@@ -456,25 +430,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
-			     const struct nilfs_btree_path *path,
-			     int level)
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_path *path,
-			 int level)
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
 
 static inline int nilfs_btree_height(const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
-		+ 1;
+	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
 
 static inline struct nilfs_btree_node *
@@ -484,7 +453,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 {
 	return (level == nilfs_btree_height(btree) - 1) ?
 		nilfs_btree_get_root(btree) :
-		nilfs_btree_get_nonroot_node(btree, path, level);
+		nilfs_btree_get_nonroot_node(path, level);
 }
 
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +465,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	int level, index, found, ret;
 
 	node = nilfs_btree_get_root(btree);
-	level = nilfs_btree_node_get_level(btree, node);
-	if ((level < minlevel) ||
-	    (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+	level = nilfs_btree_node_get_level(node);
+	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
 		return -ENOENT;
 
-	found = nilfs_btree_node_lookup(btree, node, key, &index);
+	found = nilfs_btree_node_lookup(node, key, &index);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -510,14 +478,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
 		if (!found)
-			found = nilfs_btree_node_lookup(btree, node, key,
-							&index);
+			found = nilfs_btree_node_lookup(node, key, &index);
 		else
 			index = 0;
-		if (index < nilfs_btree_node_nchildren_max(btree, node))
+		if (index < nilfs_btree_node_nchildren_max(node, btree))
 			ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		else {
 			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +511,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	int index, level, ret;
 
 	node = nilfs_btree_get_root(btree);
-	index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	index = nilfs_btree_node_get_nchildren(node) - 1;
 	if (index < 0)
 		return -ENOENT;
-	level = nilfs_btree_node_get_level(btree, node);
+	level = nilfs_btree_node_get_level(node);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -556,15 +523,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
-		index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
+		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
 	}
 
 	if (keyp != NULL)
-		*keyp = nilfs_btree_node_get_key(btree, node, index);
+		*keyp = nilfs_btree_node_get_key(node, index);
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
@@ -580,18 +547,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -608,10 +575,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	int level = NILFS_BTREE_LEVEL_NODE_MIN;
 	int ret, cnt, index, maxlevel;
 
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -631,8 +598,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	node = nilfs_btree_get_node(btree, path, level);
 	index = path[level].bp_index + 1;
 	for (;;) {
-		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
-			if (nilfs_btree_node_get_key(btree, node, index) !=
+		while (index < nilfs_btree_node_get_nchildren(node)) {
+			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
 			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +620,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		/* look-up right sibling node */
 		node = nilfs_btree_get_node(btree, path, level + 1);
 		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
-		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+		if (index >= nilfs_btree_node_get_nchildren(node) ||
+		    nilfs_btree_node_get_key(node, index) != key + cnt)
 			break;
 		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level + 1].bp_index = index;
@@ -664,7 +631,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
 		if (ret < 0)
 			goto out;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -672,8 +639,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -685,9 +652,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 		do {
 			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
-				btree,
-				nilfs_btree_get_nonroot_node(
-					btree, path, level),
+				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +663,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 
 	/* root */
 	if (level == nilfs_btree_height(btree) - 1) {
-		nilfs_btree_node_set_key(btree,
-					 nilfs_btree_get_root(btree),
+		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
 					 path[level].bp_index, key);
 	}
 }
@@ -712,7 +676,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +685,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-						nilfs_btree_node_get_key(
-							btree, node, 0));
+						nilfs_btree_node_get_key(node,
+									 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +704,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +728,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	if (move) {
 		brelse(path[level].bp_bh);
@@ -791,10 +755,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +780,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	if (move) {
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		path[level + 1].bp_index++;
 	} else {
 		brelse(path[level].bp_sib_bh);
@@ -846,9 +809,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -867,16 +830,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	newkey = nilfs_btree_node_get_key(btree, right, 0);
+	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
 	if (move) {
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
 					path[level].bp_index);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_bh);
@@ -885,7 +847,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	} else {
 		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_sib_bh);
@@ -905,12 +867,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_sib_bh);
 
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_sib_node(btree, path, level);
+	child = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, root);
+	n = nilfs_btree_node_get_nchildren(root);
 
 	nilfs_btree_node_move_right(btree, root, child, n);
-	nilfs_btree_node_set_level(btree, root, level + 1);
+	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +884,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-	*keyp = nilfs_btree_node_get_key(btree, child, 0);
+	*keyp = nilfs_btree_node_get_key(child, 0);
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
@@ -1007,9 +969,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		if (nilfs_btree_node_get_nchildren(btree, node) <
-		    nilfs_btree_node_nchildren_max(btree, node)) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_node_get_nchildren(node) <
+		    nilfs_btree_node_nchildren_max(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
@@ -1026,8 +988,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
@@ -1038,15 +1000,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		/* right sibling */
 		if (pindex <
-		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+		    nilfs_btree_node_get_nchildren(parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
@@ -1081,8 +1043,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* root */
 	node = nilfs_btree_get_root(btree);
-	if (nilfs_btree_node_get_nchildren(btree, node) <
-	    nilfs_btree_node_nchildren_max(btree, node)) {
+	if (nilfs_btree_node_get_nchildren(node) <
+	    nilfs_btree_node_nchildren_max(node, btree)) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -1164,10 +1126,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1146,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -1197,7 +1159,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1167,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 		unlock_buffer(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1187,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	n = (nchildren + lnchildren) / 2 - nchildren;
 
@@ -1243,7 +1205,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
@@ -1262,10 +1224,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	n = (nchildren + rnchildren) / 2 - nchildren;
 
@@ -1281,7 +1243,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	brelse(path[level].bp_sib_bh);
@@ -1300,10 +1262,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, node);
+	n = nilfs_btree_node_get_nchildren(node);
 
 	nilfs_btree_node_move_left(btree, left, node, n);
 
@@ -1316,7 +1278,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
-	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
 
 static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1293,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, right);
+	n = nilfs_btree_node_get_nchildren(right);
 
 	nilfs_btree_node_move_left(btree, node, right, n);
 
@@ -1360,11 +1322,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 
 	lock_buffer(path[level].bp_bh);
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_nonroot_node(btree, path, level);
+	child = nilfs_btree_get_nonroot_node(path, level);
 
 	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
-	nilfs_btree_node_set_level(btree, root, level);
-	n = nilfs_btree_node_get_nchildren(btree, child);
+	nilfs_btree_node_set_level(root, level);
+	n = nilfs_btree_node_get_nchildren(child);
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
@@ -1388,7 +1350,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
@@ -1397,8 +1359,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		if (ret < 0)
 			goto err_out_child_node;
 
-		if (nilfs_btree_node_get_nchildren(btree, node) >
-		    nilfs_btree_node_nchildren_min(btree, node)) {
+		if (nilfs_btree_node_get_nchildren(node) >
+		    nilfs_btree_node_nchildren_min(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_delete;
 			stats->bs_nblocks++;
 			goto out;
@@ -1415,8 +1377,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_left;
 				stats->bs_nblocks++;
@@ -1428,7 +1390,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				/* continue; */
 			}
 		} else if (pindex <
-			   nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+			   nilfs_btree_node_get_nchildren(parent) - 1) {
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
@@ -1436,8 +1398,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_right;
 				stats->bs_nblocks++;
@@ -1452,7 +1414,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* no siblings */
 			/* the only child of the root node */
 			WARN_ON(level != nilfs_btree_height(btree) - 2);
-			if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+			if (nilfs_btree_node_get_nchildren(node) - 1 <=
 			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
 				path[level].bp_op = nilfs_btree_shrink;
 				stats->bs_nblocks += 2;
@@ -1523,10 +1485,10 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
@@ -1539,8 +1501,8 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -1551,15 +1513,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -1581,7 +1543,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		if (nchildren > 1)
 			return 0;
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1556,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		return 0;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
 	nextmaxkey = (nchildren > 1) ?
-		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
+		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
 	if (bh != NULL)
 		brelse(bh);
 
@@ -1623,7 +1585,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		WARN_ON(nchildren > 1);
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
 		ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1598,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (nchildren < nitems)
 		nitems = nchildren;
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nitems; i++) {
 		keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
 		ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1986,15 +1948,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	WARN_ON(!buffer_dirty(bh));
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1975,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -2037,12 +1999,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
 
 	get_bh(bh);
 	node = (struct nilfs_btree_node *)bh->b_data;
-	key = nilfs_btree_node_get_key(btree, node, 0);
-	level = nilfs_btree_node_get_level(btree, node);
+	key = nilfs_btree_node_get_key(node, 0);
+	level = nilfs_btree_node_get_level(node);
 	list_for_each(head, &lists[level]) {
 		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
 		cnode = (struct nilfs_btree_node *)cbh->b_data;
-		ckey = nilfs_btree_node_get_key(btree, cnode, 0);
+		ckey = nilfs_btree_node_get_key(cnode, 0);
 		if (key < ckey)
 			break;
 	}
@@ -2120,8 +2082,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
 	nilfs_btree_node_set_ptr(btree, parent,
 				 path[level + 1].bp_index, blocknr);
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
 	binfo->bi_dat.bi_level = level;
@@ -2150,8 +2111,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	if (unlikely(ret < 0))
 		return ret;
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
 	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2131,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, *bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2156,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -2219,7 +2179,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
+		key = nilfs_btree_node_get_key(node, 0);
 	} else
 		key = nilfs_bmap_data_get_key(bmap, *bh);
 
@@ -2239,10 +2199,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
 	if (ret < 0) {
@@ -2262,8 +2222,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
-- 
cgit v1.2.2


From 3218929dbd25245e0f601df1e359a3ed3f7fb03b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 01:54:59 +0900
Subject: nilfs2: stop zero-fill of btree path just before free it

The btree path object is cleared just before it is freed.

This will remove the code doing the unnecessary clear operation.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.c | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 21ed8ccea4b9..115b157d508b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -97,25 +97,13 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 	}
 }
 
-static void nilfs_btree_clear_path(struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
 {
 	int level;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
-		if (path[level].bp_bh != NULL) {
-			brelse(path[level].bp_bh);
-			path[level].bp_bh = NULL;
-		}
-		/* sib_bh is released or deleted by prepare or commit
-		 * operations. */
-		path[level].bp_sib_bh = NULL;
-		path[level].bp_index = 0;
-		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_op = NULL;
-	}
+	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		brelse(path[level].bp_bh);
 }
 
 /*
@@ -557,7 +545,7 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -639,7 +627,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1146,7 +1134,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1501,7 +1489,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1520,7 +1508,7 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -1975,7 +1963,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2156,7 +2144,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2222,7 +2210,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
-- 
cgit v1.2.2


From 7a102b09232be1ad7c180dfd1f46c7aa95dff1e0 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 13:47:09 +0900
Subject: nilfs2: remove individual gfp constants for each metadata file

This gets rid of NILFS_CPFILE_GFP, NILFS_SUFILE_GFP, NILFS_DAT_GFP,
and NILFS_IFILE_GFP.  All of these constants refer to NILFS_MDT_GFP,
and can be removed.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.h    |  2 --
 fs/nilfs2/dat.h       |  1 -
 fs/nilfs2/ifile.h     |  1 -
 fs/nilfs2/mdt.c       |  5 +++--
 fs/nilfs2/mdt.h       |  3 +--
 fs/nilfs2/sufile.h    |  1 -
 fs/nilfs2/super.c     |  3 +--
 fs/nilfs2/the_nilfs.c | 12 ++++--------
 8 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 
-#define NILFS_CPFILE_GFP	NILFS_MDT_GFP
-
 
 int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
 				struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 4b0b47ea85ed..91dc3372a39a 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
-#define NILFS_DAT_GFP	NILFS_MDT_GFP
 
 struct nilfs_palloc_req;
 
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
 #include "mdt.h"
 #include "alloc.h"
 
-#define NILFS_IFILE_GFP  NILFS_MDT_GFP
 
 static inline struct nilfs_inode *
 nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 1ae8d56052c3..9cb831899119 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -513,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }
 
 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino, gfp_t gfp_mask)
+			    ino_t ino)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
+	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
+						   NILFS_MDT_GFP);
 
 	if (!inode)
 		return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
-			    gfp_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
 				   ino_t, gfp_t);
 void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
 #include <linux/nilfs2_fs.h>
 #include "mdt.h"
 
-#define NILFS_SUFILE_GFP	NILFS_MDT_GFP
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 50284add7880..55f3d6b60732 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -363,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
-	sbi->s_ifile = nilfs_mdt_new(
-		nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
+	sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
 	if (!sbi->s_ifile)
 		return -ENOMEM;
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0c4573653b87..d4168e269c5d 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -187,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	inode_size = nilfs->ns_inode_size;
 
 	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_mdt_new(
-		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
 	if (unlikely(!nilfs->ns_dat))
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_mdt_new(
-		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
 	if (unlikely(!nilfs->ns_gc_dat))
 		goto failed_dat;
 
-	nilfs->ns_cpfile = nilfs_mdt_new(
-		nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
+	nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
 	if (unlikely(!nilfs->ns_cpfile))
 		goto failed_gc_dat;
 
-	nilfs->ns_sufile = nilfs_mdt_new(
-		nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
+	nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
 	if (unlikely(!nilfs->ns_sufile))
 		goto failed_cpfile;
 
-- 
cgit v1.2.2


From bd8169efae8bdd292675c386010f6b35f0771057 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 17:22:13 +0900
Subject: nilfs2: add update functions of virtual block address to dat

This is a preparation for the successive cleanup ("nilfs2: allow btree
to directly call dat operations").

This adds functions bundling a few operations to change an entry of
virtual block address on the dat file.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c | 27 +++++++--------------------
 fs/nilfs2/dat.c  | 31 +++++++++++++++++++++++++++++++
 fs/nilfs2/dat.h  |  6 ++++++
 3 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..13e95a907ece 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,38 +533,25 @@ int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
 				union nilfs_bmap_ptr_req *oldreq,
 				union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
-	if (ret < 0)
-		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
-	return ret;
+	return nilfs_dat_prepare_update(nilfs_bmap_get_dat(bmap),
+					&oldreq->bpr_req, &newreq->bpr_req);
 }
 
 void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
 				union nilfs_bmap_ptr_req *oldreq,
 				union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
+	nilfs_dat_commit_update(nilfs_bmap_get_dat(bmap),
+				&oldreq->bpr_req, &newreq->bpr_req,
+				bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
 			       union nilfs_bmap_ptr_req *oldreq,
 			       union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
+	nilfs_dat_abort_update(nilfs_bmap_get_dat(bmap),
+			       &oldreq->bpr_req, &newreq->bpr_req);
 }
 
 static struct lock_class_key nilfs_bmap_dat_lock_key;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1cfcc1a0fdf1..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -211,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_dat_abort_entry(dat, req);
 }
 
+int nilfs_dat_prepare_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq)
+{
+	int ret;
+
+	ret = nilfs_dat_prepare_end(dat, oldreq);
+	if (!ret) {
+		ret = nilfs_dat_prepare_alloc(dat, newreq);
+		if (ret < 0)
+			nilfs_dat_abort_end(dat, oldreq);
+	}
+	return ret;
+}
+
+void nilfs_dat_commit_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq, int dead)
+{
+	nilfs_dat_commit_end(dat, oldreq, dead);
+	nilfs_dat_commit_alloc(dat, newreq);
+}
+
+void nilfs_dat_abort_update(struct inode *dat,
+			    struct nilfs_palloc_req *oldreq,
+			    struct nilfs_palloc_req *newreq)
+{
+	nilfs_dat_abort_end(dat, oldreq);
+	nilfs_dat_abort_alloc(dat, newreq);
+}
+
 /**
  * nilfs_dat_mark_dirty -
  * @dat: DAT file inode
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 91dc3372a39a..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -41,6 +41,12 @@ void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
 int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
 void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *);
+void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
+			    struct nilfs_palloc_req *);
 
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
-- 
cgit v1.2.2


From 2e0c2c73923fed27337039ddfd69985e6c4b91fe Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 15:34:33 +0900
Subject: nilfs2: allow btree code to directly call dat operations

The current btree code is written so that btree functions call dat
operations via wrapper functions in bmap.c when they allocate, free,
or modify virtual block addresses.

This abstraction requires additional function calls and causes
frequent call of nilfs_bmap_get_dat() function since it is used in the
every wrapper function.

This removes the wrapper functions and makes them available from
btree.c and direct.c, which will increase the opportunity of
compiler optimization.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c   |  85 ----------------------------
 fs/nilfs2/bmap.h   |  69 +++++++++--------------
 fs/nilfs2/btree.c  | 151 +++++++++++++++++++++++++++----------------------
 fs/nilfs2/direct.c | 161 +++++++++++++++++++----------------------------------
 4 files changed, 167 insertions(+), 299 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 13e95a907ece..f98c5c4cf6e7 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -469,91 +469,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 		(entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
 
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-				 union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-				 union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-		       sector_t blocknr)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-	if (likely(!ret))
-		nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-	return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-			    union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
-		      sector_t blocknr)
-{
-	return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
-	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	return nilfs_dat_prepare_update(nilfs_bmap_get_dat(bmap),
-					&oldreq->bpr_req, &newreq->bpr_req);
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	nilfs_dat_commit_update(nilfs_bmap_get_dat(bmap),
-				&oldreq->bpr_req, &newreq->bpr_req,
-				bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *oldreq,
-			       union nilfs_bmap_ptr_req *newreq)
-{
-	nilfs_dat_abort_update(nilfs_bmap_get_dat(bmap),
-			       &oldreq->bpr_req, &newreq->bpr_req);
-}
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..a4f64e54424c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 #include "alloc.h"
+#include "dat.h"
 
 #define NILFS_BMAP_INVALID_PTR	0
 
@@ -164,86 +165,66 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
  * Internal use only
  */
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *);
 
 static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	if (dat)
+		return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
 	/* ignore target ptr */
 	req->bpr_ptr = bmap->b_last_allocated_ptr++;
 	return 0;
 }
 
 static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_alloc(dat, &req->bpr_req);
 }
 
 static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
-					      union nilfs_bmap_ptr_req *req)
+					      union nilfs_bmap_ptr_req *req,
+					      struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_alloc(dat, &req->bpr_req);
 	else
 		bmap->b_last_allocated_ptr--;
 }
 
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
 static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+	return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
 }
 
 static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_end(dat, &req->bpr_req,
+				     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
-					    union nilfs_bmap_ptr_req *req)
+					    union nilfs_bmap_ptr_req *req,
+					    struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
-		       sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *,
-			       union nilfs_bmap_ptr_req *);
-
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 115b157d508b..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -940,17 +940,20 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
 	int pindex, level, ret;
+	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
@@ -1009,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
 		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-						   &path[level].bp_newreq);
+						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 		ret = nilfs_btree_get_new_block(btree,
@@ -1041,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1069,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
 		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1089,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
+	struct inode *dat = NULL;
 	int level;
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		nilfs_btree_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-					    &path[level - 1].bp_newreq);
+					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1326,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp,
-				      struct nilfs_bmap_stats *stats)
+				      struct nilfs_bmap_stats *stats,
+				      struct inode *dat)
 {
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
@@ -1343,7 +1352,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
 		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-						 &path[level].bp_oldreq);
+						 &path[level].bp_oldreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 
@@ -1421,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
 
 	ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 
@@ -1436,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
  err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
 		brelse(path[level].bp_sib_bh);
 		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1450,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
-				      int maxlevel)
+				      int maxlevel, struct inode *dat)
 {
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-					  &path[level].bp_oldreq);
+					  &path[level].bp_oldreq, dat);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1470,6 +1479,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
+	struct inode *dat;
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
@@ -1482,10 +1492,14 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	if (ret < 0)
 		goto out;
 
-	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+
+	dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+		nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+
+	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
 	if (ret < 0)
 		goto out;
-	nilfs_btree_commit_delete(btree, path, level);
+	nilfs_btree_commit_delete(btree, path, level, dat);
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
@@ -1610,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 				       struct nilfs_bmap_stats *stats)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct inode *dat = NULL;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	stats->bs_nblocks = 0;
 
 	/* for data */
 	/* cannot find near ptr */
-	if (NILFS_BMAP_USE_VBN(bmap))
+	if (NILFS_BMAP_USE_VBN(bmap)) {
 		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
 	if (ret < 0)
 		return ret;
 
@@ -1629,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
 		if (ret < 0)
 			goto err_out_dreq;
 
@@ -1646,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
  err_out_dreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1663,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
 {
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_node *node;
+	struct inode *dat;
 	__u64 tmpptr;
 
 	/* free resources */
@@ -1675,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 
 	/* convert and insert */
-	btree = (struct nilfs_btree *)bmap;
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
 	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1701,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1772,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 	int ret;
@@ -1782,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
-					  &path[level].bp_oldreq,
-					  &path[level].bp_newreq);
+	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+				       &path[level].bp_newreq.bpr_req);
 	if (ret < 0)
 		return ret;
 
@@ -1796,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update_v(&btree->bt_bmap,
-						  &path[level].bp_oldreq,
-						  &path[level].bp_newreq);
+			nilfs_dat_abort_update(dat,
+					       &path[level].bp_oldreq.bpr_req,
+					       &path[level].bp_newreq.bpr_req);
 			return ret;
 		}
 	}
@@ -1808,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update_v(&btree->bt_bmap,
-				   &path[level].bp_oldreq,
-				   &path[level].bp_newreq);
+	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+				&path[level].bp_newreq.bpr_req,
+				btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
@@ -1831,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
-				       int level)
+				       int level, struct inode *dat)
 {
-	nilfs_bmap_abort_update_v(&btree->bt_bmap,
-				  &path[level].bp_oldreq,
-				  &path[level].bp_newreq);
+	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+			       &path[level].bp_newreq.bpr_req);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1844,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int *maxlevelp)
+					   int minlevel, int *maxlevelp,
+					   struct inode *dat)
 {
 	int level, ret;
 
 	level = minlevel;
 	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			return ret;
 	}
@@ -1859,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	       !buffer_dirty(path[level].bp_bh)) {
 
 		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			goto out;
 	}
@@ -1871,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	/* error */
  out:
 	while (--level > minlevel)
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	if (!buffer_nilfs_volatile(path[level].bp_bh))
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	return ret;
 }
 
 static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int maxlevel,
-					   struct buffer_head *bh)
+					   int minlevel, int maxlevel,
+					   struct buffer_head *bh,
+					   struct inode *dat)
 {
 	int level;
 
 	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
-		nilfs_btree_commit_update_v(btree, path, minlevel);
+		nilfs_btree_commit_update_v(btree, path, minlevel, dat);
 
 	for (level = minlevel + 1; level <= maxlevel; level++)
-		nilfs_btree_commit_update_v(btree, path, level);
+		nilfs_btree_commit_update_v(btree, path, level, dat);
 }
 
 static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 				   struct nilfs_btree_path *path,
-				   int level,
-				   struct buffer_head *bh)
+				   int level, struct buffer_head *bh)
 {
 	int maxlevel, ret;
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 ptr;
 
 	get_bh(bh);
 	path[level].bp_bh = bh;
-	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+					      dat);
 	if (ret < 0)
 		goto out;
 
@@ -1911,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 		parent = nilfs_btree_get_node(btree, path, level + 1);
 		ptr = nilfs_btree_node_get_ptr(btree, parent,
 					       path[level + 1].bp_index);
-		ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
 		if (ret < 0)
 			goto out;
 	}
 
-	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
 
  out:
 	brelse(path[level].bp_bh);
@@ -1972,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
 				    struct buffer_head *bh)
 {
-	return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
 }
 
 static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2086,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 				union nilfs_binfo *binfo)
 {
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 key;
 	__u64 ptr;
 	union nilfs_bmap_ptr_req req;
@@ -2095,9 +2112,10 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
-	if (unlikely(ret < 0))
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (ret < 0)
 		return ret;
+	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
 
 	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
@@ -2155,13 +2173,12 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_node *node;
 	__u64 key;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
-	ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
+	ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+			     blocknr);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
 	direct->d_bmap.b_last_allocated_ptr = ptr;
 }
 
-static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
-				       __u64 key,
-				       union nilfs_bmap_ptr_req *req,
-				       struct nilfs_bmap_stats *stats)
-{
-	int ret;
-
-	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-		req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
-	ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
-	if (ret < 0)
-		return ret;
-
-	stats->bs_nblocks = 1;
-	return 0;
-}
-
-static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key, __u64 ptr)
-{
-	struct buffer_head *bh;
-
-	/* ptr must be a pointer to a buffer head. */
-	bh = (struct buffer_head *)((unsigned long)ptr);
-	set_buffer_nilfs_volatile(bh);
-
-	nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
-	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
-
-	if (!nilfs_bmap_dirty(&direct->d_bmap))
-		nilfs_bmap_set_dirty(&direct->d_bmap);
-
-	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-		nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
-}
-
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
-	struct nilfs_bmap_stats stats;
+	struct inode *dat = NULL;
+	struct buffer_head *bh;
 	int ret;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
 	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
-	ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
-	if (ret < 0)
-		return ret;
-	nilfs_direct_commit_insert(direct, &req, key, ptr);
-	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
+	if (!ret) {
+		/* ptr must be a pointer to a buffer head. */
+		bh = (struct buffer_head *)((unsigned long)ptr);
+		set_buffer_nilfs_volatile(bh);
 
-	return 0;
-}
+		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
 
-static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key,
-				       struct nilfs_bmap_stats *stats)
-{
-	int ret;
+		if (!nilfs_bmap_dirty(bmap))
+			nilfs_bmap_set_dirty(bmap);
 
-	req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-	ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
-	if (!ret)
-		stats->bs_nblocks = 1;
-	return ret;
-}
+		if (NILFS_BMAP_USE_VBN(bmap))
+			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
 
-static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key)
-{
-	nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
-	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_bmap_add_blocks(bmap, 1);
+	}
+	return ret;
 }
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
-	struct nilfs_bmap_stats stats;
+	struct inode *dat;
 	int ret;
 
-	direct = (struct nilfs_direct *)bmap;
-	if ((key > NILFS_DIRECT_KEY_MAX) ||
+	if (key > NILFS_DIRECT_KEY_MAX ||
 	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
-	if (ret < 0)
-		return ret;
-	nilfs_direct_commit_delete(direct, &req, key);
-	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
 
-	return 0;
+	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
+	if (!ret) {
+		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_bmap_sub_blocks(bmap, 1);
+	}
+	return ret;
 }
 
 static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
-				    struct buffer_head *bh)
+static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+				  struct buffer_head *bh)
 {
-	union nilfs_bmap_ptr_req oldreq, newreq;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct nilfs_palloc_req oldreq, newreq;
+	struct inode *dat;
 	__u64 key;
 	__u64 ptr;
 	int ret;
 
-	key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
+	if (!NILFS_BMAP_USE_VBN(bmap))
+		return 0;
+
+	dat = nilfs_bmap_get_dat(bmap);
+	key = nilfs_bmap_data_get_key(bmap, bh);
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (!buffer_nilfs_volatile(bh)) {
-		oldreq.bpr_ptr = ptr;
-		newreq.bpr_ptr = ptr;
-		ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
-						  &newreq);
+		oldreq.pr_entry_nr = ptr;
+		newreq.pr_entry_nr = ptr;
+		ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
 		if (ret < 0)
 			return ret;
-		nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
+		nilfs_dat_commit_update(dat, &oldreq, &newreq,
+					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
+		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
 	} else
-		ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
-				  struct buffer_head *bh)
-{
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_propagate_v(direct, bh) : 0;
-}
-
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
+	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
 	req.bpr_ptr = ptr;
-	ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
-	if (unlikely(ret < 0))
-		return ret;
-
-	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
-
-	return 0;
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (!ret) {
+		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	}
+	return ret;
 }
 
 static int nilfs_direct_assign_p(struct nilfs_direct *direct,
-- 
cgit v1.2.2


From 0f3fe33b398abbecfcf9f08c16959d1a9a14a49a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 02:29:28 +0900
Subject: nilfs2: convert nilfs_bmap_lookup to an inline function

The nilfs_bmap_lookup() is now a wrapper function of
nilfs_bmap_lookup_at_level().

This moves the nilfs_bmap_lookup() to a header file converting it to
an inline function and gives an opportunity for optimization.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c | 53 ++++++++++++++++++++---------------------------------
 fs/nilfs2/bmap.h |  7 ++++++-
 fs/nilfs2/mdt.c  |  4 ++--
 3 files changed, 28 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f98c5c4cf6e7..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
 }
 
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 			       __u64 *ptrp)
 {
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
 	return ret;
 }
 
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
-		      unsigned long key,
-		      unsigned long *recp)
-{
-	__u64 ptr;
-	int ret;
-
-	/* XXX: use macro for level 1 */
-	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
-	if (recp != NULL)
-		*recp = ptr;
-	return ret;
-}
-
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
 	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a4f64e54424c..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -142,7 +142,6 @@ struct nilfs_bmap {
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -161,6 +160,12 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
 
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+				    __u64 *ptr)
+{
+	return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
 /*
  * Internal use only
  */
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 9cb831899119..156bf6091a96 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -136,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		       int mode, struct buffer_head **out_bh)
 {
 	struct buffer_head *bh;
-	unsigned long blknum = 0;
+	__u64 blknum = 0;
 	int ret = -ENOMEM;
 
 	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -166,7 +166,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		goto failed_bh;
 	}
 	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-	bh->b_blocknr = blknum;
+	bh->b_blocknr = (sector_t)blknum;
 	set_buffer_mapped(bh);
 
 	bh->b_end_io = end_buffer_read_sync;
-- 
cgit v1.2.2


From 41f4db0f48c72db3a93cc1a0b18368d9a9aca700 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 8 Aug 2009 16:09:46 +0900
Subject: fs/Kconfig: move nilfs2 outside misc filesystems

Some people asked me questions like the following:

On Wed, 15 Jul 2009 13:11:21 +0200, Leon Woestenberg wrote:
> just wondering, any reasons why NILFS2 is one of the miscellaneous
> filesystems and, for example, btrfs, is not in Kconfig?

Actually, nilfs is NOT a filesystem came from other operating systems,
but a filesystem created purely for Linux.  Nor is it a flash
filesystem but that for generic block devices.

So, this moves nilfs outside the misc category as I responded in LKML
"Re: Why does NILFS2 hide under Miscellaneous filesystems?"
(Message-Id: <20090716.002526.93465395.ryusuke@osrg.net>).

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/Kconfig        | 2 +-
 fs/nilfs2/Kconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
 config NILFS2_FS
 	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	select CRC32
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
-- 
cgit v1.2.2


From 918941a3f3d46c2a69971b4718aaf13b1be2f1a7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 18:50:08 +0200
Subject: ocfs2: Use __generic_file_aio_write instead of
 generic_file_aio_write_nolock

Use the new helper. We have to submit data pages ourselves in case of O_SYNC
write because __generic_file_aio_write does not do it for us. OCFS2 developpers
might think about moving the sync out of i_mutex which seems to be easily
possible but that's out of scope of this patch.

CC: ocfs2-devel@oss.oracle.com
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/file.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..600227389387 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1871,8 +1871,7 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
-							*ppos);
+		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
 	}
 
 out_dio:
@@ -1880,18 +1879,21 @@ out_dio:
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
 	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
-		/*
-		 * The generic write paths have handled getting data
-		 * to disk, but since we don't make use of the dirty
-		 * inode list, a manual journal commit is necessary
-		 * here.
-		 */
-		if (old_size != i_size_read(inode) ||
-		    old_clusters != OCFS2_I(inode)->ip_clusters) {
+		ret = filemap_fdatawrite_range(file->f_mapping, pos,
+					       pos + count - 1);
+		if (ret < 0)
+			written = ret;
+
+		if (!ret && (old_size != i_size_read(inode) ||
+		    old_clusters != OCFS2_I(inode)->ip_clusters)) {
 			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
+
+		if (!ret)
+			ret = filemap_fdatawait_range(file->f_mapping, pos,
+						      pos + count - 1);
 	}
 
 	/* 
-- 
cgit v1.2.2


From eef99380679e20e7edc096aa4d8a98b875404d79 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Aug 2009 17:43:41 +0200
Subject: vfs: Rename generic_file_aio_write_nolock

generic_file_aio_write_nolock() is now used only by block devices and raw
character device. Filesystems should use __generic_file_aio_write() in case
generic_file_aio_write() doesn't suit them. So rename the function to
blkdev_aio_write() and move it to fs/blockdev.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/block_dev.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..3581a4e53942 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Write data to the block device.  Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
   	.aio_read	= generic_file_aio_read,
-  	.aio_write	= generic_file_aio_write_nolock,
+	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
-- 
cgit v1.2.2


From 148f948ba877f4d3cdef036b1ff6d9f68986706a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 19:52:36 +0200
Subject: vfs: Introduce new helpers for syncing after writing to O_SYNC file
 or IS_SYNC inode

Introduce new function for generic inode syncing (vfs_fsync_range) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions. This makes
O_SYNC writes to block devices acquire i_mutex for syncing. If we really
care about this, we can make block_fsync() drop the i_mutex and reacquire
it before it returns.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c | 22 ++++++----------------
 fs/sync.c   | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 54 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..819023733f8e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 103cc7fdd3df..4e15da01923c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @datasync:		perform only datasync
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * Write back data in range @start..@end and metadata for @file to disk.  If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
  *
  * In case this function is called from nfsd @file may be %NULL and
  * only @dentry is set.  This can only happen when the filesystem
  * implements the export_operations API.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
+		    loff_t end, int datasync)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
@@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
@@ -225,12 +229,32 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
+
+	err = filemap_fdatawait_range(mapping, start, end);
 	if (!ret)
 		ret = err;
 out:
 	return ret;
 }
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -256,6 +280,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return vfs_fsync_range(file, file->f_path.dentry, pos,
+			       pos + count - 1, 1);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
-- 
cgit v1.2.2


From a2a735ad666a04306a708b5a0109cc1fe113f569 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:54:11 +0200
Subject: ext2: Update comment about generic_osync_inode

We rely on generic_write_sync() now.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
 		unlock_buffer(bh);
 		mark_buffer_dirty_inode(bh, inode);
 		/* We used to sync bh here if IS_SYNC(inode).
-		 * But we now rely upon generic_osync_inode()
+		 * But we now rely upon generic_write_sync()
 		 * and b_inode_buffers.  But not for directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
-- 
cgit v1.2.2


From e367626b6164aeecb97fb7c20509ed8696babc26 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:51:23 +0200
Subject: ext3: Remove syncing logic from ext3_file_write

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c | 61 +---------------------------------------------------------
 1 file changed, 1 insertion(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 299253214789..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
cgit v1.2.2


From 0d34ec62e18984ac9476208660372306ef54e70d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:48:27 +0200
Subject: ext4: Remove syncing logic from ext4_file_write

The syncing is now properly handled by generic_file_aio_write() so
no special ext4 code is needed.

CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c | 53 ++---------------------------------------------------
 1 file changed, 2 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 27f3c5354c0e..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext4_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext4_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
+	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
 static struct vm_operations_struct ext4_file_vm_ops = {
-- 
cgit v1.2.2


From ebbbf757c6b8577ac2fb6181c08c2059153bb0e2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:13:58 +0200
Subject: ntfs: Use new syncing helpers and update comments

Use new syncing helpers in .write and .aio_write functions. Also
remove superfluous syncing in ntfs_file_buffered_write() and update
comments about generic_osync_inode().

CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ntfs/file.c | 16 ++++------------
 fs/ntfs/mft.c  | 13 ++++++-------
 2 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
 	*ppos = pos;
 	if (cached_page)
 		page_cache_release(cached_page);
-	/* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
-	if (likely(!status)) {
-		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
-			if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
-				status = generic_osync_inode(vi, mapping,
-						OSYNC_METADATA|OSYNC_DATA);
-		}
-  	}
 	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, pos, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, *ppos - ret, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, *ppos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
  * it is dirty in the inode meta data rather than the data page cache of the
  * inode, and thus there are no data pages that need writing out.  Therefore, a
  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
  */
 void __mark_mft_record_dirty(ntfs_inode *ni)
 {
-- 
cgit v1.2.2


From d23c937b0f740888765676f6f82f509dbbb2bbad Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:24:31 +0200
Subject: ocfs2: Update syncing after splicing to match generic version

Update ocfs2 specific splicing code to use generic syncing helper. The sync now
does not happen under rw_lock because generic_write_sync() acquires i_mutex
which ranks above rw_lock. That should not matter because standard fsync path
does not hold it either.

Acked-by: Joel Becker <Joel.Becker@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
CC: ocfs2-devel@oss.oracle.com
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/file.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 600227389387..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1993,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = ocfs2_rw_lock(inode, 1);
-			if (err < 0) {
-				mlog_errno(err);
-			} else {
-				err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-				ocfs2_rw_unlock(inode, 1);
-			}
-			mutex_unlock(&inode->i_mutex);
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 
-			if (err)
-				ret = err;
-		}
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
-- 
cgit v1.2.2


From af0f4414f343429971d33b0dd8dccc85c1f3dcd2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:32:55 +0200
Subject: xfs: Convert sync_page_range() to simple
 filemap_write_and_wait_range()

Christoph Hellwig says that it is enough for XFS to call
filemap_write_and_wait_range() instead of sync_page_range() because we do
all the metadata syncing when forcing the log.

CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..fde63a3c4ecc 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -817,7 +817,8 @@ write_retry:
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+		error2 = filemap_write_and_wait_range(mapping, pos,
+						      pos + ret - 1);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
-- 
cgit v1.2.2


From 2f3d675bcd4a84251d6e8eea8096ec8fc795e5d6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 17:00:02 +0200
Subject: fat: Opencode sync_page_range_nolock()

fat_cont_expand() is the only user of sync_page_range_nolock(). It's also the
only user of generic_osync_inode() which does not have a file open.  So
opencode needed actions for FAT so that we can convert generic_osync_inode() to
a standard syncing path.

Update a comment about generic_osync_inode().

CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fat/file.c | 22 ++++++++++++++++++++--
 fs/fat/misc.c |  4 ++--
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	if (IS_SYNC(inode))
-		err = sync_page_range_nolock(inode, mapping, start, count);
+	if (IS_SYNC(inode)) {
+		int err2;
+
+		/*
+		 * Opencode syncing since we don't have a file open to use
+		 * standard fsync path.
+		 */
+		err = filemap_fdatawrite_range(mapping, start,
+					       start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		if (!err)
+			err = err2;
+		err2 = write_inode_now(inode, 1);
+		if (!err)
+			err = err2;
+		if (!err) {
+			err =  filemap_fdatawait_range(mapping, start,
+						       start + count - 1);
+		}
+	}
 out:
 	return err;
 }
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 		MSDOS_I(inode)->i_start = new_dclus;
 		MSDOS_I(inode)->i_logstart = new_dclus;
 		/*
-		 * Since generic_osync_inode() synchronize later if
-		 * this is not directory, we don't here.
+		 * Since generic_write_sync() synchronizes regular files later,
+		 * we sync here only directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
 			ret = fat_sync_inode(inode);
-- 
cgit v1.2.2


From 18f2ee705d98034b0f229a3202d827468d4bffd9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:43:15 +0200
Subject: vfs: Remove generic_osync_inode() and sync_page_range{_nolock}()

Remove these three functions since nobody uses them anymore.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c | 54 ------------------------------------------------------
 1 file changed, 54 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da86ef58e427..628235cf44b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @mapping: the address_space that should be flushed
- * @what:  what to write and wait upon
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- *
- * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon.
- *
- *    OSYNC_DATA:     i_mapping's dirty data
- *    OSYNC_METADATA: the buffers at i_mapping->private_list
- *    OSYNC_INODE:    the inode itself
- */
-
-int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
-{
-	int err = 0;
-	int need_write_inode_now = 0;
-	int err2;
-
-	if (what & OSYNC_DATA)
-		err = filemap_fdatawrite(mapping);
-	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
-		err2 = sync_mapping_buffers(mapping);
-		if (!err)
-			err = err2;
-	}
-	if (what & OSYNC_DATA) {
-		err2 = filemap_fdatawait(mapping);
-		if (!err)
-			err = err2;
-	}
-
-	spin_lock(&inode_lock);
-	if ((inode->i_state & I_DIRTY) &&
-	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
-		need_write_inode_now = 1;
-	spin_unlock(&inode_lock);
-
-	if (need_write_inode_now) {
-		err2 = write_inode_now(inode, 1);
-		if (!err)
-			err = err2;
-	}
-	else
-		inode_sync_wait(inode);
-
-	return err;
-}
-EXPORT_SYMBOL(generic_osync_inode);
-- 
cgit v1.2.2


From 2daea67e966dc0c42067ebea015ddac6834cef88 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 3 Sep 2009 12:39:39 +0200
Subject: fsync: wait for data writeout completion before calling ->fsync

Currenly vfs_fsync(_range) first calls filemap_fdatawrite to write out
the data, the calls into ->fsync to write out the metadata and then finally
calls filemap_fdatawait to wait for the data I/O to complete.  What sounds
like a clever micro-optimization actually is nast trap for many filesystems.

For many modern filesystems i_size or other inode information is only
updated on I/O completion and we need to wait for I/O to finish before
we can write out the metadata.  For old fashionen filesystems that
instanciate blocks during the actual write and also update the metadata
at that point it opens up a large window were we could expose uninitialized
blocks after a crash.  While a few filesystems that need it already wait
for the I/O to finish inside their ->fsync methods it is rather suboptimal
as it is done under the i_mutex and also always for the whole file instead
of just a part as we could do for O_SYNC handling.

Here is a small audit of all fsync instances in the tree:

 - spufs_mfc_fsync:
 - ps3flash_fsync:
 - vol_cdev_fsync:
 - printer_fsync:
 - fb_deferred_io_fsync:
 - bad_file_fsync:
 - simple_sync_file:

	don't care - filesystems/drivers do't use the page cache or are
	purely in-memory.

 - simple_fsync:
 - file_fsync:
 - affs_file_fsync:
 - fat_file_fsync:
 - jfs_fsync:
 - ubifs_fsync:
 - reiserfs_dir_fsync:
 - reiserfs_sync_file:

	never touch pagecache themselves.  We need to wait before if we do
	not want to expose stale data after an allocation.

 - afs_fsync:
 - fuse_fsync_common:

	do the waiting writeback itself in awkward ways, would benefit from
	proper semantics

 - block_fsync:

	Does a filemap_write_and_wait on the block device inode.  Because we
	now have f_mapping that is the same inode we call it on in vfs_fsync.
	So just removing it and letting the VFS do the work in one go would
	be an improvement.

 - btrfs_sync_file:
 - cifs_fsync:
 - xfs_file_fsync:

	need the wait first and currently do it themselves. would benefit from
	doing it outside i_mutex.

 - coda_fsync:
 - ecryptfs_fsync:
 - exofs_file_fsync:
 - shm_fsync:

	only passes the fsync through to the lower layer

 - ext3_sync_file:

	doesn't seem to care, comments are confusing.

 - ext4_sync_file:

	would need the wait to work correctly for delalloc mode with late
	i_size updates.  Otherwise the ext3 comment applies.

	currently implemens it's own writeback and wait in an odd way,
	could benefit from doing it properly.

 - gfs2_fsync:

	not needed for journaled data mode, but probably harmless there.
	Currently writes back data asynchronously itself.  Needs some
	major audit.

 - hostfs_fsync:

	just calls fsync/datasync on the host FD.  Without the wait before
	data might not even be inflight yet if we're unlucky.

 - hpfs_file_fsync:
 - ncp_fsync:

	no-ops.  Dangerous before and after.

 - jffs2_fsync:

	just calls jffs2_flush_wbuf_gc, not sure how this relates to data.

 - nfs_fsync_dir:

	just increments stats, claims all directory operations are synchronous

 - nfs_file_fsync:

	only writes out data???  Looks very odd.

 - nilfs_sync_file:

	looks like it expects all data done, but not sure from the code

 - ntfs_dir_fsync:
 - ntfs_file_fsync:

	appear to do their own data writeback.  Very convoluted code.

 - ocfs2_sync_file:

	does it's own data writeback, but no wait.  probably needs the wait.

 - smb_fsync:

	according to a comment expects all pages written already, probably needs
	the wait before.

This patch only changes vfs_fsync_range, removal of the wait in the methods
that have it is left to the filesystem maintainers.  Note that most
filesystems really do need an audit for their fsync methods given the
gems found in this very brief audit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/sync.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 4e15da01923c..192340930bb4 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -218,7 +218,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 		goto out;
 	}
 
-	ret = filemap_fdatawrite_range(mapping, start, end);
+	ret = filemap_write_and_wait_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
@@ -230,9 +230,6 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
 
-	err = filemap_fdatawait_range(mapping, start, end);
-	if (!ret)
-		ret = err;
 out:
 	return ret;
 }
-- 
cgit v1.2.2


From 5891d9dd2a47d38c205115211841a3d82304628f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 17:35:11 +0200
Subject: udf: Remove dead code

Remove code that gets never used.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/directory.c | 86 ------------------------------------------------------
 fs/udf/lowlevel.c  |  4 ---
 2 files changed, 90 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 
-#if 0
-static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-				uint8_t ad_size, struct kernel_lb_addr fe_loc,
-				int *pos, int *offset, struct buffer_head **bh,
-				int *error)
-{
-	int loffset = *offset;
-	int block;
-	uint8_t *ad;
-	int remainder;
-
-	*error = 0;
-
-	ad = (uint8_t *)(*bh)->b_data + *offset;
-	*offset += ad_size;
-
-	if (!ad) {
-		brelse(*bh);
-		*error = 1;
-		return NULL;
-	}
-
-	if (*offset == dir->i_sb->s_blocksize) {
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		*bh = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-	} else if (*offset > dir->i_sb->s_blocksize) {
-		ad = tmpad;
-
-		remainder = dir->i_sb->s_blocksize - loffset;
-		memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
-
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		(*bh) = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-
-		memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
-			ad_size - remainder);
-		*offset = ad_size - remainder;
-	}
-
-	return ad;
-}
-#endif
-
 struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
 					 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 	return fi;
 }
 
-#if 0
-static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
-{
-	struct extent_ad *ext;
-	struct fileEntry *fe;
-	uint8_t *ptr;
-
-	if ((!buffer) || (!offset)) {
-		printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
-		return NULL;
-	}
-
-	fe = (struct fileEntry *)buffer;
-
-	if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
-		udf_debug("0x%x != TAG_IDENT_FE\n",
-			  le16_to_cpu(fe->descTag.tagIdent));
-		return NULL;
-	}
-
-	ptr = (uint8_t *)(fe->extendedAttr) +
-		le32_to_cpu(fe->lengthExtendedAttr);
-
-	if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
-		ptr += *offset;
-
-	ext = (struct extent_ad *)ptr;
-
-	*offset = *offset + sizeof(struct extent_ad);
-	return ext;
-}
-#endif
-
 struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
 			      int inc)
 {
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
 	ms_info.addr_format = CDROM_LBA;
 	i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
 
-#define WE_OBEY_THE_WRITTEN_STANDARDS 1
-
 	if (i == 0) {
 		udf_debug("XA disk: %s, vol_desc_start=%d\n",
 			  (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
-#if WE_OBEY_THE_WRITTEN_STANDARDS
 		if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
-#endif
 			vol_desc_start = ms_info.addr.lba;
 	} else {
 		udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
-- 
cgit v1.2.2


From 7c6e3d1aaeb3b25b49510d193000c27037004acb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 17:36:54 +0200
Subject: udf: Remove wrong assignment in udf_symlink

Recomputation of the pointer was wrong (it should have been just increment).
Luckily, we never use the computed value. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/namei.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		pc->componentType = 1;
 		pc->lengthComponentIdent = 0;
 		pc->componentFileVersionNum = 0;
-		pc += sizeof(struct pathComponent);
 		elen += sizeof(struct pathComponent);
 	}
 
-- 
cgit v1.2.2


From 81056dd04465902461b627169c4b4487a11acba1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 18:02:25 +0200
Subject: udf: Perform preallocation only for regular files

So far we preallocated blocks also for directories but that brings a
problem, when to get rid of preallocated blocks we don't need. So far
we removed them in udf_clear_inode() which has a disadvantage that
1) blocks are unavailable long after writing to a directory finished
   and thus one can get out of space unnecessarily early
2) releasing blocks from udf_clear_inode is problematic because VFS
   does not expect us to redirty inode there and it also slows down
   memory reclaim.

So preallocate blocks only for regular files where we can drop preallocation
in udf_release_file.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
 }
 
 /*
- * If we are going to release inode from memory, we discard preallocation and
- * truncate last inode extent to proper length. We could use drop_inode() but
- * it's called under inode_lock and thus we cannot mark inode dirty there.  We
- * use clear_inode() but we have to make sure to write inode as it's not written
- * automatically.
+ * If we are going to release inode from memory, we truncate last inode extent
+ * to proper length. We could use drop_inode() but it's called under inode_lock
+ * and thus we cannot mark inode dirty there.  We use clear_inode() but we have
+ * to make sure to write inode as it's not written automatically.
  */
 void udf_clear_inode(struct inode *inode)
 {
 	struct udf_inode_info *iinfo;
 	if (!(inode->i_sb->s_flags & MS_RDONLY)) {
 		lock_kernel();
-		/* Discard preallocation for directories, symlinks, etc. */
-		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
 
 #ifdef UDF_PREALLOCATE
-	/* preallocate blocks */
-	udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+	/* We preallocate blocks only for regular files. It also makes sense
+	 * for directories but there's a problem when to drop the
+	 * preallocation. We might use some delayed work for that but I feel
+	 * it's overengineering for a filesystem like UDF. */
+	if (S_ISREG(inode->i_mode))
+		udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
 #endif
 
 	/* merge any continuous blocks in laarr */
-- 
cgit v1.2.2


From cbc8cc33529b0e0e55ae0ff077b8cb0b71d54c7a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Aug 2009 00:27:27 +0200
Subject: udf: Fix possible corruption when close races with write

When we close a file, we remove preallocated blocks from it. But this
truncation was not protected by i_mutex and thus it could have raced with a
write through a different fd and cause crashes or even filesystem corruption.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->f_mode & FMODE_WRITE) {
+		mutex_lock(&inode->i_mutex);
 		lock_kernel();
 		udf_discard_prealloc(inode);
 		unlock_kernel();
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
-- 
cgit v1.2.2


From 3661d28615ea580c1db02a972fd4d3898df1cb01 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 14 Sep 2009 22:59:50 -0400
Subject: ext4: Fix include/trace/events/ext4.h to work with Systemtap

Using relative pathnames in #include statements interacts badly with
SystemTap, since the fs/ext4/*.h header files are not packaged up as
part of a distribution kernel's header files.  Since systemtap doesn't
use TP_fast_assign(), we can use a blind structure definition and then
make sure the needed header files are defined before the ext4 source
files #include the trace/events/ext4.h header file.

https://bugzilla.redhat.com/show_bug.cgi?id=512478

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04c693357336..af95dd8ba54b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "mballoc.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
-- 
cgit v1.2.2


From a4dbd6740df0872cdf0a86841f75beec8381964d Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 24 Jun 2009 10:06:31 -0700
Subject: driver model: constify attribute groups

Let attribute group vectors be declared "const".  We'd
like to let most attribute metadata live in read-only
sections... this is a start.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 619ba99dfe39..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute_group part_attr_group = {
 	.attrs = part_attrs,
 };
 
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	&blk_trace_attr_group,
-- 
cgit v1.2.2


From 361735fd8ff8f7d8b9f0e134d0d99d99ee193d92 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 28 Aug 2009 23:55:00 +0400
Subject: xfs: switch to seq_file

create_proc_read_entry() is getting deprecated.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_stats.c | 51 +++++++++++++----------------
 fs/xfs/quota/xfs_qm_stats.c  | 78 ++++++++++++++++++--------------------------
 2 files changed, 55 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
 
 DEFINE_PER_CPU(struct xfsstats, xfsstats);
 
-STATIC int
-xfs_read_xfsstats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
 {
-	int		c, i, j, len, val;
+	int		c, i, j, val;
 	__uint64_t	xs_xstrat_bytes = 0;
 	__uint64_t	xs_write_bytes = 0;
 	__uint64_t	xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
 	};
 
 	/* Loop over all stats groups */
-	for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
-		len += sprintf(buffer + len, "%s", xstats[i].desc);
+	for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+		seq_printf(m, "%s", xstats[i].desc);
 		/* inner loop does each group */
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
 			for_each_possible_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-			len += sprintf(buffer + len, " %u", val);
+			seq_printf(m, " %u", val);
 			j++;
 		}
-		buffer[len++] = '\n';
+		seq_putc(m, '\n');
 	}
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
 	}
 
-	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+	seq_printf(m, "xpc %Lu %Lu %Lu\n",
 			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	len += sprintf(buffer + len, "debug %u\n",
+	seq_printf(m, "debug %u\n",
 #if defined(DEBUG)
 		1);
 #else
 		0);
 #endif
+	return 0;
+}
 
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xfs_stat_proc_show, NULL);
 }
 
+static const struct file_operations xfs_stat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xfs_stat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 int
 xfs_init_procfs(void)
 {
 	if (!proc_mkdir("fs/xfs", NULL))
 		goto out;
 
-	if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
-			xfs_read_xfsstats, NULL))
+	if (!proc_create("fs/xfs/stat", 0, NULL,
+			 &xfs_stat_proc_fops))
 		goto out_remove_entry;
 	return 0;
 
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
 
 struct xqmstats xqmstats;
 
-STATIC int
-xfs_qm_read_xfsquota(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_show(struct seq_file *m, void *v)
 {
-	int		len;
-
 	/* maximum; incore; ratio free to inuse; freelist */
-	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+	seq_printf(m, "%d\t%d\t%d\t%u\n",
 			ndquot,
 			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+	return 0;
 }
 
-STATIC int
-xfs_qm_read_stats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
 {
-	int		len;
+	return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqm_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
 	/* quota performance statistics */
-	len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+	seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
 			xqmstats.xs_qm_dqreclaims,
 			xqmstats.xs_qm_dqreclaim_misses,
 			xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
 			xqmstats.xs_qm_dqwants,
 			xqmstats.xs_qm_dqshake_reclaims,
 			xqmstats.xs_qm_dqinact_reclaims);
+	return 0;
+}
 
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xqmstat_proc_show, NULL);
 }
 
+static const struct file_operations xqmstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqmstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 void
 xfs_qm_init_procfs(void)
 {
-	create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
-	create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+	proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+	proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
 }
 
 void
-- 
cgit v1.2.2


From 9ef96da6ec5e1b4cf7eb8e30852cd88ec7d5fdc0 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 9 Aug 2009 20:23:12 +0530
Subject: xfs: includecheck fix for fs/xfs/xfs_iops.c

fix the following 'make includecheck' warning:

  fs/xfs/linux-2.6/xfs_iops.c: xfs_acl.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..626b474b2cdd 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
-- 
cgit v1.2.2


From d6db07ded51c5fb4df2f4a32e6a41e9bb5db7fc4 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Mon, 24 Aug 2009 06:14:07 +0200
Subject: fuse: use drop_nlink() instead of direct nlink manipulation

drop_nlink() is the API function to decrease the link count of an inode.
However, at a place the control filesystem used the decrement operator
on i_nlink directly. Fix this.

Cc: Anand Avati <avati@gluster.com>
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/control.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..218d514924c0 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -156,7 +156,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
 		d_drop(dentry);
 		dput(dentry);
 	}
-	fuse_control_sb->s_root->d_inode->i_nlink--;
+	drop_nlink(fuse_control_sb->s_root->d_inode);
 }
 
 static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
-- 
cgit v1.2.2


From 487ea5af6358cb27c994e2cf056d4ee0872e43c3 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 26 Aug 2009 19:17:22 +0200
Subject: fuse: limit user-specified values of max background requests

An untrusted user could DoS the system if s/he were allowed to accumulate an
arbitrary number of pending background requests by setting the above limits
to extremely high values in INIT. This patch excludes this possibility by
imposing global upper limits on the possible values of per-mount "max
background requests" and "congestion threshold" parameters for unprivileged
FUSE filesystems.

These global limits are implemented as module parameters.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 72 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9aa6f46d0c32..e54ddbda208c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
@@ -28,6 +29,24 @@ static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+static unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+		  &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+static unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+		  &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
 #define FUSE_SUPER_MAGIC 0x65735546
 
 #define FUSE_DEFAULT_BLKSIZE 512
@@ -735,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
 	.show_options	= fuse_show_options,
 };
 
+static void sanitize_global_limit(unsigned *limit)
+{
+	if (*limit == 0)
+		*limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+			 sizeof(struct fuse_req);
+
+	if (*limit >= 1 << 16)
+		*limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+	int rv;
+
+	rv = param_set_uint(val, kp);
+	if (rv)
+		return rv;
+
+	sanitize_global_limit((unsigned *)kp->arg);
+
+	return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+	int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+	if (arg->minor < 13)
+		return;
+
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
+	if (arg->max_background) {
+		fc->max_background = arg->max_background;
+
+		if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+			fc->max_background = max_user_bgreq;
+	}
+	if (arg->congestion_threshold) {
+		fc->congestion_threshold = arg->congestion_threshold;
+
+		if (!cap_sys_admin &&
+		    fc->congestion_threshold > max_user_congthresh)
+			fc->congestion_threshold = max_user_congthresh;
+	}
+}
+
 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct fuse_init_out *arg = &req->misc.init_out;
@@ -744,12 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	else {
 		unsigned long ra_pages;
 
-		if (arg->minor >= 13) {
-			if (arg->max_background)
-				fc->max_background = arg->max_background;
-			if (arg->congestion_threshold)
-				fc->congestion_threshold = arg->congestion_threshold;
-		}
+		process_init_limits(fc, arg);
+
 		if (arg->minor >= 6) {
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
@@ -1161,6 +1224,9 @@ static int __init fuse_init(void)
 	if (res)
 		goto err_sysfs_cleanup;
 
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
 	return 0;
 
  err_sysfs_cleanup:
-- 
cgit v1.2.2


From 79a9d99434b104c562f30f21b75317667f444793 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 26 Aug 2009 19:18:24 +0200
Subject: fuse: add fusectl interface to max_background

Make the max_background and congestion_threshold parameters of a FUSE
mount tunable at runtime by adding the respective knobs to its directory
within the fusectl filesystem.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/control.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h  |   6 ++-
 fs/fuse/inode.c   |   4 +-
 3 files changed, 141 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 218d514924c0..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
 	return simple_read_from_buffer(buf, len, ppos, tmp, size);
 }
 
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos, unsigned val)
+{
+	char tmp[32];
+	size_t size = sprintf(tmp, "%u\n", val);
+
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos, unsigned *val,
+				     unsigned global_limit)
+{
+	unsigned long t;
+	char tmp[32];
+	unsigned limit = (1 << 16) - 1;
+	int err;
+
+	if (*ppos || count >= sizeof(tmp) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(tmp, buf, count))
+		return -EINVAL;
+
+	tmp[count] = '\0';
+
+	err = strict_strtoul(tmp, 0, &t);
+	if (err)
+		return err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		limit = min(limit, global_limit);
+
+	if (t > limit)
+		return -EINVAL;
+
+	*val = t;
+
+	return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+					     char __user *buf, size_t len,
+					     loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->max_background;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+					      const char __user *buf,
+					      size_t count, loff_t *ppos)
+{
+	unsigned val;
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_bgreq);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->max_background = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+						   char __user *buf, size_t len,
+						   loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->congestion_threshold;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+						    const char __user *buf,
+						    size_t count, loff_t *ppos)
+{
+	unsigned val;
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_congthresh);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->congestion_threshold = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
 static const struct file_operations fuse_ctl_abort_ops = {
 	.open = nonseekable_open,
 	.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
 	.read = fuse_conn_waiting_read,
 };
 
+static const struct file_operations fuse_conn_max_background_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_max_background_read,
+	.write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_congestion_threshold_read,
+	.write = fuse_conn_congestion_threshold_write,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		goto err;
 
 	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
-				NULL, &fuse_ctl_waiting_ops) ||
+				 NULL, &fuse_ctl_waiting_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
-				 NULL, &fuse_ctl_abort_ops))
+				 NULL, &fuse_ctl_abort_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+				 1, NULL, &fuse_conn_max_background_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_congestion_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6bcfab04396f..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -32,7 +32,7 @@
 #define FUSE_NAME_MAX 1024
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
 
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
@@ -49,6 +49,10 @@ extern struct list_head fuse_conn_list;
 /** Global mutex protecting fuse_conn_list and the control filesystem */
 extern struct mutex fuse_mutex;
 
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e54ddbda208c..8f9aca8d4ad5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -31,7 +31,7 @@ DEFINE_MUTEX(fuse_mutex);
 
 static int set_global_limit(const char *val, struct kernel_param *kp);
 
-static unsigned max_user_bgreq;
+unsigned max_user_bgreq;
 module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
 		  &max_user_bgreq, 0644);
 __MODULE_PARM_TYPE(max_user_bgreq, "uint");
@@ -39,7 +39,7 @@ MODULE_PARM_DESC(max_user_bgreq,
  "Global limit for the maximum number of backgrounded requests an "
  "unprivileged user can set");
 
-static unsigned max_user_congthresh;
+unsigned max_user_congthresh;
 module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
 		  &max_user_congthresh, 0644);
 __MODULE_PARM_TYPE(max_user_congthresh, "uint");
-- 
cgit v1.2.2


From 2c96ce9f2084c1e04d02883e622f74a537a63aea Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 09:43:56 +0200
Subject: fs: remove bdev->bd_inode_backing_dev_info

It has been unused since it was introduced in:

commit 520808bf20e90fdbdb320264ba7dd5cf9d47dcac
Author: Andrew Morton <akpm@osdl.org>
Date:   Fri May 21 00:46:17 2004 -0700

    [PATCH] block device layer: separate backing_dev_info infrastructure

So lets just kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 1 -
 fs/inode.c            | 4 +---
 fs/nilfs2/the_nilfs.c | 4 +---
 3 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3581a4e53942..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	if (sb->s_bdev) {
 		struct backing_dev_info *bdi;
 
-		bdi = sb->s_bdev->bd_inode_backing_dev_info;
-		if (!bdi)
-			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		mapping->backing_dev_info = bdi;
 	}
 	inode->i_private = NULL;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d4168e269c5d..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
 
-	bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
-	if (!bdi)
-		bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
 	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
 
 	/* Finding last segment */
-- 
cgit v1.2.2


From 1fe06ad89255c211fe100d7f690d10b161398df8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 15:10:20 +0200
Subject: writeback: get rid of wbc->for_writepages

It's only set, it's never checked. Kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/afs/write.c          | 1 -
 fs/btrfs/ordered-data.c | 1 -
 fs/jbd2/commit.c        | 1 -
 fs/nfs/write.c          | 1 -
 4 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
-		.for_writepages = 1,
 		.range_cyclic	= 1,
 	};
 	int ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = start,
 		.range_end = end,
-		.for_writepages = 1,
 	};
 	return btrfs_writepages(mapping, &wbc);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..0df600e9162d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = 0,
 		.range_end = i_size_read(mapping->host),
-		.for_writepages = 1,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.for_writepages = 1,
 	};
 
 	return __nfs_write_mapping(mapping, &wbc, how);
-- 
cgit v1.2.2


From f0fad8a530e7cbad5f686dbca3079d1a626a3882 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 11 Sep 2009 09:47:56 +0200
Subject: writeback: merely wakeup flusher thread if work allocation fails for
 WB_SYNC_NONE

Since it's an opportunistic writeback and not a data integrity action,
don't punt to blocking writeback. Just wakeup the thread and it will
flush old data.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 46 ++++++++++++++--------------------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 628235cf44b5..783ed44c7cfe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,13 +75,6 @@ static inline void bdi_work_init(struct bdi_work *work,
 	work->state = WS_USED;
 }
 
-static inline void bdi_work_init_on_stack(struct bdi_work *work,
-					  struct writeback_control *wbc)
-{
-	bdi_work_init(work, wbc);
-	work->state |= WS_ONSTACK;
-}
-
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -207,34 +200,23 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 
 void bdi_start_writeback(struct writeback_control *wbc)
 {
-	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
-	struct bdi_work work_stack, *work = NULL;
-
-	if (!must_wait)
-		work = bdi_alloc_work(wbc);
+	/*
+	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
+	 * bdi_queue_work() will wake up the thread and flush old data. This
+	 * should ensure some amount of progress in freeing memory.
+	 */
+	if (wbc->sync_mode != WB_SYNC_ALL) {
+		struct bdi_work *w = bdi_alloc_work(wbc);
 
-	if (!work) {
-		work = &work_stack;
-		bdi_work_init_on_stack(work, wbc);
-	}
+		bdi_queue_work(wbc->bdi, w);
+	} else {
+		struct bdi_work work;
 
-	bdi_queue_work(wbc->bdi, work);
+		bdi_work_init(&work, wbc);
+		work.state |= WS_ONSTACK;
 
-	/*
-	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
-	 * complete. If not, we only need to wait for the work to be started,
-	 * if we allocated it on-stack. We use the same mechanism, if the
-	 * wait bit is set in the bdi_work struct, then threads will not
-	 * clear pending until after they are done.
-	 *
-	 * Note that work == &work_stack if must_wait is true, so we don't
-	 * need to do call_rcu() here ever, since the completion path will
-	 * have done that for us.
-	 */
-	if (must_wait || work == &work_stack) {
-		bdi_wait_on_work_clear(work);
-		if (work != &work_stack)
-			call_rcu(&work->rcu_head, bdi_work_free);
+		bdi_queue_work(wbc->bdi, &work);
+		bdi_wait_on_work_clear(&work);
 	}
 }
 
-- 
cgit v1.2.2


From c4a77a6c7dcff04a2abc7fe4b6b2ae605be41c5b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:18:25 +0200
Subject: writeback: make wb_writeback() take an argument structure

We need to be able to pass in range_cyclic as well, so instead
of growing yet another argument, split the arguments into a
struct wb_writeback_args structure that we can use internally.
Also makes it easier to just copy all members to an on-stack
struct, since we can't access work after clearing the pending
bit.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 78 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 783ed44c7cfe..c5e91225501d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -34,6 +34,17 @@
  */
 int nr_pdflush_threads;
 
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+	long nr_pages;
+	struct super_block *sb;
+	enum writeback_sync_modes sync_mode;
+	int for_kupdate;
+	int range_cyclic;
+};
+
 /*
  * Work items for the bdi_writeback threads
  */
@@ -45,9 +56,7 @@ struct bdi_work {
 	unsigned long seen;
 	atomic_t pending;
 
-	struct super_block *sb;
-	unsigned long nr_pages;
-	enum writeback_sync_modes sync_mode;
+	struct wb_writeback_args args;
 
 	unsigned long state;
 };
@@ -69,9 +78,11 @@ static inline void bdi_work_init(struct bdi_work *work,
 				 struct writeback_control *wbc)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->sb = wbc->sb;
-	work->nr_pages = wbc->nr_to_write;
-	work->sync_mode = wbc->sync_mode;
+	work->args.sb = wbc->sb;
+	work->args.nr_pages = wbc->nr_to_write;
+	work->args.sync_mode = wbc->sync_mode;
+	work->args.range_cyclic = wbc->range_cyclic;
+	work->args.for_kupdate = 0;
 	work->state = WS_USED;
 }
 
@@ -106,7 +117,7 @@ static void bdi_work_free(struct rcu_head *head)
 
 static void wb_work_complete(struct bdi_work *work)
 {
-	const enum writeback_sync_modes sync_mode = work->sync_mode;
+	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -653,17 +664,16 @@ static inline bool over_bground_thresh(void)
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
  * all dirty pages if they are all attached to "old" mappings.
  */
-static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
-			 struct super_block *sb,
-			 enum writeback_sync_modes sync_mode, int for_kupdate)
+static long wb_writeback(struct bdi_writeback *wb,
+			 struct wb_writeback_args *args)
 {
 	struct writeback_control wbc = {
 		.bdi			= wb->bdi,
-		.sb			= sb,
-		.sync_mode		= sync_mode,
+		.sb			= args->sb,
+		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= for_kupdate,
-		.range_cyclic		= 1,
+		.for_kupdate		= args->for_kupdate,
+		.range_cyclic		= args->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -673,13 +683,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
 	}
+	if (!wbc.range_cyclic) {
+		wbc.range_start = 0;
+		wbc.range_end = LLONG_MAX;
+	}
 
 	for (;;) {
 		/*
 		 * Don't flush anything for non-integrity writeback where
 		 * no nr_pages was given
 		 */
-		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+		if (!args->for_kupdate && args->nr_pages <= 0 &&
+		     args->sync_mode == WB_SYNC_NONE)
 			break;
 
 		/*
@@ -687,7 +702,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		 * periodic background writeout and we are below the
 		 * background dirty threshold, don't do anything
 		 */
-		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+		if (args->for_kupdate && args->nr_pages <= 0 &&
+		    !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
@@ -695,7 +711,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
 		writeback_inodes_wb(wb, &wbc);
-		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -749,8 +765,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	if (nr_pages)
-		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+	if (nr_pages) {
+		struct wb_writeback_args args = {
+			.nr_pages	= nr_pages,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_kupdate	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &args);
+	}
 
 	return 0;
 }
@@ -762,35 +786,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
 	struct bdi_work *work;
-	long nr_pages, wrote = 0;
+	long wrote = 0;
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		enum writeback_sync_modes sync_mode;
-
-		nr_pages = work->nr_pages;
+		struct wb_writeback_args args = work->args;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
 		 */
 		if (force_wait)
-			work->sync_mode = sync_mode = WB_SYNC_ALL;
-		else
-			sync_mode = work->sync_mode;
+			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (sync_mode == WB_SYNC_NONE)
+		if (args.sync_mode == WB_SYNC_NONE)
 			wb_clear_pending(wb, work);
 
-		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+		wrote += wb_writeback(wb, &args);
 
 		/*
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (sync_mode == WB_SYNC_ALL)
+		if (args.sync_mode == WB_SYNC_ALL)
 			wb_clear_pending(wb, work);
 	}
 
-- 
cgit v1.2.2


From 32a88aa1b6dfb901cec64e1898cac78d0f25028a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:02:33 +0200
Subject: fs: Assign bdi in super_block

We do this automatically in get_sb_bdev() from the set_bdev_super()
callback. Filesystems that have their own private backing_dev_info
must assign that in ->fill_super().

Note that ->s_bdi assignment is required for proper writeback!

Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 fs/fuse/inode.c    | 2 ++
 fs/nfs/super.c     | 2 ++
 fs/super.c         | 6 ++++++
 fs/sync.c          | 9 ++++++++-
 fs/ubifs/super.c   | 1 +
 6 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..e5dbecd87b0f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_put_conn;
 
+	sb->s_bdi = &fc->bdi;
+
 	/* Handle umasking inside the fuse code */
 	if (sb->s_flags & MS_POSIXACL)
 		fc->dont_mask = 1;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 	if (server->flags & NFS_MOUNT_NOAC)
 		sb->s_flags |= MS_SYNCHRONOUS;
 
+	sb->s_bdi = &server->backing_dev_info;
+
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 192340930bb4..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
+	/*
+	 * This should be safe, as we require bdi backing to actually
+	 * write out data in the first place
+	 */
+	if (!sb->s_bdi)
+		return 0;
+
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
 		spin_unlock(&sb_lock);
 
 		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..c4af069df1ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto out_bdi;
 
+	sb->s_bdi = &c->bdi;
 	sb->s_fs_info = c;
 	sb->s_magic = UBIFS_SUPER_MAGIC;
 	sb->s_blocksize = UBIFS_BLOCK_SIZE;
-- 
cgit v1.2.2


From f11fcae8401a3175f528e2f7917362645d570111 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 09:53:35 +0200
Subject: writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout

Data integrity writeback must use bdi_start_writeback() and ensure
that wbc->sb and wbc->bdi are set.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 70 +++++++++++--------------------------------------------
 1 file changed, 14 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c5e91225501d..14f06b459197 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -50,7 +50,6 @@ struct wb_writeback_args {
  */
 struct bdi_work {
 	struct list_head list;
-	struct list_head wait_list;
 	struct rcu_head rcu_head;
 
 	unsigned long seen;
@@ -198,7 +197,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 		    TASK_UNINTERRUPTIBLE);
 }
 
-static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+				 struct writeback_control *wbc)
 {
 	struct bdi_work *work;
 
@@ -206,7 +206,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 	if (work)
 		bdi_work_init(work, wbc);
 
-	return work;
+	bdi_queue_work(bdi, work);
 }
 
 void bdi_start_writeback(struct writeback_control *wbc)
@@ -216,11 +216,9 @@ void bdi_start_writeback(struct writeback_control *wbc)
 	 * bdi_queue_work() will wake up the thread and flush old data. This
 	 * should ensure some amount of progress in freeing memory.
 	 */
-	if (wbc->sync_mode != WB_SYNC_ALL) {
-		struct bdi_work *w = bdi_alloc_work(wbc);
-
-		bdi_queue_work(wbc->bdi, w);
-	} else {
+	if (wbc->sync_mode != WB_SYNC_ALL)
+		bdi_alloc_queue_work(wbc->bdi, wbc);
+	else {
 		struct bdi_work work;
 
 		bdi_work_init(&work, wbc);
@@ -860,67 +858,26 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. Expensive! If this is a data
- * integrity operation, writeback will be complete when this returns. If
- * we are simply called for WB_SYNC_NONE, then writeback will merely be
- * scheduled to run.
+ * Schedule writeback for all backing devices. Can only be used for
+ * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
+ * and pass in the superblock.
  */
 static void bdi_writeback_all(struct writeback_control *wbc)
 {
-	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
 	struct backing_dev_info *bdi;
-	struct bdi_work *work;
-	LIST_HEAD(list);
 
-restart:
+	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
 	spin_lock(&bdi_lock);
 
 	list_for_each_entry(bdi, &bdi_list, bdi_list) {
-		struct bdi_work *work;
-
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		/*
-		 * If work allocation fails, do the writes inline. We drop
-		 * the lock and restart the list writeout. This should be OK,
-		 * since this happens rarely and because the writeout should
-		 * eventually make more free memory available.
-		 */
-		work = bdi_alloc_work(wbc);
-		if (!work) {
-			struct writeback_control __wbc;
-
-			/*
-			 * Not a data integrity writeout, just continue
-			 */
-			if (!must_wait)
-				continue;
-
-			spin_unlock(&bdi_lock);
-			__wbc = *wbc;
-			__wbc.bdi = bdi;
-			writeback_inodes_wbc(&__wbc);
-			goto restart;
-		}
-		if (must_wait)
-			list_add_tail(&work->wait_list, &list);
-
-		bdi_queue_work(bdi, work);
+		bdi_alloc_queue_work(bdi, wbc);
 	}
 
 	spin_unlock(&bdi_lock);
-
-	/*
-	 * If this is for WB_SYNC_ALL, wait for pending work to complete
-	 * before returning.
-	 */
-	while (!list_empty(&list)) {
-		work = list_entry(list.next, struct bdi_work, wait_list);
-		list_del(&work->wait_list);
-		bdi_wait_on_work_clear(work);
-		call_rcu(&work->rcu_head, bdi_work_free);
-	}
 }
 
 /*
@@ -1177,6 +1134,7 @@ long sync_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
 		.sb		= sb,
+		.bdi		= sb->s_bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -1184,7 +1142,7 @@ long sync_inodes_sb(struct super_block *sb)
 	long nr_to_write = LONG_MAX; /* doesn't actually matter */
 
 	wbc.nr_to_write = nr_to_write;
-	bdi_writeback_all(&wbc);
+	bdi_start_writeback(&wbc);
 	wait_sb_inodes(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
-- 
cgit v1.2.2


From cfc4ba5365449cb6b5c9f68d755a142f17da1e47 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 14 Sep 2009 13:12:40 +0200
Subject: writeback: use RCU to protect bdi_list

Now that bdi_writeback_all() no longer handles integrity writeback,
it doesn't have to block anymore. This means that we can switch
bdi_list reader side protection to RCU.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 14f06b459197..f8cd7a97f5b7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -868,16 +868,16 @@ static void bdi_writeback_all(struct writeback_control *wbc)
 
 	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
 
-	spin_lock(&bdi_lock);
+	rcu_read_lock();
 
-	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
 		bdi_alloc_queue_work(bdi, wbc);
 	}
 
-	spin_unlock(&bdi_lock);
+	rcu_read_unlock();
 }
 
 /*
-- 
cgit v1.2.2


From bcddc3f01c9122882c8b9f12ab94a934e55aef97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 13 Sep 2009 20:07:36 +0200
Subject: writeback: inline allocation failure handling in
 bdi_alloc_queue_work()

This gets rid of work == NULL in bdi_queue_work() and puts the
OOM handling where it belongs.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8cd7a97f5b7..59b3ee63b624 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -149,21 +149,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 
 static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 {
-	if (work) {
-		work->seen = bdi->wb_mask;
-		BUG_ON(!work->seen);
-		atomic_set(&work->pending, bdi->wb_cnt);
-		BUG_ON(!bdi->wb_cnt);
+	work->seen = bdi->wb_mask;
+	BUG_ON(!work->seen);
+	atomic_set(&work->pending, bdi->wb_cnt);
+	BUG_ON(!bdi->wb_cnt);
 
-		/*
-		 * Make sure stores are seen before it appears on the list
-		 */
-		smp_mb();
+	/*
+	 * Make sure stores are seen before it appears on the list
+	 */
+	smp_mb();
 
-		spin_lock(&bdi->wb_lock);
-		list_add_tail_rcu(&work->list, &bdi->work_list);
-		spin_unlock(&bdi->wb_lock);
-	}
+	spin_lock(&bdi->wb_lock);
+	list_add_tail_rcu(&work->list, &bdi->work_list);
+	spin_unlock(&bdi->wb_lock);
 
 	/*
 	 * If the default thread isn't there, make sure we add it. When
@@ -175,14 +173,12 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 		struct bdi_writeback *wb = &bdi->wb;
 
 		/*
-		 * If we failed allocating the bdi work item, wake up the wb
-		 * thread always. As a safety precaution, it'll flush out
-		 * everything
+		 * End work now if this wb has no dirty IO pending. Otherwise
+		 * wakeup the handling thread
 		 */
-		if (!wb_has_dirty_io(wb)) {
-			if (work)
-				wb_clear_pending(wb, work);
-		} else if (wb->task)
+		if (!wb_has_dirty_io(wb))
+			wb_clear_pending(wb, work);
+		else if (wb->task)
 			wake_up_process(wb->task);
 	}
 }
@@ -202,11 +198,20 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 {
 	struct bdi_work *work;
 
+	/*
+	 * This is WB_SYNC_NONE writeback, so if allocation fails just
+	 * wakeup the thread for old dirty data writeback
+	 */
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work)
+	if (work) {
 		bdi_work_init(work, wbc);
+		bdi_queue_work(bdi, work);
+	} else {
+		struct bdi_writeback *wb = &bdi->wb;
 
-	bdi_queue_work(bdi, work);
+		if (wb->task)
+			wake_up_process(wb->task);
+	}
 }
 
 void bdi_start_writeback(struct writeback_control *wbc)
-- 
cgit v1.2.2


From b6e51316daede0633e9274e1e30391cfa4747877 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:13:54 +0200
Subject: writeback: separate starting of sync vs opportunistic writeback

bdi_start_writeback() is currently split into two paths, one for
WB_SYNC_NONE and one for WB_SYNC_ALL. Add bdi_sync_writeback()
for WB_SYNC_ALL writeback and let bdi_start_writeback() handle
only WB_SYNC_NONE.

Push down the writeback_control allocation and only accept the
parameters that make sense for each function. This cleans up
the API considerably.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 132 +++++++++++++++++++++++++++---------------------------
 fs/ubifs/budget.c |  20 ++-------
 2 files changed, 70 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59b3ee63b624..5887328b5a06 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -74,14 +74,10 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
 }
 
 static inline void bdi_work_init(struct bdi_work *work,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->args.sb = wbc->sb;
-	work->args.nr_pages = wbc->nr_to_write;
-	work->args.sync_mode = wbc->sync_mode;
-	work->args.range_cyclic = wbc->range_cyclic;
-	work->args.for_kupdate = 0;
+	work->args = *args;
 	work->state = WS_USED;
 }
 
@@ -194,7 +190,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -204,7 +200,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	 */
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
-		bdi_work_init(work, wbc);
+		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -214,24 +210,54 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	}
 }
 
-void bdi_start_writeback(struct writeback_control *wbc)
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ *   This does WB_SYNC_ALL data integrity writeback and waits for the
+ *   IO to complete. Callers must hold the sb s_umount semaphore for
+ *   reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+			       struct super_block *sb)
 {
-	/*
-	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
-	 * bdi_queue_work() will wake up the thread and flush old data. This
-	 * should ensure some amount of progress in freeing memory.
-	 */
-	if (wbc->sync_mode != WB_SYNC_ALL)
-		bdi_alloc_queue_work(wbc->bdi, wbc);
-	else {
-		struct bdi_work work;
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+	};
+	struct bdi_work work;
 
-		bdi_work_init(&work, wbc);
-		work.state |= WS_ONSTACK;
+	bdi_work_init(&work, &args);
+	work.state |= WS_ONSTACK;
 
-		bdi_queue_work(wbc->bdi, &work);
-		bdi_wait_on_work_clear(&work);
-	}
+	bdi_queue_work(bdi, &work);
+	bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarentees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+{
+	struct wb_writeback_args args = {
+		.sync_mode	= WB_SYNC_NONE,
+		.nr_pages	= nr_pages,
+		.range_cyclic	= 1,
+	};
+
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -863,23 +889,25 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. Can only be used for
- * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
- * and pass in the superblock.
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
  */
-static void bdi_writeback_all(struct writeback_control *wbc)
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 {
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+	};
 	struct backing_dev_info *bdi;
 
-	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, wbc);
+		bdi_alloc_queue_work(bdi, &args);
 	}
 
 	rcu_read_unlock();
@@ -891,17 +919,10 @@ static void bdi_writeback_all(struct writeback_control *wbc)
  */
 void wakeup_flusher_threads(long nr_pages)
 {
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.range_cyclic	= 1,
-	};
-
 	if (nr_pages == 0)
 		nr_pages = global_page_state(NR_FILE_DIRTY) +
 				global_page_state(NR_UNSTABLE_NFS);
-	wbc.nr_to_write = nr_pages;
-	bdi_writeback_all(&wbc);
+	bdi_writeback_all(NULL, nr_pages);
 }
 
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1048,7 +1069,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-static void wait_sb_inodes(struct writeback_control *wbc)
+static void wait_sb_inodes(struct super_block *sb)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1056,7 +1077,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * We need to be protected against the filesystem going from
 	 * r/o to r/w or vice versa.
 	 */
-	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	spin_lock(&inode_lock);
 
@@ -1067,7 +1088,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * In which case, the inode may not be on the dirty list, but
 	 * we still have to wait for that writeout.
 	 */
-	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1107,14 +1128,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
  * for IO completion of submitted IO. The number of pages submitted is
  * returned.
  */
-long writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	long nr_to_write;
@@ -1122,9 +1137,7 @@ long writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	wbc.nr_to_write = nr_to_write;
-	bdi_writeback_all(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_writeback_all(sb, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1135,21 +1148,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block. The number of pages synced is returned.
  */
-long sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.bdi		= sb->s_bdi,
-		.sync_mode	= WB_SYNC_ALL,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
-	long nr_to_write = LONG_MAX; /* doesn't actually matter */
-
-	wbc.nr_to_write = nr_to_write;
-	bdi_start_writeback(&wbc);
-	wait_sb_inodes(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_sync_writeback(sb->s_bdi, sb);
+	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..ee1ce68fd98b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
  * @nr_to_write: how many dirty pages to write-back
  *
  * This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
  *
  * Note, this function synchronizes even VFS inodes which are locked
  * (@i_mutex) by the caller of the budgeting function, because write-back does
  * not touch @i_mutex.
  */
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	int nr_written;
-
-	nr_written = writeback_inodes_sb(c->vfs_sb);
-	if (!nr_written) {
-		/*
-		 * Re-try again but wait on pages/inodes which are being
-		 * written-back concurrently (e.g., by pdflush).
-		 */
-		nr_written = sync_inodes_sb(c->vfs_sb);
-	}
-
-	dbg_budg("%d pages were written back", nr_written);
-	return nr_written;
+	writeback_inodes_sb(c->vfs_sb);
 }
 
 /**
-- 
cgit v1.2.2


From 8010c3b6349b407f8f11b3f4d7e9f94cb00fe528 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 20:04:57 +0200
Subject: writeback: add comments to bdi_work structure

And document its retriever, get_next_work_item().

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5887328b5a06..55f0d4e51b59 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -49,15 +49,15 @@ struct wb_writeback_args {
  * Work items for the bdi_writeback threads
  */
 struct bdi_work {
-	struct list_head list;
-	struct rcu_head rcu_head;
+	struct list_head list;		/* pending work list */
+	struct rcu_head rcu_head;	/* for RCU free/clear of work */
 
-	unsigned long seen;
-	atomic_t pending;
+	unsigned long seen;		/* threads that have seen this work */
+	atomic_t pending;		/* number of threads still to do work */
 
-	struct wb_writeback_args args;
+	struct wb_writeback_args args;	/* writeback arguments */
 
-	unsigned long state;
+	unsigned long state;		/* flag bits, see WS_* */
 };
 
 enum {
@@ -758,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 
 /*
  * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
  */
 static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 					   struct bdi_writeback *wb)
-- 
cgit v1.2.2


From 49db041430e8a856dbc3af15430bf068f1c74655 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 21:27:40 +0200
Subject: writeback: use schedule_timeout_interruptible()

Gets rid of a manual set_current_state().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 55f0d4e51b59..34757758511a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -884,8 +884,7 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		}
 
 		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(wait_jiffies);
+		schedule_timeout_interruptible(wait_jiffies);
 		try_to_freeze();
 	}
 
-- 
cgit v1.2.2


From deed62edffe600bc5b379c872d3004116e001b66 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:32:58 +0200
Subject: writeback: remove smp_mb(), it's not needed with list_add_tail_rcu()

list_add_tail_rcu contains required barriers.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34757758511a..59c99e729187 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -151,10 +151,10 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	BUG_ON(!bdi->wb_cnt);
 
 	/*
-	 * Make sure stores are seen before it appears on the list
+	 * list_add_tail_rcu() contains the necessary barriers to
+	 * make sure the above stores are seen before the item is
+	 * noticed on the list
 	 */
-	smp_mb();
-
 	spin_lock(&bdi->wb_lock);
 	list_add_tail_rcu(&work->list, &bdi->work_list);
 	spin_unlock(&bdi->wb_lock);
-- 
cgit v1.2.2


From 77fad5e625e56eb31a343ae1d489979fdc61a2aa Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:34:12 +0200
Subject: writeback: improve scalability of bdi writeback work queues

If you're going to do an atomic RMW on each list entry, there's not much
point in all the RCU complexities of the list walking. This is only going
to help the multi-thread case I guess, but it doesn't hurt to do now.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c99e729187..6bca6f8176f0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -772,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_and_clear_bit(wb->nr, &work->seen))
+		if (!test_bit(wb->nr, &work->seen))
 			continue;
+		clear_bit(wb->nr, &work->seen);
 
 		ret = work;
 		break;
-- 
cgit v1.2.2


From 77b9d059cb3ddb8b1246d5878e81d52926550b23 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:34:51 +0200
Subject: writeback: Fix bdi use after free in wb_work_complete()

By the time bdi_work_on_stack gets evaluated again in bdi_work_free, it
can already have been deallocated and used for something else in the
!on stack case, giving a false positive in this test and causing
corruption.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6bca6f8176f0..7eba7326b9b9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -113,6 +113,7 @@ static void bdi_work_free(struct rcu_head *head)
 static void wb_work_complete(struct bdi_work *work)
 {
 	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+	int onstack = bdi_work_on_stack(work);
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -120,9 +121,9 @@ static void wb_work_complete(struct bdi_work *work)
 	 * to after the RCU grace period, since the stack could be invalidated
 	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (!bdi_work_on_stack(work))
+	if (!onstack)
 		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+	if (sync_mode == WB_SYNC_NONE || onstack)
 		call_rcu(&work->rcu_head, bdi_work_free);
 }
 
-- 
cgit v1.2.2


From 1ef7d9aa32a8ee054c4d4fdcd2ea537c04d61b2f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:37:55 +0200
Subject: writeback: fix possible bdi writeback refcounting problem

wb_clear_pending AFAIKS should not be called after the item has been
put on the list, except by the worker threads. It could lead to the
situation where the refcount is decremented below 0 and cause lots of
problems.

Presumably the !wb_has_dirty_io case is not a common one, so it can
be discovered when the thread wakes up to check?

Also add a comment in bdi_work_clear.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7eba7326b9b9..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -97,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work)
 {
 	clear_bit(WS_USED_B, &work->state);
 	smp_mb__after_clear_bit();
+	/*
+	 * work can have disappeared at this point. bit waitq functions
+	 * should be able to tolerate this, provided bdi_sched_wait does
+	 * not dereference it's pointer argument.
+	*/
 	wake_up_bit(&work->state, WS_USED_B);
 }
 
@@ -169,13 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	else {
 		struct bdi_writeback *wb = &bdi->wb;
 
-		/*
-		 * End work now if this wb has no dirty IO pending. Otherwise
-		 * wakeup the handling thread
-		 */
-		if (!wb_has_dirty_io(wb))
-			wb_clear_pending(wb, work);
-		else if (wb->task)
+		if (wb->task)
 			wake_up_process(wb->task);
 	}
 }
-- 
cgit v1.2.2


From b449fc6fcc07a392c69f3c1db9a4ad4dda8cbcba Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Thu, 30 Jul 2009 20:09:46 +0200
Subject: JBD: round commit timer up to avoid uncommitted transaction

Fix jiffie rounding in jbd commit timer setup code.  Rounding down could cause
the timer to be fired before the corresponding transaction has expired.  That
transaction can stay not committed forever if no new transaction is created or
explicit sync/umount happens.

Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..833c1675a00c 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires =
+				round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
-- 
cgit v1.2.2


From 9c28cbccec66a5ca292c6659bf5a0fe0c8459fa7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 3 Aug 2009 19:21:00 +0200
Subject: jbd: Journal block numbers can ever be only 32-bit use unsigned int
 for them

It does not make sense to store block number for journal as unsigned long
since they can be only 32-bit (because of on-disk format limitation). So
change in-memory structures and variables to use unsigned int instead.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/checkpoint.c |  6 +++---
 fs/jbd/commit.c     |  2 +-
 fs/jbd/journal.c    | 30 +++++++++++++++---------------
 fs/jbd/recovery.c   | 18 +++++++++---------
 fs/jbd/revoke.c     | 16 ++++++++--------
 5 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
 {
 	transaction_t * transaction;
 	tid_t		first_tid;
-	unsigned long	blocknr, freed;
+	unsigned int	blocknr, freed;
 
 	if (is_journal_aborted(journal))
 		return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
 		freed = freed + journal->j_last - journal->j_first;
 
 	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
-		  "freeing %lu\n",
+		  "Cleaning journal tail from %d to %d (offset %u), "
+		  "freeing %u\n",
 		  journal->j_tail_sequence, first_tid, blocknr, freed);
 
 	journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
 	int bufs;
 	int flags;
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	ktime_t start_time;
 	u64 commit_time;
 	char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
 int journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
 				  struct journal_head **jh_out,
-				  unsigned long blocknr)
+				  unsigned int blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
  * Log buffer allocation routines:
  */
 
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	spin_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
  * this is a no-op.  If needed, we can use j_blk_offset - everything is
  * ready.
  */
-int journal_bmap(journal_t *journal, unsigned long blocknr,
-		 unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+		 unsigned int *retp)
 {
 	int err = 0;
-	unsigned long ret;
+	unsigned int ret;
 
 	if (journal->j_inode) {
 		ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 			char b[BDEVNAME_SIZE];
 
 			printk(KERN_ALERT "%s: journal block not found "
-					"at offset %lu on %s\n",
+					"at offset %u on %s\n",
 				__func__,
 				blocknr,
 				bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	int err;
 
 	err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	journal_t *journal = journal_init_common();
 	int err;
 	int n;
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	if (!journal)
 		return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
 static int journal_reset(journal_t *journal)
 {
 	journal_superblock_t *sb = journal->j_superblock;
-	unsigned long first, last;
+	unsigned int first, last;
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
 	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
-		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
 		       first, last);
 		journal_fail_superblock(journal);
 		return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
  **/
 int journal_create(journal_t *journal)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 	journal_superblock_t *sb;
 	int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (sb->s_start == 0 && journal->j_tail_sequence ==
 				journal->j_transaction_sequence) {
 		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
-			"(start %ld, seq %d, errno %d)\n",
+			"(start %u, seq %d, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		goto out;
 	}
 
 	spin_lock(&journal->j_state_lock);
-	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
-	unsigned long old_tail;
+	unsigned int old_tail;
 
 	spin_lock(&journal->j_state_lock);
 
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 {
 	int err;
 	unsigned int max, nbufs, next;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
 	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned long		next_log_block;
+	unsigned int		next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
 			  next_commit_ID, next_log_block, journal->j_last);
 
 		/* Skip over each chunk of the transaction looking
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
 			tagp = &bh->b_data[sizeof(journal_header_t)];
 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
 			       <= journal->j_blocksize) {
-				unsigned long io_block;
+				unsigned int io_block;
 
 				tag = (journal_block_tag_t *) tagp;
 				flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
 					success = err;
 					printk (KERN_ERR
 						"JBD: IO error %d recovering "
-						"block %ld in log\n",
+						"block %u in log\n",
 						err, io_block);
 				} else {
-					unsigned long blocknr;
+					unsigned int blocknr;
 
 					J_ASSERT(obh != NULL);
 					blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	max = be32_to_cpu(header->r_count);
 
 	while (offset < max) {
-		unsigned long blocknr;
+		unsigned int blocknr;
 		int err;
 
 		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
 {
 	struct list_head  hash;
 	tid_t		  sequence;	/* Used for recovery only */
-	unsigned long	  blocknr;
+	unsigned int	  blocknr;
 };
 
 
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 /* Utility functions to maintain the revoke table */
 
 /* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
 	int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
 			      tid_t seq)
 {
 	struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
 /* Find a revoke record in the journal's hash table. */
 
 static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-						      unsigned long blocknr)
+						      unsigned int blocknr)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
  * by one.
  */
 
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
 		   struct buffer_head *bh_in)
 {
 	struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		}
 	}
 
-	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
  */
 
 int journal_set_revoke(journal_t *journal,
-		       unsigned long blocknr,
+		       unsigned int blocknr,
 		       tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
  */
 
 int journal_test_revoke(journal_t *journal,
-			unsigned long blocknr,
+			unsigned int blocknr,
 			tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
-- 
cgit v1.2.2


From 3adae9da0b35d2ca908039f42a1e90395c335181 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 11 Aug 2009 17:27:21 +0200
Subject: jbd: Annotate transaction start also for journal_restart()

lockdep annotation for a transaction start has been at the end of
journal_start(). But a transaction is also started from journal_restart(). Move
the lockdep annotation to start_this_handle() which covers both cases.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 833c1675a00c..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -229,6 +229,8 @@ repeat_locked:
 		  __log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -293,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
-
 out:
 	return handle;
 }
@@ -417,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
 	__log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
-- 
cgit v1.2.2


From 00171d3c7e3b738ba582c7a9b37408e796f49046 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 11 Aug 2009 19:06:10 +0200
Subject: ext3: Fix possible deadlock between ext3_truncate() and
 ext3_get_blocks()

During truncate we are sometimes forced to start a new transaction as the
amount of blocks to be journaled is both quite large and hard to predict. So
far we restarted a transaction while holding truncate_mutex and that violates
lock ordering because truncate_mutex ranks below transaction start (and it
can lead to a real deadlock with ext3_get_blocks() allocating new blocks
from ext3_writepage()).

Luckily, the problem is easy to fix: We just drop the truncate_mutex before
restarting the transaction and acquire it afterwards. We are safe to do this as
by the time ext3_truncate() is called, all the page cache for the truncated
part of the file is dropped and so writepage() cannot come and allocate new
blocks in the part of the file we are truncating. The rest of writers is
stopped by us holding i_mutex.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..eb0b4e0d517b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
 {
+	int ret;
+
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext3_journal_restart(handle, blocks_for_truncate(inode));
+	/*
+	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+	 * At this moment, get_block can be called only for blocks inside
+	 * i_size since page cache has been already dropped and writes are
+	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
+	 */
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+	return ret;
 }
 
 /*
@@ -2072,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
 			ext3_journal_dirty_metadata(handle, bh);
 		}
 		ext3_mark_inode_dirty(handle, inode);
-		ext3_journal_test_restart(handle, inode);
+		truncate_restart_transaction(handle, inode);
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext3_mark_inode_dirty(handle, inode);
-				ext3_journal_test_restart(handle, inode);
+				truncate_restart_transaction(handle, inode);
 			}
 
 			ext3_free_blocks(handle, inode, nr, 1);
-- 
cgit v1.2.2


From 4f003fd32bc54ec438b8691795279844df27ce38 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Sep 2009 00:22:14 +0200
Subject: ext3: Add locking to ext3_do_update_inode

I've been struggling with this off and on while I've been testing the
data=guarded work.  The symptom is corrupted orphan lists and inodes
with the wrong i_size stored on disk.  I was convinced the
data=guarded code was just missing a call to ext3_mark_inode_dirty, but
tracing showed the i_disksize I was sending to ext3_mark_inode_dirty
wasn't actually making it to the drive.

ext3_mark_inode_dirty can be called without locks held (atime updates
and a few others), so the data=guarded code uses locks while updating
the in-memory inode, and then calls ext3_mark_inode_dirty
without any locks held.

But, ext3_mark_inode_dirty has no internal locking to make sure that
only one CPU is updating the buffer head at a time.  Generally this
works out ok because everyone that changes the inode then calls
ext3_mark_inode_dirty themselves.  Even though it races, eventually
someone updates the buffer heads and things move on.

But there is still a risk of the wrong values getting in, and the
data=guarded code seems to hit the race very often.

Since everyone that changes the inode also logs it, it should be
possible to fix this with some memory barriers.  I'll leave that as an
exercise to the reader and lock the buffer head instead.

It it probably a good idea to have a different patch series for lockless
bit flipping on the ext3 i_state field.  ext3_do_update_inode &= clears
EXT3_STATE_NEW without any locks held.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb0b4e0d517b..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2903,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
 
+again:
+	/* we can't allow multiple procs in here at once, its a bit racey */
+	lock_buffer(bh);
+
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ei->i_state & EXT3_STATE_NEW)
@@ -2962,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
 			       /* If this is the first large file
 				* created, add a flag to the superblock.
 				*/
+				unlock_buffer(bh);
 				err = ext3_journal_get_write_access(handle,
 						EXT3_SB(sb)->s_sbh);
 				if (err)
 					goto out_brelse;
+
 				ext3_update_dynamic_rev(sb);
 				EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
 				handle->h_sync = 1;
 				err = ext3_journal_dirty_metadata(handle,
 						EXT3_SB(sb)->s_sbh);
+				/* get our lock and start over */
+				goto again;
 			}
 		}
 	}
@@ -2994,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 
 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+	unlock_buffer(bh);
 	rc = ext3_journal_dirty_metadata(handle, bh);
 	if (!err)
 		err = rc;
-- 
cgit v1.2.2


From 56fcad29d4b3cbcbb2ed47a9d3ceca3f57175417 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Sep 2009 14:59:42 +0200
Subject: ext3: Flush disk caches on fsync when needed

In case we fsync() a file and inode is not dirty, we don't force a transaction
to disk and hence don't flush disk caches. Thus file data could be just in disk
caches and not on persistent storage. Fix the problem by flushing disk caches
if we didn't force a transaction commit.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/fsync.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/time.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	}
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
+		goto flush;
 
 	/*
 	 * The VFS has written the file data.  If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
+		goto out;
 	}
+flush:
+	/*
+	 * In case we didn't commit a transaction, we have to flush
+	 * disk caches manually so that data really is on persistent
+	 * storage
+	 */
+	if (test_opt(inode->i_sb, BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 out:
 	return ret;
 }
-- 
cgit v1.2.2


From e8505970af46658ece2545e9bc1fe594998fdcdf Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 13:46:38 -0400
Subject: ext4: Replace get_ext_path macro with an inline funciton

Replace get_ext_path macro with an inline function,
since this macro looks like a function call but its arguments
get modified. Ted pointed this out, thanks.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 55 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4c4491cef357..e4bd8761498a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,29 @@
 #include "ext4_extents.h"
 #include "ext4.h"
 
-#define get_ext_path(path, inode, block, ret)		\
-	do {								\
-		path = ext4_ext_find_extent(inode, block, path);	\
-		if (IS_ERR(path)) {					\
-			ret = PTR_ERR(path);				\
-			path = NULL;					\
-		}							\
-	} while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode:	an inode which is searched
+ * @lblock:	logical block number to find an extent path
+ * @path:	pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+		struct ext4_ext_path **path)
+{
+	int ret = 0;
+
+	*path = ext4_ext_find_extent(inode, lblock, *path);
+	if (IS_ERR(*path)) {
+		ret = PTR_ERR(*path);
+		*path = NULL;
+	}
+	return ret;
+}
 
 /**
  * copy_extent_status - Copy the extent's initialization status
@@ -283,7 +298,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (new_flag) {
-		get_ext_path(orig_path, orig_inode, eblock, err);
+		err = get_ext_path(orig_inode, eblock, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 
@@ -293,8 +308,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (end_flag) {
-		get_ext_path(orig_path, orig_inode,
-				      le32_to_cpu(end_ext->ee_block) - 1, err);
+		err = get_ext_path(orig_inode,
+				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 
@@ -631,12 +646,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	mext_double_down_write(orig_inode, donor_inode);
 
 	/* Get the original extent for the block "orig_off" */
-	get_ext_path(orig_path, orig_inode, orig_off, err);
+	err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (orig_path == NULL)
 		goto out;
 
 	/* Get the donor extent for the head */
-	get_ext_path(donor_path, donor_inode, donor_off, err);
+	err = get_ext_path(donor_inode, donor_off, &donor_path);
 	if (donor_path == NULL)
 		goto out;
 	depth = ext_depth(orig_inode);
@@ -678,7 +693,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, orig_off, err);
+		err = get_ext_path(orig_inode, orig_off, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 		depth = ext_depth(orig_inode);
@@ -692,8 +707,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
-		get_ext_path(donor_path, donor_inode,
-				      donor_off, err);
+		err = get_ext_path(donor_inode, donor_off, &donor_path);
 		if (donor_path == NULL)
 			goto out;
 		depth = ext_depth(donor_inode);
@@ -1154,12 +1168,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (file_end < block_end)
 		len -= block_end - file_end;
 
-	get_ext_path(orig_path, orig_inode, block_start, ret);
+	ret = get_ext_path(orig_inode, block_start, &orig_path);
 	if (orig_path == NULL)
 		goto out2;
 
 	/* Get path structure to check the hole */
-	get_ext_path(holecheck_path, orig_inode, block_start, ret);
+	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
 	if (holecheck_path == NULL)
 		goto out;
 
@@ -1289,8 +1303,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
-		get_ext_path(holecheck_path, orig_inode,
-				      seq_start, ret);
+		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
 		if (holecheck_path == NULL)
 			break;
 		depth = holecheck_path->p_depth;
@@ -1298,7 +1311,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, seq_start, ret);
+		ret = get_ext_path(orig_inode, seq_start, &orig_path);
 		if (orig_path == NULL)
 			break;
 
-- 
cgit v1.2.2


From 2147b1a6a48e28399120ca51d4a91840a278611f Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 13:46:35 -0400
Subject: ext4: Replace BUG_ON() with ext4_error() in move_extents.c

Replace BUG_ON calls with a call to ext4_error()
to print an error message if EXT4_IOC_MOVE_EXT failed
with some kind of reasons.  This will help to debug.
Ted pointed this out, thanks.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 149 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e4bd8761498a..2258560e9722 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -127,6 +127,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 	return 1;
 }
 
+/**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+		const char *function)
+{
+	int ret = 0;
+
+	if (inode1 == NULL) {
+		ext4_error(inode2->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 NULL inode2 %lu", inode2->i_ino);
+		ret = -EIO;
+	} else if (inode2 == NULL) {
+		ext4_error(inode1->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 %lu inode2 NULL", inode1->i_ino);
+		ret = -EIO;
+	}
+	return ret;
+}
+
 /**
  * mext_double_down_read - Acquire two inodes' read semaphore
  *
@@ -139,8 +164,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -167,8 +190,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -193,8 +214,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_read(&EXT4_I(orig_inode)->i_data_sem);
 	up_read(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -209,8 +228,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -534,7 +551,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * oext      |-----------|
 	 * new_ext       |-------|
 	 */
-	BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+		ext4_error(orig_inode->i_sb, __func__,
+			"new_ext_end(%u) should be less than or equal to "
+			"oext->ee_block(%u) + oext_alen(%d) - 1",
+			new_ext_end, le32_to_cpu(oext->ee_block),
+			oext_alen);
+		ret = -EIO;
+		goto out;
+	}
 
 	/*
 	 * Case: new_ext is smaller than original extent
@@ -558,6 +583,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 
 	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
 				o_end, &start_ext, &new_ext, &end_ext);
+out:
 	return ret;
 }
 
@@ -668,7 +694,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	/* Loop for the donor extents */
 	while (1) {
 		/* The extent for donor must be found. */
-		BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+		if (!dext) {
+			ext4_error(donor_inode->i_sb, __func__,
+				   "The extent for donor must be found");
+			err = -EIO;
+			goto out;
+		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+			ext4_error(donor_inode->i_sb, __func__,
+				"Donor offset(%u) and the first block of donor "
+				"extent(%u) should be equal",
+				donor_off,
+				le32_to_cpu(tmp_dext.ee_block));
+			err = -EIO;
+			goto out;
+		}
 
 		/* Set donor extent to orig extent */
 		err = mext_leaf_block(handle, orig_inode,
@@ -1050,18 +1089,23 @@ mext_check_arguments(struct inode *orig_inode,
  * @inode1:	the inode structure
  * @inode2:	the inode structure
  *
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  */
-static void
+static int
 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
+	if (inode1 == inode2) {
+		mutex_lock(&inode1->i_mutex);
+		goto out;
 	}
 
 	if (inode1->i_ino < inode2->i_ino) {
@@ -1071,6 +1115,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
 		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
 	}
+
+out:
+	return ret;
 }
 
 /**
@@ -1079,17 +1126,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  */
 
-static void
+static int
 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
 	if (inode1)
 		mutex_unlock(&inode1->i_mutex);
 
 	if (inode2 && inode2 != inode1)
 		mutex_unlock(&inode2->i_mutex);
+
+out:
+	return ret;
 }
 
 /**
@@ -1146,21 +1204,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
 	ext4_lblk_t rest_blocks;
 	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	int ret1, ret2, depth, last_extent = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 	int data_offset_in_page;
 	int block_len_in_page;
 	int uninit;
 
 	/* protect orig and donor against a truncate */
-	mext_inode_double_lock(orig_inode, donor_inode);
+	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+	if (ret1 < 0)
+		return ret1;
 
 	mext_double_down_read(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
 					donor_start, &len, *moved_len);
 	mext_double_up_read(orig_inode, donor_inode);
-	if (ret)
+	if (ret1)
 		goto out2;
 
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1168,19 +1228,19 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (file_end < block_end)
 		len -= block_end - file_end;
 
-	ret = get_ext_path(orig_inode, block_start, &orig_path);
+	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
 	if (orig_path == NULL)
 		goto out2;
 
 	/* Get path structure to check the hole */
-	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
 	if (holecheck_path == NULL)
 		goto out;
 
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
 	if (ext_cur == NULL) {
-		ret = -EINVAL;
+		ret1 = -EINVAL;
 		goto out;
 	}
 
@@ -1193,13 +1253,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		last_extent = mext_next_extent(orig_inode,
 					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			goto out;
 		}
 		last_extent = mext_next_extent(orig_inode, orig_path,
 							&ext_dummy);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			goto out;
 		}
 	}
@@ -1209,7 +1269,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
 		ext4_debug("ext4 move extent: The specified range of file "
 							"may be the hole\n");
-		ret = -EINVAL;
+		ret1 = -EINVAL;
 		goto out;
 	}
 
@@ -1229,7 +1289,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		last_extent = mext_next_extent(orig_inode, holecheck_path,
 						&ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			break;
 		}
 		add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1281,16 +1341,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret = move_extent_per_page(o_filp, donor_inode,
+			ret1 = move_extent_per_page(o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
 						block_len_in_page, uninit);
-			if (ret < 0)
+			if (ret1 < 0)
 				goto out;
 			orig_page_offset++;
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
-			BUG_ON(*moved_len > len);
+			if (*moved_len > len) {
+				ext4_error(orig_inode->i_sb, __func__,
+					"We replaced blocks too much! "
+					"sum of replaced: %llu requested: %llu",
+					*moved_len, len);
+				ret1 = -EIO;
+				goto out;
+			}
 
 			data_offset_in_page = 0;
 			rest_blocks -= block_len_in_page;
@@ -1303,7 +1370,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
-		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
 		if (holecheck_path == NULL)
 			break;
 		depth = holecheck_path->p_depth;
@@ -1311,7 +1378,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		ret = get_ext_path(orig_inode, seq_start, &orig_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
 		if (orig_path == NULL)
 			break;
 
@@ -1330,10 +1397,12 @@ out:
 		kfree(holecheck_path);
 	}
 out2:
-	mext_inode_double_unlock(orig_inode, donor_inode);
+	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
-	if (ret)
-		return ret;
+	if (ret1)
+		return ret1;
+	else if (ret2)
+		return ret2;
 
 	return 0;
 }
-- 
cgit v1.2.2


From 347fa6f1c7cb5df2b38d3c9167cfe242ce0cd1da Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:25:07 -0400
Subject: ext4: Add null extent check to ext_get_path

There is the possibility that path structure which is taken
by ext4_ext_find_extent() indicates null extents.
Because during data block exchanging in ext4_move_extents(),
constitution of an extent tree may be changed.
As a solution, the patch adds null extent check
to ext_get_path().

Reported-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2258560e9722..1c509d549137 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -39,7 +39,9 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 	if (IS_ERR(*path)) {
 		ret = PTR_ERR(*path);
 		*path = NULL;
-	}
+	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+		ret = -ENODATA;
+
 	return ret;
 }
 
@@ -316,7 +318,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 
 	if (new_flag) {
 		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -327,7 +329,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	if (end_flag) {
 		err = get_ext_path(orig_inode,
 				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -673,12 +675,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 	/* Get the original extent for the block "orig_off" */
 	err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (orig_path == NULL)
+	if (err)
 		goto out;
 
 	/* Get the donor extent for the head */
 	err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (donor_path == NULL)
+	if (err)
 		goto out;
 	depth = ext_depth(orig_inode);
 	oext = orig_path[depth].p_ext;
@@ -733,7 +735,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 		depth = ext_depth(orig_inode);
 		oext = orig_path[depth].p_ext;
@@ -747,7 +749,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
 		err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (donor_path == NULL)
+		if (err)
 			goto out;
 		depth = ext_depth(donor_inode);
 		dext = donor_path[depth].p_ext;
@@ -1221,7 +1223,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 					donor_start, &len, *moved_len);
 	mext_double_up_read(orig_inode, donor_inode);
 	if (ret1)
-		goto out2;
+		goto out;
 
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
 	block_end = block_start + len - 1;
@@ -1229,20 +1231,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		len -= block_end - file_end;
 
 	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
-	if (orig_path == NULL)
-		goto out2;
+	if (ret1)
+		goto out;
 
 	/* Get path structure to check the hole */
 	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (holecheck_path == NULL)
+	if (ret1)
 		goto out;
 
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
-	if (ext_cur == NULL) {
-		ret1 = -EINVAL;
-		goto out;
-	}
 
 	/*
 	 * Get proper extent whose ee_block is beyond block_start
@@ -1371,7 +1369,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (holecheck_path == NULL)
+		if (ret1)
 			break;
 		depth = holecheck_path->p_depth;
 
@@ -1379,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (orig_path == NULL)
+		if (ret1)
 			break;
 
 		ext_cur = holecheck_path[depth].p_ext;
@@ -1396,7 +1394,7 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-out2:
+
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)
-- 
cgit v1.2.2


From c40ce3c9ea97425a12d7e44031a98fe50add6fc1 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:25:39 -0400
Subject: ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT

If logical block offset of original file which is passed to
EXT4_IOC_MOVE_EXT is different from donor file's,
a calculation error occurs in ext4_calc_swap_extents(),
therefore wrong block is exchanged between original file and donor file.
As a result, we hit ext4_error() in check_block_validity().
To detect the logical offset difference in EXT4_IOC_MOVE_EXT,
add checks to mext_calc_swap_extents() and handle it as error,
since data exchange must be done between the same blocks in EXT4_IOC_MOVE_EXT.

Reported-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1c509d549137..1f027b1ec430 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -597,8 +597,10 @@ out:
  * @orig_off:		block offset of original inode
  * @donor_off:		block offset of donor inode
  * @max_count:		the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
  */
-static void
+static int
 mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 			      struct ext4_extent *tmp_oext,
 			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -607,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 	ext4_lblk_t diff, orig_diff;
 	struct ext4_extent dext_old, oext_old;
 
+	BUG_ON(orig_off != donor_off);
+
+	/* original and donor extents have to cover the same block offset */
+	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+	    le32_to_cpu(tmp_oext->ee_block) +
+			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+		return -ENODATA;
+
+	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+	    le32_to_cpu(tmp_dext->ee_block) +
+			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+		return -ENODATA;
+
 	dext_old = *tmp_dext;
 	oext_old = *tmp_oext;
 
@@ -634,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 
 	copy_extent_status(&oext_old, tmp_dext);
 	copy_extent_status(&dext_old, tmp_oext);
+
+	return 0;
 }
 
 /**
@@ -690,8 +707,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	dext = donor_path[depth].p_ext;
 	tmp_dext = *dext;
 
-	mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+	err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 				      donor_off, count);
+	if (err)
+		goto out;
 
 	/* Loop for the donor extents */
 	while (1) {
@@ -760,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		}
 		tmp_dext = *dext;
 
-		mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					      donor_off,
-					      count - replaced_count);
+		err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+					   donor_off, count - replaced_count);
+		if (err)
+			goto out;
 	}
 
 out:
@@ -1243,11 +1263,15 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ext_cur = holecheck_path[depth].p_ext;
 
 	/*
-	 * Get proper extent whose ee_block is beyond block_start
-	 * if block_start was within the hole.
+	 * Get proper starting location of block replacement if block_start was
+	 * within the hole.
 	 */
 	if (le32_to_cpu(ext_cur->ee_block) +
 		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		/*
+		 * The hole exists between extents or the tail of
+		 * original file.
+		 */
 		last_extent = mext_next_extent(orig_inode,
 					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
@@ -1260,8 +1284,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			ret1 = last_extent;
 			goto out;
 		}
-	}
-	seq_start = block_start;
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+		/* The hole exists at the beginning of original file. */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	else
+		seq_start = block_start;
 
 	/* No blocks within the specified range. */
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-- 
cgit v1.2.2


From fb0a387dcdcd21aab1b09ee7fd80b7c979bdbbfd Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 16 Sep 2009 14:45:10 -0400
Subject: ext4: limit block allocations for indirect-block files to < 2^32

Today, the ext4 allocator will happily allocate blocks past
2^32 for indirect-block files, which results in the block
numbers getting truncated, and corruption ensues.

This patch limits such allocations to < 2^32, and adds
BUG_ONs if we do get blocks larger than that.

This should address RH Bug 519471, ext4 bitmap allocator
must limit blocks to < 2^32

* ext4_find_goal() is modified to choose a goal < UINT_MAX,
  so that our starting point is in an acceptable range.

* ext4_xattr_block_set() is modified such that the goal block
  is < UINT_MAX, as above.

* ext4_mb_regular_allocator() is modified so that the group
  search does not continue into groups which are too high

* ext4_mb_use_preallocated() has a check that we don't use
  preallocated space which is too far out

* ext4_alloc_blocks() and ext4_xattr_block_set() add some BUG_ONs

No attempt has been made to limit inode locations to < 2^32,
so we may wind up with blocks far from their inodes.  Doing
this much already will lead to some odd ENOSPC issues when the
"lower 32" gets full, and further restricting inodes could
make that even weirder.

For high inodes, choosing a goal of the original, % UINT_MAX,
may be a bit odd, but then we're in an odd situation anyway,
and I don't know of a better heuristic.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  4 ++++
 fs/ext4/inode.c   | 11 ++++++++++-
 fs/ext4/mballoc.c |  9 +++++++++
 fs/ext4/super.c   |  2 ++
 fs/ext4/xattr.c   | 15 +++++++++++++--
 5 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4dc64ed58d26..2a0f75d55fad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -403,6 +403,9 @@ struct ext4_mount_options {
 #endif
 };
 
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
+
 /*
  * Structure of an inode on the disk
  */
@@ -857,6 +860,7 @@ struct ext4_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
 	unsigned long s_overhead_last;  /* Last calculated overhead */
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d04c8428bde2..5a8979259c9a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -562,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *
  *	Normally this function find the preferred place for block allocation,
  *	returns it.
+ *	Because this is only used for non-extent files, we limit the block nr
+ *	to 32 bits.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 				   Indirect *partial)
 {
+	ext4_fsblk_t goal;
+
 	/*
 	 * XXX need to get goal block from mballoc's data structures
 	 */
 
-	return ext4_find_near(inode, partial);
+	goal = ext4_find_near(inode, partial);
+	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+	return goal;
 }
 
 /**
@@ -651,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		if (*err)
 			goto failed_out;
 
+		BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
 		target -= count;
 		/* allocate blocks for indirect blocks */
 		while (index < indirect_blks && count) {
@@ -685,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
+	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
 
 	if (*err && (target == blks)) {
 		/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d23056d375b3..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1965,6 +1965,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
+	/* non-extent files are limited to low blocks/groups */
+	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+		ngroups = sbi->s_blockfile_groups;
+
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
 
 	/* first, try the goal */
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
 			continue;
 
+		/* non-extent files can't have physical blocks past 2^32 */
+		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+			pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+			continue;
+
 		/* found preallocated blocks, use them */
 		spin_lock(&pa->pa_lock);
 		if (pa->pa_deleted == 0 && pa->pa_free) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index af95dd8ba54b..a6b1ab734728 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2616,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 	sbi->s_groups_count = blocks_count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+			ext4_fsblk_t goal, block;
+
+			goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+			/* non-extent files can't have physical blocks past 2^32 */
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+			block = ext4_new_meta_blocks(handle, inode,
 						  goal, NULL, &error);
 			if (error)
 				goto cleanup;
+
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
 			ea_idebug(inode, "creating block %d", block);
 
 			new_bh = sb_getblk(sb, block);
-- 
cgit v1.2.2


From 1b9c12f44c1eb614fd3b8822bfe8f1f5d8e53737 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 17 Sep 2009 08:32:22 -0400
Subject: ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags

EXT4_EXT_MIGRATE is only intended to be used for an in-memory flag,
and the hex value assigned to it collides with FS_DIRECTIO_FL (which
is also stored in i_flags).  There's no reason for the
EXT4_EXT_MIGRATE bit to be stored in i_flags, so we switch it to use
i_state instead.

Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/inode.c   |  6 ++----
 fs/ext4/migrate.c | 20 ++++++++++----------
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2a0f75d55fad..84e7f1d00a83 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -268,7 +268,6 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE		0x00100000 /* Inode is migrating */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
@@ -306,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 #define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a8979259c9a..a5b4ce40cc66 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1255,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 			 * i_data's format changing.  Force the migrate
 			 * to fail by clearing migrate flags
 			 */
-			EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 		}
 	}
 
@@ -4596,8 +4595,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-	/* clear the migrate flag in the raw_inode */
-	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 05361ad5b80a..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
-	 * if EXT4_EXT_MIGRATE is cleared a block allocation
+	 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * Even though we take i_mutex we can still cause block allocation
-	 * via mmap write to holes. If we have allocated new blocks we fail
-	 * migrate.  New block allocation will clear EXT4_EXT_MIGRATE flag.
-	 * The flag is updated with i_data_sem held to prevent racing with
-	 * block allocation.
+	 * Even though we take i_mutex we can still cause block
+	 * allocation via mmap write to holes. If we have allocated
+	 * new blocks we fail migrate.  New block allocation will
+	 * clear EXT4_STATE_EXT_MIGRATE flag.  The flag is updated
+	 * with i_data_sem held to prevent racing with block
+	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
-- 
cgit v1.2.2


From fb40ba0d98968bc3454731360363d725b4f1064c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 16 Sep 2009 19:30:40 -0400
Subject: ext4: Add a tracepoint for ext4_alloc_da_blocks()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5b4ce40cc66..9887a0c562d5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3128,6 +3128,8 @@ out:
  */
 int ext4_alloc_da_blocks(struct inode *inode)
 {
+	trace_ext4_alloc_da_blocks(inode);
+
 	if (!EXT4_I(inode)->i_reserved_data_blocks &&
 	    !EXT4_I(inode)->i_reserved_meta_blocks)
 		return 0;
-- 
cgit v1.2.2


From 5534fb5bb35a62a94e0bd1fa2421f7fb6e894f10 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 17 Sep 2009 09:34:16 -0400
Subject: ext4: Fix the alloc on close after a truncate hueristic

In an attempt to avoid doing an unneeded flush after opening a
(previously non-existent) file with O_CREAT|O_TRUNC, the code only
triggered the hueristic if ei->disksize was non-zero.  Turns out that
the VFS doesn't call ->truncate() if the file doesn't exist, and
ei->disksize is always zero even if the file previously existed.  So
remove the test, since it isn't necessary and in fact disabled the
hueristic.

Thanks to Clemens Eisserer that he was seeing problems with files
written using kwrite and eclipse after sudden crashes caused by a
buggy Intel video driver.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9887a0c562d5..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3973,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (ei->i_disksize && inode->i_size == 0 &&
-	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-- 
cgit v1.2.2


From 0a80e9867db154966b2a771042e10452ac110e1e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 17 Sep 2009 11:55:58 -0400
Subject: ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK

There's no reason to redefine the maximum allowable offset
in an extent-based file just for defrag;
EXT_MAX_BLOCK already does this.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h         |  1 -
 fs/ext4/ext4_extents.h |  1 +
 fs/ext4/move_extent.c  | 12 ++++++------
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 84e7f1d00a83..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -476,7 +476,6 @@ struct move_extent {
 	__u64 len;		/* block length to be moved */
 	__u64 moved_len;	/* moved block length */
 };
-#define MAX_DEFRAG_SIZE         ((1UL<<31) - 1)
 
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 9538633879d7..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -137,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 #define EXT_BREAK      1
 #define EXT_REPEAT     2
 
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
 #define EXT_MAX_BLOCK	0xffffffff
 
 /*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1f027b1ec430..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1040,12 +1040,12 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
-	if ((orig_start > MAX_DEFRAG_SIZE) ||
-	    (donor_start > MAX_DEFRAG_SIZE) ||
-	    (*len > MAX_DEFRAG_SIZE) ||
-	    (orig_start + *len > MAX_DEFRAG_SIZE))  {
-		ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
-			"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+	if ((orig_start > EXT_MAX_BLOCK) ||
+	    (donor_start > EXT_MAX_BLOCK) ||
+	    (*len > EXT_MAX_BLOCK) ||
+	    (orig_start + *len > EXT_MAX_BLOCK))  {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-- 
cgit v1.2.2