From a403a0a370946e7dbcda6464a3509089daee54bc Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 26 Jul 2007 15:54:16 +0000
Subject: [CIFS] Fix hang in find_writable_file

Caused by unneeded reopen during reconnect while spinlock held.

Fixes kernel bugzilla bug #7903

Thanks to Lin Feng Shen for testing this, and Amit Arora for
some nice problem determination to narrow this down.

Acked-by: Dave Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES |  5 ++++-
 fs/cifs/README  |  7 +++++++
 fs/cifs/TODO    |  3 +--
 fs/cifs/file.c  | 33 +++++++++++++++++++++++----------
 4 files changed, 35 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 6d84ca2beead..bed6215c0794 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.50
 Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
 done with "serverino" mount option).  Add support for POSIX Unlink
 (helps with certain sharing violation cases when server such as
-Samba supports newer POSIX CIFS Protocol Extensions).
+Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
+mount option to allow disabling the CIFS Unix Extensions for just
+that mount. Fix hang on spinlock in find_writable_file (race when
+reopening file after session crash).
 
 Version 1.49
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 85f1eb14083e..fa096f9c824d 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -444,6 +444,13 @@ A partial list of the supported mount options follows:
  noposixpaths   If CIFS Unix extensions are supported, do not request
 		posix path name support (this may cause servers to
 		reject creatingfile with certain reserved characters).
+ nounix         Disable the CIFS Unix Extensions for this mount (tree
+		connection). This is rarely needed, but it may be useful
+		in order to turn off multiple settings all at once (ie
+		posix acls, posix locks, posix paths, symlink support
+		and retrieving uids/gids/mode from the server) or to
+		work around a bug in server which implement the Unix
+		Extensions.
  nobrl          Do not send byte range lock requests to the server.
 		This is necessary for certain applications that break
 		with cifs style mandatory byte range locks (and most
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index d7bd51575fd6..29d4b2715254 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -82,8 +82,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
 
 v) mount check for unmatched uids
 
-w) Add mount option for Linux extension disable per mount, and partial
-disable per mount (uid off, symlink/fifo/mknod on but what about posix acls?)
+w) Add support for new vfs entry points for setlease and fallocate 
 
 x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
 processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e13592afca9c..894b1f7b299d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1904,6 +1904,25 @@ static int cifs_readpage(struct file *file, struct page *page)
 	return rc;
 }
 
+static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
+{
+	struct cifsFileInfo *open_file;
+
+	read_lock(&GlobalSMBSeslock);
+	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
+		if (open_file->closePend)
+			continue;
+		if (open_file->pfile &&
+		    ((open_file->pfile->f_flags & O_RDWR) ||
+		     (open_file->pfile->f_flags & O_WRONLY))) {
+			read_unlock(&GlobalSMBSeslock);
+			return 1;
+		}
+	}
+	read_unlock(&GlobalSMBSeslock);
+	return 0;
+}
+
 /* We do not want to update the file size from server for inodes
    open for write - to avoid races with writepage extending
    the file - in the future we could consider allowing
@@ -1912,19 +1931,13 @@ static int cifs_readpage(struct file *file, struct page *page)
    page caching in the current Linux kernel design */
 int is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
 {
-	struct cifsFileInfo *open_file = NULL;
-
-	if (cifsInode)
-		open_file =  find_writable_file(cifsInode);
+	if (!cifsInode)
+		return 1;
 
-	if (open_file) {
+	if (is_inode_writable(cifsInode)) {
+		/* This inode is open for write at least once */
 		struct cifs_sb_info *cifs_sb;
 
-		/* there is not actually a write pending so let
-		this handle go free and allow it to
-		be closable if needed */
-		atomic_dec(&open_file->wrtPending);
-
 		cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
 		if ( cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO ) {
 			/* since no page cache to corrupt on directio
-- 
cgit v1.2.2


From 313b0d3d863453cb596ea42aed07f8e1a9b0a7e2 Mon Sep 17 00:00:00 2001
From: Masakazu Mokuno <mokuno@sm.sony.co.jp>
Date: Mon, 9 Jul 2007 19:54:39 +0900
Subject: [PATCH] remove duplicated ioctl entries in compat_ioctl.c

This patch removes some duplicated wireless ioctl entries in the array
'struct ioctl_trans ioctl_start[]' of fs/compat_ioctl.c

These entries are registered twice like:

	COMPATIBLE_IOCTL(SIOCGIWPRIV)

and

	HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl)

Signed-off-by: Masakazu Mokuno <mokuno@sm.sony.co.jp>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 fs/compat_ioctl.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 2bc1428d621c..a6c9078af124 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -3161,12 +3161,9 @@ COMPATIBLE_IOCTL(SIOCSIWSENS)
 COMPATIBLE_IOCTL(SIOCGIWSENS)
 COMPATIBLE_IOCTL(SIOCSIWRANGE)
 COMPATIBLE_IOCTL(SIOCSIWPRIV)
-COMPATIBLE_IOCTL(SIOCGIWPRIV)
 COMPATIBLE_IOCTL(SIOCSIWSTATS)
-COMPATIBLE_IOCTL(SIOCGIWSTATS)
 COMPATIBLE_IOCTL(SIOCSIWAP)
 COMPATIBLE_IOCTL(SIOCGIWAP)
-COMPATIBLE_IOCTL(SIOCSIWSCAN)
 COMPATIBLE_IOCTL(SIOCSIWRATE)
 COMPATIBLE_IOCTL(SIOCGIWRATE)
 COMPATIBLE_IOCTL(SIOCSIWRTS)
-- 
cgit v1.2.2


From 5e11934d13c9a3bcb0cadad6c7a7de5c32660422 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 Jul 2007 12:06:17 -0400
Subject: NFS: Fix put_nfs_open_context

We need to grab the inode->i_lock atomically with the last reference put in
order to remove the open context that is being freed from the
nfsi->open_files list.

Fix by converting the kref to a standard atomic counter and then using
atomic_dec_and_lock()...

Thanks to Arnd Bergmann for pointing out the problem.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bca6cdcb9f0d..71a49c3acabd 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -468,7 +468,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 		ctx->lockowner = current->files;
 		ctx->error = 0;
 		ctx->dir_cookie = 0;
-		kref_init(&ctx->kref);
+		atomic_set(&ctx->count, 1);
 	}
 	return ctx;
 }
@@ -476,21 +476,18 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
 	if (ctx != NULL)
-		kref_get(&ctx->kref);
+		atomic_inc(&ctx->count);
 	return ctx;
 }
 
-static void nfs_free_open_context(struct kref *kref)
+void put_nfs_open_context(struct nfs_open_context *ctx)
 {
-	struct nfs_open_context *ctx = container_of(kref,
-			struct nfs_open_context, kref);
+	struct inode *inode = ctx->path.dentry->d_inode;
 
-	if (!list_empty(&ctx->list)) {
-		struct inode *inode = ctx->path.dentry->d_inode;
-		spin_lock(&inode->i_lock);
-		list_del(&ctx->list);
-		spin_unlock(&inode->i_lock);
-	}
+	if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
+		return;
+	list_del(&ctx->list);
+	spin_unlock(&inode->i_lock);
 	if (ctx->state != NULL)
 		nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
 	if (ctx->cred != NULL)
@@ -500,11 +497,6 @@ static void nfs_free_open_context(struct kref *kref)
 	kfree(ctx);
 }
 
-void put_nfs_open_context(struct nfs_open_context *ctx)
-{
-	kref_put(&ctx->kref, nfs_free_open_context);
-}
-
 /*
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
-- 
cgit v1.2.2


From ba683031fae115d61c6b5f4c675cc27f6e9576d2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 27 Jul 2007 10:23:05 -0400
Subject: NFSv4: Fix a locking regression in nfs4_set_mode_locked()

We don't really need to clear &state->inode_states inside
nfs4_set_mode_locked, and doing so without holding the inode->i_lock would
in any case be a bug...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e9662ba81d86..3e4adf8c8312 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -341,8 +341,6 @@ nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode)
 		else
 			list_move_tail(&state->open_states, &state->owner->so_states);
 	}
-	if (mode == 0)
-		list_del_init(&state->inode_states);
 	state->state = mode;
 }
 
@@ -415,8 +413,7 @@ void nfs4_put_open_state(struct nfs4_state *state)
 	if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
 		return;
 	spin_lock(&inode->i_lock);
-	if (!list_empty(&state->inode_states))
-		list_del(&state->inode_states);
+	list_del(&state->inode_states);
 	list_del(&state->open_states);
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&owner->so_lock);
-- 
cgit v1.2.2


From 45328c354e8ae16b67cb3adb72ab57459f9e5fd6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 26 Jul 2007 17:47:34 -0400
Subject: NFS: Fix NFSv4 open stateid regressions

Do not allow cached open for O_RDONLY or O_WRONLY unless the file has been
previously opened in these modes.

Also Fix the calculation of the mode in nfs4_close_prepare. We should only
issue an OPEN_DOWNGRADE if we're sure that we will still be holding the
correct open modes. This may not be the case if we've been doing delegated
opens.

Finally, there is no need to adjust the open mode bit flags in
nfs4_close_done(): that has already been done in nfs4_close_prepare().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6ca2795ccd9c..62b3ae280310 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -332,11 +332,9 @@ static int can_open_cached(struct nfs4_state *state, int mode)
 	switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
 		case FMODE_READ:
 			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
-			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
 			break;
 		case FMODE_WRITE:
 			ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0;
-			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
 			break;
 		case FMODE_READ|FMODE_WRITE:
 			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
@@ -1260,7 +1258,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid);
 	switch (task->tk_status) {
 		case 0:
-			nfs_set_open_stateid(state, &calldata->res.stateid, calldata->arg.open_flags);
+			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
 			renew_lease(server, calldata->timestamp);
 			break;
 		case -NFS4ERR_STALE_STATEID:
@@ -1286,23 +1284,19 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		.rpc_cred = state->owner->so_cred,
 	};
 	int clear_rd, clear_wr, clear_rdwr;
-	int mode;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
-	mode = FMODE_READ|FMODE_WRITE;
 	clear_rd = clear_wr = clear_rdwr = 0;
 	spin_lock(&state->owner->so_lock);
 	/* Calculate the change in open mode */
 	if (state->n_rdwr == 0) {
 		if (state->n_rdonly == 0) {
-			mode &= ~FMODE_READ;
 			clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		}
 		if (state->n_wronly == 0) {
-			mode &= ~FMODE_WRITE;
 			clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 			clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		}
@@ -1314,9 +1308,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		return;
 	}
 	nfs_fattr_init(calldata->res.fattr);
-	if (mode != 0)
+	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
 		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-	calldata->arg.open_flags = mode;
+		calldata->arg.open_flags = FMODE_READ;
+	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
+		msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
+		calldata->arg.open_flags = FMODE_WRITE;
+	}
 	calldata->timestamp = jiffies;
 	rpc_call_setup(task, &msg, 0);
 }
-- 
cgit v1.2.2


From 905f8d16e32fd48499e3f8b9a2d9f746af3e0949 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 6 Aug 2007 12:18:34 -0400
Subject: NFSv4: Don't call put_rpccred() from an rcu callback

Doing so would require us to introduce bh-safe locks into put_rpccred().
This patch fixes the lockdep complaint reported by Marc Dietrich:

inconsistent {softirq-on-W} -> {in-softirq-W} usage.
swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
 (rpc_credcache_lock){-+..}, at: [<c01dc487>]
_atomic_dec_and_lock+0x17/0x60
{softirq-on-W} state was registered at:
  [<c013e870>] __lock_acquire+0x650/0x1030
  [<c013f2b1>] lock_acquire+0x61/0x80
  [<c02db9ac>] _spin_lock+0x2c/0x40
  [<c01dc487>] _atomic_dec_and_lock+0x17/0x60
  [<dced55fd>] put_rpccred+0x5d/0x100 [sunrpc]
  [<dced56c1>] rpcauth_unbindcred+0x21/0x60 [sunrpc]
  [<dced3fd4>] a0 [sunrpc]
  [<dcecefe0>] rpc_call_sync+0x30/0x40 [sunrpc]
  [<dcedc73b>] rpcb_register+0xdb/0x180 [sunrpc]
  [<dced65b3>] svc_register+0x93/0x160 [sunrpc]
  [<dced6ebe>] __svc_create+0x1ee/0x220 [sunrpc]
  [<dced7053>] svc_create+0x13/0x20 [sunrpc]
  [<dcf6d722>] nfs_callback_up+0x82/0x120 [nfs]
  [<dcf48f36>] nfs_get_client+0x176/0x390 [nfs]
  [<dcf49181>] nfs4_set_client+0x31/0x190 [nfs]
  [<dcf49983>] nfs4_create_server+0x63/0x3b0 [nfs]
  [<dcf52426>] nfs4_get_sb+0x346/0x5b0 [nfs]
  [<c017b444>] vfs_kern_mount+0x94/0x110
  [<c0190a62>] do_mount+0x1f2/0x7d0
  [<c01910a6>] sys_mount+0x66/0xa0
  [<c0104046>] syscall_call+0x7/0xb
  [<ffffffff>] 0xffffffff
irq event stamp: 5277830
hardirqs last  enabled at (5277830): [<c017530a>] kmem_cache_free+0x8a/0xc0
hardirqs last disabled at (5277829): [<c01752d2>] kmem_cache_free+0x52/0xc0
softirqs last  enabled at (5277798): [<c0124173>] __do_softirq+0xa3/0xc0
softirqs last disabled at (5277817): [<c01241d7>] do_softirq+0x47/0x50

other info that might help us debug this:
no locks held by swapper/0.

stack backtrace:
 [<c0104fda>] show_trace_log_lvl+0x1a/0x30
 [<c0105c02>] show_trace+0x12/0x20
 [<c0105d15>] dump_stack+0x15/0x20
 [<c013ccc3>] print_usage_bug+0x153/0x160
 [<c013d8b9>] mark_lock+0x449/0x620
 [<c013e824>] __lock_acquire+0x604/0x1030
 [<c013f2b1>] lock_acquire+0x61/0x80
 [<c02db9ac>] _spin_lock+0x2c/0x40
 [<c01dc487>] _atomic_dec_and_lock+0x17/0x60
 [<dced55fd>] put_rpccred+0x5d/0x100 [sunrpc]
 [<dcf6bf83>] nfs_free_delegation_callback+0x13/0x20 [nfs]
 [<c012f9ea>] __rcu_process_callbacks+0x6a/0x1c0
 [<c012fb52>] rcu_process_callbacks+0x12/0x30
 [<c0124218>] tasklet_action+0x38/0x80
 [<c0124125>] __do_softirq+0x55/0xc0
 [<c01241d7>] do_softirq+0x47/0x50
 [<c0124605>] irq_exit+0x35/0x40
 [<c0112463>] smp_apic_timer_interrupt+0x43/0x80
 [<c0104a77>] apic_timer_interrupt+0x33/0x38
 [<c02690df>] cpuidle_idle_call+0x6f/0x90
 [<c01023c3>] cpu_idle+0x43/0x70
 [<c02d8c27>] rest_init+0x47/0x50
 [<c03bcb6a>] start_kernel+0x22a/0x2b0
 [<00000000>] 0x0
 =======================

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 20ac403469a0..c55a761c22bb 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -20,10 +20,8 @@
 #include "delegation.h"
 #include "internal.h"
 
-static void nfs_free_delegation(struct nfs_delegation *delegation)
+static void nfs_do_free_delegation(struct nfs_delegation *delegation)
 {
-	if (delegation->cred)
-		put_rpccred(delegation->cred);
 	kfree(delegation);
 }
 
@@ -31,7 +29,18 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
 {
 	struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
 
-	nfs_free_delegation(delegation);
+	nfs_do_free_delegation(delegation);
+}
+
+static void nfs_free_delegation(struct nfs_delegation *delegation)
+{
+	struct rpc_cred *cred;
+
+	cred = rcu_dereference(delegation->cred);
+	rcu_assign_pointer(delegation->cred, NULL);
+	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+	if (cred)
+		put_rpccred(cred);
 }
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
@@ -166,7 +175,7 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 	int res = 0;
 
 	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
-	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+	nfs_free_delegation(delegation);
 	return res;
 }
 
@@ -448,7 +457,7 @@ restart:
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
 		if (delegation != NULL)
-			call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+			nfs_free_delegation(delegation);
 		goto restart;
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.2


From 3d39c691ff486142dd9aaeac12f553f4476b7a62 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 7 Aug 2007 15:28:33 -0400
Subject: NFS: Replace flush_scheduled_work with cancel_work_sync() and friends

This will avoid deadlocks of the form:

stack backtrace:
 [<c0104fda>] show_trace_log_lvl+0x1a/0x30
 [<c0105c02>] show_trace+0x12/0x20
 [<c0105d15>] dump_stack+0x15/0x20
 [<c013ee42>] __lock_acquire+0xc22/0x1030
 [<c013f2b1>] lock_acquire+0x61/0x80
 [<c012edd9>] flush_workqueue+0x49/0x70
 [<c012ee0d>] flush_scheduled_work+0xd/0x10
 [<dcf55c0c>] nfs_release_automount_timer+0x2c/0x30 [nfs]
 [<dcf45d8e>] nfs_free_server+0x9e/0xd0 [nfs]
 [<dcf4e626>] nfs_kill_super+0x16/0x20 [nfs]
 [<c017b38d>] deactivate_super+0x7d/0xa0
 [<c018f94b>] mntput_no_expire+0x4b/0x80
 [<c018fd94>] expire_mount_list+0xe4/0x140
 [<c0191219>] mark_mounts_for_expiry+0x99/0xb0
 [<dcf55d1d>] nfs_expire_automounts+0xd/0x40 [nfs]
 [<c012e61b>] run_workqueue+0x12b/0x1e0
 [<c012f05b>] worker_thread+0x9b/0x100
 [<c0131c72>] kthread+0x42/0x70
 [<c0104c0f>] kernel_thread_helper+0x7/0x18
 =======================

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/namespace.c  | 6 ++----
 fs/nfs/nfs4renewd.c | 5 ++---
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7f86e65182e4..aea76d0e5fbd 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -175,10 +175,8 @@ static void nfs_expire_automounts(struct work_struct *work)
 
 void nfs_release_automount_timer(void)
 {
-	if (list_empty(&nfs_automount_list)) {
-		cancel_delayed_work(&nfs_automount_task);
-		flush_scheduled_work();
-	}
+	if (list_empty(&nfs_automount_list))
+		cancel_delayed_work_sync(&nfs_automount_task);
 }
 
 /*
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0505ca124034..3ea352d82eba 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,16 +127,15 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
 void
 nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 {
-	flush_scheduled_work();
+	cancel_delayed_work(&server->nfs_client->cl_renewd);
 }
 
 void
 nfs4_kill_renewd(struct nfs_client *clp)
 {
 	down_read(&clp->cl_sem);
-	cancel_delayed_work(&clp->cl_renewd);
+	cancel_delayed_work_sync(&clp->cl_renewd);
 	up_read(&clp->cl_sem);
-	flush_scheduled_work();
 }
 
 /*
-- 
cgit v1.2.2


From c11e9fafb398411af7558fca913c2fa4a10b1f48 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 20 Jul 2007 11:24:53 -0700
Subject: ocfs2: Restrict inode changes in ocfs2_update_inode_atime()

ocfs2_update_inode_atime() calls ocfs2_mark_inode_dirty() to push changes
from the struct inode into the ocfs2 disk inode. The problem is,
ocfs2_mark_inode_dirty() might change other fields, depending on what
happened to the struct inode. Since we don't always have locking to
serialize changes to other fields (like i_size, etc), just fix things up to
only touch the atime field.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c4034f693e7b..992eb567cb4d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,6 +187,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 
 	mlog_entry_void();
 
@@ -197,11 +198,27 @@ int ocfs2_update_inode_atime(struct inode *inode,
 		goto out;
 	}
 
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
+	 * have i_mutex to guard against concurrent changes to other
+	 * inode fields.
+	 */
 	inode->i_atime = CURRENT_TIME;
-	ret = ocfs2_mark_inode_dirty(handle, inode, bh);
+	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, bh);
 	if (ret < 0)
 		mlog_errno(ret);
 
+out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	mlog_exit(ret);
-- 
cgit v1.2.2


From a00cce356b5592208e761525a48a25902322cce9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 20 Jul 2007 11:28:30 -0700
Subject: ocfs2: use s_maxbytes directly in ocfs2_change_file_space()

There's no need to recalculate things via ocfs2_max_file_offset() as we've
already done that to fill s_maxbytes, so use that instead. We can also
un-export ocfs2_max_file_offset() then.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c  | 2 +-
 fs/ocfs2/super.c | 2 +-
 fs/ocfs2/super.h | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 992eb567cb4d..7e508e2942ca 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1533,7 +1533,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *di_bh = NULL;
 	handle_t *handle;
-	unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
+	unsigned long long max_off = inode->i_sb->s_maxbytes;
 
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 200c7d4790dc..a100b488533e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -319,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
 /* From xfs_super.c:xfs_max_file_offset
  * Copyright (c) 2000-2004 Silicon Graphics, Inc.
  */
-unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
 {
 	unsigned int pagefactor = 1;
 	unsigned int bitshift = BITS_PER_LONG - 1;
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 3b9cb3d0b008..783f5270f2a1 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,6 +45,4 @@ void __ocfs2_abort(struct super_block *sb,
 
 #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
 
-unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
-
 #endif /* OCFS2_SUPER_H */
-- 
cgit v1.2.2


From 7c08d70c69150148c14f02633855f1591219c37c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 20 Jul 2007 11:58:36 -0700
Subject: ocfs2: Fix some casting errors related to file writes

ocfs2_align_clusters_to_page_index() needs to cast the clusters shift to
pgoff_t and ocfs2_file_buffered_write() needs loff_t when calculating
destination start for memcpy.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c  | 2 +-
 fs/ocfs2/ocfs2.h | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7e508e2942ca..b1ae4c754157 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1959,7 +1959,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
 		}
 
 		dst = kmap_atomic(page, KM_USER0);
-		memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
+		memcpy(dst + (pos & (loff_t)(PAGE_CACHE_SIZE - 1)), buf, bytes);
 		kunmap_atomic(dst, KM_USER0);
 		flush_dcache_page(page);
 		ocfs2_put_write_source(user_page);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 5cc90a40b3c5..58307853fb4a 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -494,16 +494,16 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
 /*
  * Find the 1st page index which covers the given clusters.
  */
-static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
 							u32 clusters)
 {
 	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
-	unsigned long index = clusters;
+        pgoff_t index = clusters;
 
 	if (PAGE_CACHE_SHIFT > cbits) {
-		index = clusters >> (PAGE_CACHE_SHIFT - cbits);
+		index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
 	} else if (PAGE_CACHE_SHIFT < cbits) {
-		index = clusters << (cbits - PAGE_CACHE_SHIFT);
+		index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
 	}
 
 	return index;
-- 
cgit v1.2.2


From ce76fd30ce98cdaeb38dca0dfbb3fa6d2801c5ce Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 20 Jul 2007 12:02:14 -0700
Subject: ocfs2: check ia_size limits in setattr

We have to manually check the requested truncate size as the check in
vmtruncate() comes too late for Ocfs2.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1ae4c754157..4ffa715be09c 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1028,6 +1028,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (attr->ia_size > sb->s_maxbytes) {
+			status = -EFBIG;
+			goto bail_unlock;
+		}
+
 		if (i_size_read(inode) > attr->ia_size)
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 		else
-- 
cgit v1.2.2


From 5a25403175b8a945e93fc9c64ae9cf54f5730add Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 20 Jul 2007 12:56:16 -0700
Subject: ocfs2: Fix max offset calculations

ocfs2_max_file_offset() was over-estimating the largest file size for
several cases. This wasn't really a problem before, but now that we support
sparse files, it needs to be more accurate.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 68 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a100b488533e..c3c89e26567c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -316,39 +316,51 @@ static void ocfs2_destroy_inode(struct inode *inode)
 	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
 }
 
-/* From xfs_super.c:xfs_max_file_offset
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.
- */
-static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
+						unsigned int cbits)
 {
-	unsigned int pagefactor = 1;
-	unsigned int bitshift = BITS_PER_LONG - 1;
-
-	/* Figure out maximum filesize, on Linux this can depend on
-	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_prepare_write does this in an [unsigned] long...
-	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
-	 * So, for page sized blocks (4K on 32 bit platforms),
-	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
-	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
-	 * but for smaller blocksizes it is less (bbits = log2 bsize).
-	 * Note1: get_block_t takes a long (implicit cast from above)
-	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
-	 * can optionally convert the [unsigned] long from above into
-	 * an [unsigned] long long.
+	unsigned int bytes = 1 << cbits;
+	unsigned int trim = bytes;
+	unsigned int bitshift = 32;
+
+	/*
+	 * i_size and all block offsets in ocfs2 are always 64 bits
+	 * wide. i_clusters is 32 bits, in cluster-sized units. So on
+	 * 64 bit platforms, cluster size will be the limiting factor.
 	 */
 
 #if BITS_PER_LONG == 32
 # if defined(CONFIG_LBD)
 	BUILD_BUG_ON(sizeof(sector_t) != 8);
-	pagefactor = PAGE_CACHE_SIZE;
-	bitshift = BITS_PER_LONG;
+	/*
+	 * We might be limited by page cache size.
+	 */
+	if (bytes > PAGE_CACHE_SIZE) {
+		bytes = PAGE_CACHE_SIZE;
+		trim = 1;
+		/*
+		 * Shift by 31 here so that we don't get larger than
+		 * MAX_LFS_FILESIZE
+		 */
+		bitshift = 31;
+	}
 # else
-	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+	/*
+	 * We are limited by the size of sector_t. Use block size, as
+	 * that's what we expose to the VFS.
+	 */
+	bytes = 1 << bbits;
+	trim = 1;
+	bitshift = 31;
 # endif
 #endif
 
-	return (((unsigned long long)pagefactor) << bitshift) - 1;
+	/*
+	 * Trim by a whole cluster when we can actually approach the
+	 * on-disk limits. Otherwise we can overflow i_clusters when
+	 * an extent start is at the max offset.
+	 */
+	return (((unsigned long long)bytes) << bitshift) - trim;
 }
 
 static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
@@ -1259,8 +1271,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 				  int sector_size)
 {
 	int status = 0;
-	int i;
-	struct ocfs2_dinode *di = NULL;
+	int i, cbits, bbits;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 	struct inode *inode = NULL;
 	struct buffer_head *bitmap_bh = NULL;
 	struct ocfs2_journal *journal;
@@ -1281,7 +1293,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_export_op = &ocfs2_export_ops;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
-	sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
+	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
 
 	osb->sb = sb;
 	/* Save off for ocfs2_rw_direct */
@@ -1341,8 +1355,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	di = (struct ocfs2_dinode *)bh->b_data;
-
 	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
 	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
 		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-- 
cgit v1.2.2


From 6a18380e7ddd7d1a0493efe3be6475dd92323364 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 23 Jul 2007 10:01:21 +0200
Subject: [2.6 patch] ocfs2_insert_extent(): remove dead code

This patch removes some now dead code.

Spotted by the Coverity checker.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f5e11f4fa952..4f517665c9a0 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3731,7 +3731,6 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 {
 	int status;
 	struct buffer_head *last_eb_bh = NULL;
-	struct buffer_head *bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
@@ -3783,9 +3782,6 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
-	if (bh)
-		brelse(bh);
-
 	if (last_eb_bh)
 		brelse(last_eb_bh);
 
-- 
cgit v1.2.2


From 480214d71f1972756473415d31953647952400fb Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.musran@oracle.com>
Date: Mon, 6 Aug 2007 15:11:56 -0700
Subject: ocfs2: Fix rename/extend race

If one process is extending a file while another is renaming it, there
exists a window when rename could flush the old inode's stale i_size to
disk. This patch recognizes the fact that rename is only updating the old
inode's ctime, so it ensures only that value is flushed to disk.

Signed-off-by: Sunil Mushran <sunil.musran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/namei.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d430fdab16e9..701e6d04ed5d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1080,6 +1080,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
 						    // this is the 1st dirent bh
 	nlink_t old_dir_nlink = old_dir->i_nlink;
+	struct ocfs2_dinode *old_di;
 
 	/* At some point it might be nice to break this function up a
 	 * bit. */
@@ -1354,7 +1355,20 @@ static int ocfs2_rename(struct inode *old_dir,
 
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
-	ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh);
+
+	status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status >= 0) {
+		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
+
+		old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
+		old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+
+		status = ocfs2_journal_dirty(handle, old_inode_bh);
+		if (status < 0)
+			mlog_errno(status);
+	} else
+		mlog_errno(status);
 
 	/* now that the name has been added to new_dir, remove the old name */
 	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
-- 
cgit v1.2.2


From ce17204ae633001ef41318d487282730e96b9522 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 30 Jul 2007 11:02:50 -0700
Subject: ocfs2: Retry sendpage() if it returns EAGAIN

Instead of treating EAGAIN, returned from sendpage(), as an error, this
patch retries the operation.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f0bdfd944c44..685c18065c82 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -854,17 +854,25 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 	ssize_t ret;
 
-
-	mutex_lock(&sc->sc_send_lock);
-	ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
-					 virt_to_page(kmalloced_virt),
-					 (long)kmalloced_virt & ~PAGE_MASK,
-					 size, MSG_DONTWAIT);
-	mutex_unlock(&sc->sc_send_lock);
-	if (ret != size) {
+	while (1) {
+		mutex_lock(&sc->sc_send_lock);
+		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+						 virt_to_page(kmalloced_virt),
+						 (long)kmalloced_virt & ~PAGE_MASK,
+						 size, MSG_DONTWAIT);
+		mutex_unlock(&sc->sc_send_lock);
+		if (ret == size)
+			break;
+		if (ret == (ssize_t)-EAGAIN) {
+			mlog(0, "sendpage of size %zu to " SC_NODEF_FMT
+			     " returned EAGAIN\n", size, SC_NODEF_ARGS(sc));
+			cond_resched();
+			continue;
+		}
 		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
 		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
 		o2net_ensure_shutdown(nn, sc, 0);
+		break;
 	}
 }
 
-- 
cgit v1.2.2


From e0dceaf0a4b8c55076a4dbcba7ac8b05755f5cc6 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 9 Aug 2007 16:52:30 -0700
Subject: ocfs2: set non-default s_time_gran during mount

We need to manually set this to '1' during mount, otherwise inode_setattr()
will chop off the nanosecond portion of our timestamps.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c3c89e26567c..f2fc9a795deb 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1291,6 +1291,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
 	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
-- 
cgit v1.2.2


From 6a648fa72161d1f6468dabd96c5d3c0db04f598a Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Fri, 10 Aug 2007 13:00:44 -0700
Subject: direct-io: fix error-path crashes

Need to initialize map_bh.b_state to zero.  Otherwise, in case of a faulty
user-buffer its possible to go into dio_zero_block() and submit a page by
mistake - since it checks for buffer_new().

http://marc.info/?l=linux-kernel&m=118551339032528&w=2

akpm: Linus had a (better) patch to just do a kzalloc() in there, but it got
lost.  Probably this version is better for -stable anwyay.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Acked-by: Joe Jin <joe.jin@oracle.com>
Acked-by: Zach Brown <zach.brown@oracle.com>
Cc: gurudas pai <gurudas.pai@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 52bb2638f7ab..6874785bb65a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -974,6 +974,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->get_block = get_block;
 	dio->end_io = end_io;
 	dio->map_bh.b_private = NULL;
+	dio->map_bh.b_state = 0;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;
 
-- 
cgit v1.2.2


From 202a21d6914369c1362f1ab50f0cbe92b9c38718 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 10 Aug 2007 13:00:51 -0700
Subject: eCryptfs: fix lookup error for special files

When ecryptfs_lookup() is called against special files, eCryptfs generates
the following errors because it tries to treat them like regular eCryptfs
files.

Error opening lower file for lower_dentry [0xffff810233a6f150], lower_mnt [0xffff810235bb4c80], and flags
[0x8000]
Error opening lower_file to read header region
Error attempting to read the [user.ecryptfs] xattr from the lower file; return value = [-95]
Valid metadata not found in header region or xattr region; treating file as unencrypted

For instance, the problem can be reproduced by the steps below.

  # mkdir /root/crypt /mnt/crypt
  # mount -t ecryptfs /root/crypt /mnt/crypt
  # mknod /mnt/crypt/c0 c 0 0
  # umount /mnt/crypt
  # mount -t ecryptfs /root/crypt /mnt/crypt
  # ls -l /mnt/crypt

This patch fixes it by adding a check similar to directories and
symlinks.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0a50942b4378..131954b3fb98 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -353,6 +353,10 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
 		goto out;
 	}
+	if (special_file(lower_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
+		goto out;
+	}
 	if (!nd) {
 		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
 				"as we *think* we are about to unlink\n");
-- 
cgit v1.2.2


From a75de1b3799f8933d6d2b64bdf31194368ec98ab Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 10 Aug 2007 13:00:56 -0700
Subject: eCryptfs: fix error handling in ecryptfs_init

ecryptfs_init() exits without doing any cleanup jobs if
ecryptfs_init_messaging() fails.  In that case, eCryptfs leaves
sysfs entries, leaks memory, and causes an invalid page fault.
This patch fixes the problem.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/main.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e557a6766927..a98497264fe8 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -813,6 +813,15 @@ out:
 	return rc;
 }
 
+static void do_sysfs_unregistration(void)
+{
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
+			  &sysfs_attr_version.attr);
+	sysfs_remove_file(&ecryptfs_subsys.kobj,
+			  &sysfs_attr_version_str.attr);
+	subsystem_unregister(&ecryptfs_subsys);
+}
+
 static int __init ecryptfs_init(void)
 {
 	int rc;
@@ -851,6 +860,9 @@ static int __init ecryptfs_init(void)
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Failure occured while attempting to "
 				"initialize the eCryptfs netlink socket\n");
+		do_sysfs_unregistration();
+		unregister_filesystem(&ecryptfs_fs_type);
+		ecryptfs_free_kmem_caches();
 	}
 out:
 	return rc;
@@ -858,11 +870,7 @@ out:
 
 static void __exit ecryptfs_exit(void)
 {
-	sysfs_remove_file(&ecryptfs_subsys.kobj,
-			  &sysfs_attr_version.attr);
-	sysfs_remove_file(&ecryptfs_subsys.kobj,
-			  &sysfs_attr_version_str.attr);
-	subsystem_unregister(&ecryptfs_subsys);
+	do_sysfs_unregistration();
 	ecryptfs_release_messaging(ecryptfs_transport);
 	unregister_filesystem(&ecryptfs_fs_type);
 	ecryptfs_free_kmem_caches();
-- 
cgit v1.2.2


From 25720c2d73058f4f929f16093f60817ed52a285c Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Wed, 11 Jul 2007 13:39:43 +0100
Subject: [DLM] Clear othercon pointers when a connection is closed

This patch clears the othercon pointer and frees the memory when a connnection
is closed. This could cause a small memory leak when nodes leave the cluster.

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lowcomms.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index dd362739d291..d15fd5f9f5c5 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -327,6 +327,8 @@ static void close_connection(struct connection *con, bool and_other)
 	if (con->othercon && and_other) {
 		/* Will only re-enter once. */
 		close_connection(con->othercon, false);
+		kmem_cache_free(con_cache, con->othercon);
+		con->othercon = NULL;
 	}
 	if (con->rx_page) {
 		__free_page(con->rx_page);
-- 
cgit v1.2.2


From 41684f9547455b395ffd65e5b7961067d20a2872 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 13 Jul 2007 14:49:06 -0500
Subject: [DLM] fix NULL ls usage

Fix regression in recent patch "[DLM] variable allocation" which
attempts to dereference an "ls" struct when it's NULL.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/rcom.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index e3a1527cbdbe..188b91c027e4 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -386,8 +386,7 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	dlm_recover_process_copy(ls, rc_in);
 }
 
-static int send_ls_not_ready(struct dlm_ls *ls, int nodeid,
-			     struct dlm_rcom *rc_in)
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
 {
 	struct dlm_rcom *rc;
 	struct rcom_config *rf;
@@ -395,7 +394,7 @@ static int send_ls_not_ready(struct dlm_ls *ls, int nodeid,
 	char *mb;
 	int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
 
-	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, ls->ls_allocation, &mb);
+	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
 	if (!mh)
 		return -ENOBUFS;
 	memset(mb, 0, mb_len);
@@ -465,7 +464,7 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
 		log_print("lockspace %x from %d type %x not found",
 			  hd->h_lockspace, nodeid, rc->rc_type);
 		if (rc->rc_type == DLM_RCOM_STATUS)
-			send_ls_not_ready(ls, nodeid, rc);
+			send_ls_not_ready(nodeid, rc);
 		return;
 	}
 
-- 
cgit v1.2.2


From 01c8cab25863de007fe8c598d0033919ea8ae65e Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Tue, 17 Jul 2007 16:53:15 +0100
Subject: [DLM] zero unused parts of sockaddr_storage

When we build a sockaddr_storage for an IP address, clear the unused parts as
they could be used for node comparisons.

I have seen this occasionally make sctp connections fail.

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lowcomms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d15fd5f9f5c5..631bc431e6af 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -313,6 +313,7 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 		in6_addr->sin6_port = cpu_to_be16(port);
 		*addr_len = sizeof(struct sockaddr_in6);
 	}
+	memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
 }
 
 /* Close a remote connection and tidy up */
-- 
cgit v1.2.2


From 1a2bf2eefb63a267aea7f3f80d6ac59160e20810 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Thu, 19 Jul 2007 00:27:43 +0200
Subject: [DLM] Fix memory leak in dlm_add_member() when dlm_node_weight()
 returns less than zero

There's a memory leak in fs/dlm/member.c::dlm_add_member().

If "dlm_node_weight(ls->ls_name, nodeid)" returns < 0, then
we'll return without freeing the memory allocated to the (at
that point yet unused) 'memb'.
This patch frees the allocated memory in that case and thus
avoids the leak.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/member.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 073599dced2a..d09977528f69 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -56,8 +56,10 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
 		return -ENOMEM;
 
 	w = dlm_node_weight(ls->ls_name, nodeid);
-	if (w < 0)
+	if (w < 0) {
+		kfree(memb);
 		return w;
+	}
 
 	memb->nodeid = nodeid;
 	memb->weight = w;
-- 
cgit v1.2.2


From 9e5f2825a8b721360b291f14f42cd7a25781156b Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Thu, 2 Aug 2007 14:58:14 +0100
Subject: [DLM] More othercon fixes

The last patch to clean out 'othercon' structures only fixed half the problem.
The attached addresses the other situations too, and fixes bz#238490

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lowcomms.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 631bc431e6af..9e9d2e82f40f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -328,15 +328,24 @@ static void close_connection(struct connection *con, bool and_other)
 	if (con->othercon && and_other) {
 		/* Will only re-enter once. */
 		close_connection(con->othercon, false);
-		kmem_cache_free(con_cache, con->othercon);
-		con->othercon = NULL;
 	}
 	if (con->rx_page) {
 		__free_page(con->rx_page);
 		con->rx_page = NULL;
 	}
-	con->retries = 0;
-	mutex_unlock(&con->sock_mutex);
+
+	/* If we are an 'othercon' then NULL the pointer to us
+	   from the parent and tidy ourself up */
+	if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+		struct connection *parent = __nodeid2con(con->nodeid, 0);
+		parent->othercon = NULL;
+		kmem_cache_free(con_cache, con);
+	}
+	else {
+		/* Parent connections get reused */
+		con->retries = 0;
+		mutex_unlock(&con->sock_mutex);
+	}
 }
 
 /* We only send shutdown messages to nodes that are not part of the cluster */
@@ -634,7 +643,7 @@ out_resched:
 
 out_close:
 	mutex_unlock(&con->sock_mutex);
-	if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
+	if (ret != -EAGAIN) {
 		close_connection(con, false);
 		/* Reconnect when there is something to send */
 	}
@@ -1125,8 +1134,6 @@ static int tcp_listen_for_all(void)
 
 	log_print("Using TCP for communications");
 
-	set_bit(CF_IS_OTHERCON, &con->flags);
-
 	sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
 	if (sock) {
 		add_sock(sock, con);
@@ -1410,7 +1417,7 @@ void dlm_lowcomms_stop(void)
 	for (i = 0; i <= max_nodeid; i++) {
 		con = __nodeid2con(i, 0);
 		if (con) {
-			con->flags |= 0xFF;
+			con->flags |= 0x0F;
 			if (con->sock)
 				con->sock->sk->sk_user_data = NULL;
 		}
@@ -1426,8 +1433,6 @@ void dlm_lowcomms_stop(void)
 		con = __nodeid2con(i, 0);
 		if (con) {
 			close_connection(con, true);
-			if (con->othercon)
-				kmem_cache_free(con_cache, con->othercon);
 			kmem_cache_free(con_cache, con);
 		}
 	}
-- 
cgit v1.2.2


From 3650925893469ccb03dbcc6a440c5d363350f591 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 7 Aug 2007 09:44:48 -0500
Subject: [DLM] fix basts for granted PR waiting CW

Fix a long standing bug where a blocking callback would be missed
when there's a granted lock in PR mode and waiting locks in both
PR and CW modes (and the PR lock was added to the waiting queue
before the CW lock).  The logic simply compared the numerical values
of the modes to determine if a blocking callback was required, but in
the one case of PR and CW, the lower valued CW mode blocks the higher
valued PR mode.  We just need to add a special check for this PR/CW
case in the tests that decide when a blocking callback is needed.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lock.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b455919c1998..2082daf083d8 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1670,9 +1670,10 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
    with a deadlk here, we'd have to generate something like grant_lock with
    the deadlk error.) */
 
-/* returns the highest requested mode of all blocked conversions */
+/* Returns the highest requested mode of all blocked conversions; sets
+   cw if there's a blocked conversion to DLM_LOCK_CW. */
 
-static int grant_pending_convert(struct dlm_rsb *r, int high)
+static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw)
 {
 	struct dlm_lkb *lkb, *s;
 	int hi, demoted, quit, grant_restart, demote_restart;
@@ -1709,6 +1710,9 @@ static int grant_pending_convert(struct dlm_rsb *r, int high)
 		}
 
 		hi = max_t(int, lkb->lkb_rqmode, hi);
+
+		if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
+			*cw = 1;
 	}
 
 	if (grant_restart)
@@ -1721,29 +1725,52 @@ static int grant_pending_convert(struct dlm_rsb *r, int high)
 	return max_t(int, high, hi);
 }
 
-static int grant_pending_wait(struct dlm_rsb *r, int high)
+static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw)
 {
 	struct dlm_lkb *lkb, *s;
 
 	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
 		if (can_be_granted(r, lkb, 0, NULL))
 			grant_lock_pending(r, lkb);
-                else
+                else {
 			high = max_t(int, lkb->lkb_rqmode, high);
+			if (lkb->lkb_rqmode == DLM_LOCK_CW)
+				*cw = 1;
+		}
 	}
 
 	return high;
 }
 
+/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
+   on either the convert or waiting queue.
+   high is the largest rqmode of all locks blocked on the convert or
+   waiting queue. */
+
+static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
+{
+	if (gr->lkb_grmode == DLM_LOCK_PR && cw) {
+		if (gr->lkb_highbast < DLM_LOCK_EX)
+			return 1;
+		return 0;
+	}
+
+	if (gr->lkb_highbast < high &&
+	    !__dlm_compat_matrix[gr->lkb_grmode+1][high+1])
+		return 1;
+	return 0;
+}
+
 static void grant_pending_locks(struct dlm_rsb *r)
 {
 	struct dlm_lkb *lkb, *s;
 	int high = DLM_LOCK_IV;
+	int cw = 0;
 
 	DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
 
-	high = grant_pending_convert(r, high);
-	high = grant_pending_wait(r, high);
+	high = grant_pending_convert(r, high, &cw);
+	high = grant_pending_wait(r, high, &cw);
 
 	if (high == DLM_LOCK_IV)
 		return;
@@ -1751,27 +1778,41 @@ static void grant_pending_locks(struct dlm_rsb *r)
 	/*
 	 * If there are locks left on the wait/convert queue then send blocking
 	 * ASTs to granted locks based on the largest requested mode (high)
-	 * found above. FIXME: highbast < high comparison not valid for PR/CW.
+	 * found above.
 	 */
 
 	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
-		if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
-		    !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
-			queue_bast(r, lkb, high);
+		if (lkb->lkb_bastaddr && lock_requires_bast(lkb, high, cw)) {
+			if (cw && high == DLM_LOCK_PR)
+				queue_bast(r, lkb, DLM_LOCK_CW);
+			else
+				queue_bast(r, lkb, high);
 			lkb->lkb_highbast = high;
 		}
 	}
 }
 
+static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
+{
+	if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
+	    (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) {
+		if (gr->lkb_highbast < DLM_LOCK_EX)
+			return 1;
+		return 0;
+	}
+
+	if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq))
+		return 1;
+	return 0;
+}
+
 static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
 			    struct dlm_lkb *lkb)
 {
 	struct dlm_lkb *gr;
 
 	list_for_each_entry(gr, head, lkb_statequeue) {
-		if (gr->lkb_bastaddr &&
-		    gr->lkb_highbast < lkb->lkb_rqmode &&
-		    !modes_compat(gr, lkb)) {
+		if (gr->lkb_bastaddr && modes_require_bast(gr, lkb)) {
 			queue_bast(r, gr, lkb->lkb_rqmode);
 			gr->lkb_highbast = lkb->lkb_rqmode;
 		}
@@ -2235,7 +2276,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	   before we try again to grant this one. */
 
 	if (is_demoted(lkb)) {
-		grant_pending_convert(r, DLM_LOCK_IV);
+		grant_pending_convert(r, DLM_LOCK_IV, NULL);
 		if (_can_be_granted(r, lkb, 1)) {
 			grant_lock(r, lkb);
 			queue_cast(r, lkb, 0);
-- 
cgit v1.2.2


From bdcb88562ca90e6cfac13130e147c63aaa4f9e41 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 11 Jul 2007 15:55:23 -0500
Subject: [GFS2] soft lockup detected in databuf_lo_before_commit

This is part 2 of the patch for bug #245832, part 1 of which is already
in the git tree.

The problem was that sdp->sd_log_num_databuf was not always being
protected by the gfs2_log_lock spinlock, but the sd_log_le_databuf
(which it is supposed to reflect) was protected.  That meant there
was a timing window during which gfs2_log_flush called
databuf_lo_before_commit and the count didn't match what was
really on the linked list in that window.  So when it ran out of
items on the linked list, it decremented total_dbuf from 0 to -1 and
thus never left the "while(total_dbuf)" loop.

The solution is to protect the variable sdp->sd_log_num_databuf so
that the value will always match the contents of the linked list,
and therefore the number will never go negative, and therefore, the
loop will be exited properly.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index aff70f0698fd..3b395c41b2f3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -486,8 +486,8 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
 	}
-	sdp->sd_log_num_databuf++;
 	gfs2_log_lock(sdp);
+	sdp->sd_log_num_databuf++;
 	list_add(&le->le_list, &sdp->sd_log_le_databuf);
 	gfs2_log_unlock(sdp);
 }
@@ -523,7 +523,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 	struct buffer_head *bh = NULL,*bh1 = NULL;
 	struct gfs2_log_descriptor *ld;
 	unsigned int limit;
-	unsigned int total_dbuf = sdp->sd_log_num_databuf;
+	unsigned int total_dbuf;
 	unsigned int total_jdata = sdp->sd_log_num_jdata;
 	unsigned int num, n;
 	__be64 *ptr = NULL;
@@ -535,6 +535,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 	 * into the log along with a header
 	 */
 	gfs2_log_lock(sdp);
+	total_dbuf = sdp->sd_log_num_databuf;
 	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
 				       bd_le.le_list);
 	while(total_dbuf) {
@@ -653,6 +654,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 				break;
 		}
 		bh = NULL;
+		BUG_ON(total_dbuf < num);
 		total_dbuf -= num;
 		total_jdata -= num;
 	}
-- 
cgit v1.2.2


From 24c7387333c77b602ece7ecd6a85fc94f8f16d8c Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 12 Jul 2007 16:58:50 -0500
Subject: [GFS2] soft lockup in rgblk_search

This patch seems to fix the problem described in bugzilla bug 246114.
It was written by Steve Whitehouse with some tweaking by me.

The code was looping in the relatively new section of code designed to
search for and reuse unlinked inodes.  In cases where it was finding an
appropriate inode to reuse, it was looping around and finding the same
block over and over because a "<=" check should have been a "<" when
comparing the goal block to the last unlinked block found.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e4e040625153..bb58e69fd977 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -863,16 +863,19 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 	u64 no_addr;
 
 	for(;;) {
+		if (goal >= rgd->rd_data)
+			break;
 		goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
 				    GFS2_BLKST_UNLINKED);
 		if (goal == 0)
-			return 0;
+			break;
 		no_addr = goal + rgd->rd_data0;
-		if (no_addr <= *last_unlinked)
+		goal++;
+		if (no_addr < *last_unlinked)
 			continue;
 		*last_unlinked = no_addr;
 		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					no_addr, -1);
+					  no_addr, -1);
 		if (!IS_ERR(inode))
 			return inode;
 	}
-- 
cgit v1.2.2


From 6eefaf61f664053c1dd6534a994cab3f8bb07263 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 17 Jul 2007 10:26:56 +0100
Subject: [GFS2] Fix incorrect return code in rgrp.c

The following patch fixes a bug where 0 was being used as a return code
to indicate "nothing to do" when in fact 0 was a valid block location
which might be returned by the function.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bb58e69fd977..ce48c4594ec8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -867,7 +867,7 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 			break;
 		goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
 				    GFS2_BLKST_UNLINKED);
-		if (goal == 0)
+		if (goal == BFITNOENT)
 			break;
 		no_addr = goal + rgd->rd_data0;
 		goal++;
@@ -1316,7 +1316,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 				    bi->bi_len, blk, new_state);
 	}
 
-	return (blk == BFITNOENT) ? 0 : (bi->bi_start * GFS2_NBBY) + blk;
+	return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
 }
 
 /**
@@ -1396,6 +1396,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
 		goal = rgd->rd_last_alloc_data;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+	BUG_ON(blk == BFITNOENT);
 	rgd->rd_last_alloc_data = blk;
 
 	block = rgd->rd_data0 + blk;
@@ -1440,6 +1441,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
 		goal = rgd->rd_last_alloc_meta;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+	BUG_ON(blk == BFITNOENT);
 	rgd->rd_last_alloc_meta = blk;
 
 	block = rgd->rd_data0 + blk;
@@ -1481,6 +1483,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
 			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+	BUG_ON(blk == BFITNOENT);
 
 	rgd->rd_last_alloc_meta = blk;
 
-- 
cgit v1.2.2


From a867bb28c1cb49ae86d034d8bd8fe6dbcbb19566 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 17 Jul 2007 10:29:02 +0100
Subject: [GFS2] Fix incorrect error path in prepare_write()

The error path in prepare_write() was incorrect in the (very rare) event
that the transaction fails to start. The following prevents a NULL
pointer dereference,

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ce90032c010e..42a5f58f6fca 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -416,7 +416,7 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
 
 	error = gfs2_trans_begin(sdp, rblocks, 0);
 	if (error)
-		goto out;
+		goto out_trans_fail;
 
 	if (gfs2_is_stuffed(ip)) {
 		if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
@@ -434,6 +434,7 @@ prepare_write:
 out:
 	if (error) {
 		gfs2_trans_end(sdp);
+out_trans_fail:
 		if (alloc_required) {
 			gfs2_inplace_release(ip);
 out_qunlock:
-- 
cgit v1.2.2


From b9af7ca6d37d541fb0ed96355fd5c65501cbbda8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 18 Jul 2007 11:40:06 +0100
Subject: [GFS2] Fix setting of inherit jdata attr

Due to a mix up between the jdata attribute and inherit jdata attribute
it has not been possible to set the inherit jdata attribute on
directories. This is now fixed and the ioctl will report the inherit
jdata attribute for directories rather than the jdata attribute as it
did previously. This stems from our need to have the one bit in the
ioctl attr flags mean two different things according to whether the
underlying inode is a directory or not.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 773421130116..94d76ace0b95 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -177,8 +177,8 @@ static const u32 fsflags_to_gfs2[32] = {
 	[5] = GFS2_DIF_APPENDONLY,
 	[7] = GFS2_DIF_NOATIME,
 	[12] = GFS2_DIF_EXHASH,
-	[14] = GFS2_DIF_JDATA,
-	[20] = GFS2_DIF_DIRECTIO,
+	[14] = GFS2_DIF_INHERIT_JDATA,
+	[20] = GFS2_DIF_INHERIT_DIRECTIO,
 };
 
 static const u32 gfs2_to_fsflags[32] = {
@@ -187,8 +187,6 @@ static const u32 gfs2_to_fsflags[32] = {
 	[gfs2fl_AppendOnly] = FS_APPEND_FL,
 	[gfs2fl_NoAtime] = FS_NOATIME_FL,
 	[gfs2fl_ExHash] = FS_INDEX_FL,
-	[gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
-	[gfs2fl_Directio] = FS_DIRECTIO_FL,
 	[gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
 	[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
 };
@@ -207,6 +205,12 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 		return error;
 
 	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
+	if (!S_ISDIR(inode->i_mode)) {
+		if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+			fsflags |= FS_JOURNAL_DATA_FL;
+		if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
+			fsflags |= FS_DIRECTIO_FL;
+	}
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
 
@@ -270,13 +274,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if ((new_flags ^ flags) == 0)
 		goto out;
 
-	if (S_ISDIR(inode->i_mode)) {
-		if ((new_flags ^ flags) & GFS2_DIF_JDATA)
-			new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
-		if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
-			new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
-	}
-
 	error = -EINVAL;
 	if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
 		goto out;
@@ -315,11 +312,19 @@ out:
 
 static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 fsflags, gfsflags;
 	if (get_user(fsflags, ptr))
 		return -EFAULT;
 	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
-	return do_gfs2_set_flags(filp, gfsflags, ~0);
+	if (!S_ISDIR(inode->i_mode)) {
+		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
+			gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
+		if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
+			gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
+		return do_gfs2_set_flags(filp, gfsflags, ~0);
+	}
+	return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
 }
 
 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-- 
cgit v1.2.2


From d18c4d687dd4625360ee14047d7eb454217719ee Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 19 Jul 2007 16:12:50 +0100
Subject: [GFS2] Revert remounting w/o acl option leaves acls enabled

This reverts commit 569a7b6c2e8965ff4908003b925757703a3d649c. The
code was correct originally. The default setting for ACLs after a
remount should be to be the same as before the remount.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/mount.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 6f006a804db3..4864659555d4 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -82,19 +82,20 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	char *options, *o, *v;
 	int error = 0;
 
-	/*  If someone preloaded options, use those instead  */
-	spin_lock(&gfs2_sys_margs_lock);
-	if (!remount && gfs2_sys_margs) {
-		data = gfs2_sys_margs;
-		gfs2_sys_margs = NULL;
-	}
-	spin_unlock(&gfs2_sys_margs_lock);
+	if (!remount) {
+		/*  If someone preloaded options, use those instead  */
+		spin_lock(&gfs2_sys_margs_lock);
+		if (gfs2_sys_margs) {
+			data = gfs2_sys_margs;
+			gfs2_sys_margs = NULL;
+		}
+		spin_unlock(&gfs2_sys_margs_lock);
 
-	/*  Set some defaults  */
-	memset(args, 0, sizeof(struct gfs2_args));
-	args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
-	args->ar_quota = GFS2_QUOTA_DEFAULT;
-	args->ar_data = GFS2_DATA_DEFAULT;
+		/*  Set some defaults  */
+		args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
+		args->ar_quota = GFS2_QUOTA_DEFAULT;
+		args->ar_data = GFS2_DATA_DEFAULT;
+	}
 
 	/* Split the options into tokens with the "," character and
 	   process them */
-- 
cgit v1.2.2


From 5e6e6232753482dc0024a319b9d8f611d7a80c19 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 18 Aug 2007 00:15:20 +0000
Subject: [CIFS] Check return code on failed alloc

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README | 6 ++++++
 fs/cifs/sess.c | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index fa096f9c824d..b806b11b5560 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -458,6 +458,12 @@ A partial list of the supported mount options follows:
 		byte range locks).
  remount        remount the share (often used to change from ro to rw mounts
 	        or vice versa)
+ servern        Specify the server 's netbios name (RFC1001 name) to use
+		when attempting to setup a session to the server.  This is
+		This is needed for mounting to some older servers (such
+		as OS/2 or Windows 98 and Windows ME) since they do not
+		support a default server name.  A server name can be up
+		to 15 characters long and is usually uppercased.
  sfu            When the CIFS Unix Extensions are not negotiated, attempt to
 		create device files and fifos in a format compatible with
 		Services for Unix (SFU).  In addition retrieve bits 10-12
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2ea027dda215..892be9b4d1f3 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -372,6 +372,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 	/* 2000 big enough to fit max user, domain, NOS name etc. */
 	str_area = kmalloc(2000, GFP_KERNEL);
+	if (str_area == NULL) {
+		cifs_small_buf_release(smb_buf);
+		return -ENOMEM;
+	}
 	bcc_ptr = str_area;
 
 	ses->flags &= ~CIFS_SES_LANMAN;
-- 
cgit v1.2.2


From d2d56c5f51028cb9f3d800882eb6f4cbd3f9099f Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Fri, 17 Aug 2007 21:47:58 +0200
Subject: Reset current->pdeath_signal on SUID binary execution

This fixes a vulnerability in the "parent process death signal"
implementation discoverd by Wojciech Purczynski of COSEINC PTE Ltd.
and iSEC Security Research.

http://marc.info/?l=bugtraq&m=118711306802632&w=2

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 7bdea7937ee8..ce62f7b65f17 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1084,9 +1084,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
-	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
-	    file_permission(bprm->file, MAY_READ) ||
-	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
+	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
+		suid_keys(current);
+		set_dumpable(current->mm, suid_dumpable);
+		current->pdeath_signal = 0;
+	} else if (file_permission(bprm->file, MAY_READ) ||
+			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
 		suid_keys(current);
 		set_dumpable(current->mm, suid_dumpable);
 	}
@@ -1177,8 +1180,10 @@ void compute_creds(struct linux_binprm *bprm)
 {
 	int unsafe;
 
-	if (bprm->e_uid != current->uid)
+	if (bprm->e_uid != current->uid) {
 		suid_keys(current);
+		current->pdeath_signal = 0;
+	}
 	exec_keys(current);
 
 	task_lock(current);
-- 
cgit v1.2.2


From b5748643332bf75274e0b639926d57e86fb133cf Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 20 Aug 2007 11:05:29 +0100
Subject: JFFS2 locking regression fix.

Commit a491486a2087ac3dfc00efb4f838c8d684afaf54 introduced a locking
problem in JFFS2 -- we up() the alloc_sem when we weren't previously
holding it. This leads to all kinds of fun behaviour later.

There was a _reason_ for the
	if (1 /* alternative path needs testing */ ||
which the above-mentioned commit removed :)

Discovered and debugged by Giulio Fedel <giulio.fedel@andorsystems.com>

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/write.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index bc6185933664..664c164aa67c 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -566,6 +566,9 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 		struct jffs2_full_dirent **prev = &dir_f->dents;
 		uint32_t nhash = full_name_hash(name, namelen);
 
+		/* We don't actually want to reserve any space, but we do
+		   want to be holding the alloc_sem when we write to flash */
+		down(&c->alloc_sem);
 		down(&dir_f->sem);
 
 		while ((*prev) && (*prev)->nhash <= nhash) {
-- 
cgit v1.2.2


From 848c4dd5153c7a0de55470ce99a8e13a63b4703f Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 20 Aug 2007 17:12:01 -0700
Subject: dio: zero struct dio with kzalloc instead of manually

This patch uses kzalloc to zero all of struct dio rather than manually
trying to track which fields we rely on being zero.  It passed aio+dio
stress testing and some bug regression testing on ext3.

This patch was introduced by Linus in the conversation that lead up to
Badari's minimal fix to manually zero .map_bh.b_state in commit:

  6a648fa72161d1f6468dabd96c5d3c0db04f598a

It makes the code a bit smaller.  Maybe a couple fewer cachelines to
load, if we're lucky:

   text    data     bss     dec     hex filename
3285925  568506 1304616 5159047  4eb887 vmlinux
3285797  568506 1304616 5158919  4eb807 vmlinux.patched

I was unable to measure a stable difference in the number of cpu cycles
spent in blockdev_direct_IO() when pushing aio+dio 256K reads at
~340MB/s.

So the resulting intent of the patch isn't a performance gain but to
avoid exposing ourselves to the risk of finding another field like
.map_bh.b_state where we rely on zeroing but don't enforce it in the
code.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6874785bb65a..901dc55e9f54 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -958,36 +958,22 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t ret2;
 	size_t bytes;
 
-	dio->bio = NULL;
 	dio->inode = inode;
 	dio->rw = rw;
 	dio->blkbits = blkbits;
 	dio->blkfactor = inode->i_blkbits - blkbits;
-	dio->start_zero_done = 0;
-	dio->size = 0;
 	dio->block_in_file = offset >> blkbits;
-	dio->blocks_available = 0;
-	dio->cur_page = NULL;
 
-	dio->boundary = 0;
-	dio->reap_counter = 0;
 	dio->get_block = get_block;
 	dio->end_io = end_io;
-	dio->map_bh.b_private = NULL;
-	dio->map_bh.b_state = 0;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;
 
-	dio->page_errors = 0;
-	dio->io_error = 0;
-	dio->result = 0;
 	dio->iocb = iocb;
 	dio->i_size = i_size_read(inode);
 
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
-	dio->bio_list = NULL;
-	dio->waiter = NULL;
 
 	/*
 	 * In case of non-aligned buffers, we may need 2 more
@@ -995,8 +981,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	if (unlikely(dio->blkfactor))
 		dio->pages_in_io = 2;
-	else
-		dio->pages_in_io = 0;
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
@@ -1184,7 +1168,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	}
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
-- 
cgit v1.2.2


From df068464169a84a6a66c05d140f43a46d5eb6176 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 22 Aug 2007 14:01:02 -0700
Subject: eCryptfs: fix lookup error for special files

When ecryptfs_lookup() is called against special files, eCryptfs generates
the following errors because it tries to treat them like regular eCryptfs
files.

Error opening lower file for lower_dentry [0xffff810233a6f150], lower_mnt [0xffff810235bb4c80], and flags [0x8000]
Error opening lower_file to read header region
Error attempting to read the [user.ecryptfs] xattr from the lower file; return value = [-95]
Valid metadata not found in header region or xattr region; treating file as unencrypted

For instance, the problem can be reproduced by the steps below.

  # mkdir /root/crypt /mnt/crypt
  # mount -t ecryptfs /root/crypt /mnt/crypt
  # mknod /mnt/crypt/c0 c 0 0
  # umount /mnt/crypt
  # mount -t ecryptfs /root/crypt /mnt/crypt
  # ls -l /mnt/crypt

This patch fixes it by adding a check similar to directories and
symlinks.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 131954b3fb98..5d40ad13ab5c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -357,6 +357,10 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
 		goto out;
 	}
+	if (special_file(lower_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
+		goto out;
+	}
 	if (!nd) {
 		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
 				"as we *think* we are about to unlink\n");
-- 
cgit v1.2.2


From f9ee228bdc82cff8ea1ec00fd952890e00679dd8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 22 Aug 2007 14:01:48 -0700
Subject: signalfd: make it group-wide, fix posix-timers scheduling

With this patch any thread can dequeue its own private signals via signalfd,
even if it was created by another sub-thread.

To do so, we pass "current" to dequeue_signal() if the caller is from the same
thread group. This also fixes the scheduling of posix timers broken by the
previous patch.

If the caller doesn't belong to this thread group, we can't handle __SI_TIMER
case properly anyway. Perhaps we should forbid the cross-process signalfd usage
and convert ctx->tsk to ctx->sighand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Roland McGrath <roland@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     |  9 ++-------
 fs/signalfd.c | 14 ++++++++++----
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index ce62f7b65f17..af4361c927a9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -779,19 +779,13 @@ static int de_thread(struct task_struct *tsk)
 	struct task_struct *leader = NULL;
 	int count;
 
-	/*
-	 * Tell all the sighand listeners that this sighand has
-	 * been detached. The signalfd_detach() function grabs the
-	 * sighand lock, if signal listeners are present on the sighand.
-	 */
-	signalfd_detach(tsk);
-
 	/*
 	 * If we don't share sighandlers, then we aren't sharing anything
 	 * and we can just re-use it all.
 	 */
 	if (atomic_read(&oldsighand->count) <= 1) {
 		BUG_ON(atomic_read(&sig->count) != 1);
+		signalfd_detach(tsk);
 		exit_itimers(sig);
 		return 0;
 	}
@@ -930,6 +924,7 @@ static int de_thread(struct task_struct *tsk)
 	sig->flags = 0;
 
 no_thread_group:
+	signalfd_detach(tsk);
 	exit_itimers(sig);
 	if (leader)
 		release_task(leader);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7b941abbcde0..a8e293d30034 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -56,12 +56,18 @@ static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
 		sighand = lock_task_sighand(lk->tsk, &lk->flags);
 	rcu_read_unlock();
 
-	if (sighand && !ctx->tsk) {
+	if (!sighand)
+		return 0;
+
+	if (!ctx->tsk) {
 		unlock_task_sighand(lk->tsk, &lk->flags);
-		sighand = NULL;
+		return 0;
 	}
 
-	return sighand != NULL;
+	if (lk->tsk->tgid == current->tgid)
+		lk->tsk = current;
+
+	return 1;
 }
 
 static void signalfd_unlock(struct signalfd_lockctx *lk)
@@ -331,7 +337,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 
 		init_waitqueue_head(&ctx->wqh);
 		ctx->sigmask = sigmask;
-		ctx->tsk = current;
+		ctx->tsk = current->group_leader;
 
 		sighand = current->sighand;
 		/*
-- 
cgit v1.2.2


From 1864f7bd58351732593def024e73eca1f75bc352 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Wed, 22 Aug 2007 14:01:54 -0700
Subject: autofs4: deadlock during create

Due to inconsistent locking in the VFS between calls to lookup and
revalidate deadlock can occur in the automounter.

The inconsistency is that the directory inode mutex is held for both lookup
and revalidate calls when called via lookup_hash whereas it is held only
for lookup during a path walk.  Consequently, if the mutex is held during a
call to revalidate autofs4 can't release the mutex to callback the daemon
as it can't know whether it owns the mutex.

This situation happens when a process tries to create a directory within an
automount and a second process also tries to create the same directory
between the lookup and the mkdir.  Since the first process has dropped the
mutex for the daemon callback, the second process takes it during
revalidate leading to deadlock between the autofs daemon and the second
process when the daemon tries to create the mount point directory.

After spending quite a bit of time trying to resolve this on more than one
occassion, using rather complex and ulgy approaches, it turns out that just
delaying the hashing of the dentry until the create operation works fine.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/root.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2d4c8a3e604e..45ff3d63b758 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -587,19 +587,20 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name);
 	if (!unhashed) {
 		/*
-		 * Mark the dentry incomplete, but add it. This is needed so
-		 * that the VFS layer knows about the dentry, and we can count
-		 * on catching any lookups through the revalidate.
-		 *
-		 * Let all the hard work be done by the revalidate function that
-		 * needs to be able to do this anyway..
-		 *
-		 * We need to do this before we release the directory semaphore.
+		 * Mark the dentry incomplete but don't hash it. We do this
+		 * to serialize our inode creation operations (symlink and
+		 * mkdir) which prevents deadlock during the callback to
+		 * the daemon. Subsequent user space lookups for the same
+		 * dentry are placed on the wait queue while the daemon
+		 * itself is allowed passage unresticted so the create
+		 * operation itself can then hash the dentry. Finally,
+		 * we check for the hashed dentry and return the newly
+		 * hashed dentry.
 		 */
 		dentry->d_op = &autofs4_root_dentry_operations;
 
 		dentry->d_fsdata = NULL;
-		d_add(dentry, NULL);
+		d_instantiate(dentry, NULL);
 	} else {
 		struct autofs_info *ino = autofs4_dentry_ino(unhashed);
 		DPRINTK("rehash %p with %p", dentry, unhashed);
@@ -607,15 +608,17 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		 * If we are racing with expire the request might not
 		 * be quite complete but the directory has been removed
 		 * so it must have been successful, so just wait for it.
+		 * We need to ensure the AUTOFS_INF_EXPIRING flag is clear
+		 * before continuing as revalidate may fail when calling
+		 * try_to_fill_dentry (returning EAGAIN) if we don't.
 		 */
-		if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
+		while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
 			DPRINTK("wait for incomplete expire %p name=%.*s",
 				unhashed, unhashed->d_name.len,
 				unhashed->d_name.name);
 			autofs4_wait(sbi, unhashed, NFY_NONE);
 			DPRINTK("request completed");
 		}
-		d_rehash(unhashed);
 		dentry = unhashed;
 	}
 
@@ -658,7 +661,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	 * for all system calls, but it should be OK for the operations
 	 * we permit from an autofs.
 	 */
-	if (dentry->d_inode && d_unhashed(dentry)) {
+	if (!oz_mode && d_unhashed(dentry)) {
 		/*
 		 * A user space application can (and has done in the past)
 		 * remove and re-create this directory during the callback.
@@ -716,7 +719,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	strcpy(cp, symname);
 
 	inode = autofs4_get_inode(dir->i_sb, ino);
-	d_instantiate(dentry, inode);
+	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
 		dentry->d_op = &autofs4_root_dentry_operations;
@@ -844,7 +847,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		return -ENOSPC;
 
 	inode = autofs4_get_inode(dir->i_sb, ino);
-	d_instantiate(dentry, inode);
+	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
 		dentry->d_op = &autofs4_root_dentry_operations;
-- 
cgit v1.2.2


From abd96ecb298675a21c412a29f5de2f80174d5f18 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 22 Aug 2007 14:01:58 -0700
Subject: exec: kill unsafe BUG_ON(sig->count) checks

de_thread:

	if (atomic_read(&oldsighand->count) <= 1)
		BUG_ON(atomic_read(&sig->count) != 1);

This is not safe without the rmb() in between.  The results of two
correctly ordered __exit_signal()->atomic_dec_and_test()'s could be seen
out of order on our CPU.

The same is true for the "thread_group_empty()" case, __unhash_process()'s
changes could be seen before atomic_dec_and_test(&sig->count).

On some platforms (including i386) atomic_read() doesn't provide even the
compiler barrier, in that case these checks are simply racy.

Remove these BUG_ON()'s. Alternatively, we can do something like

	BUG_ON( ({ smp_rmb(); atomic_read(&sig->count) != 1; }) );

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index af4361c927a9..c21a8cc06277 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -784,7 +784,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and we can just re-use it all.
 	 */
 	if (atomic_read(&oldsighand->count) <= 1) {
-		BUG_ON(atomic_read(&sig->count) != 1);
 		signalfd_detach(tsk);
 		exit_itimers(sig);
 		return 0;
@@ -929,8 +928,6 @@ no_thread_group:
 	if (leader)
 		release_task(leader);
 
-	BUG_ON(atomic_read(&sig->count) != 1);
-
 	if (atomic_read(&oldsighand->count) == 1) {
 		/*
 		 * Now that we nuked the rest of the thread group,
-- 
cgit v1.2.2