From 07ff2fa8fcb3d9207f1c16e5acf9086d5731ed8b Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 28 Feb 2006 12:29:51 +1100
Subject: [XFS] Fix a realtime allocator regression introduced by an old iget
 race fix.  Noticed by Roger Willcocks.

SGI-PV: 949821
SGI-Modid: xfs-linux-melb:xfs-kern:25257a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_rtalloc.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 06fc061c50fc..5b413946b1c5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -130,7 +130,8 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Lock the inode.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, ino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		XFS_BMAP_INIT(&flist, &firstblock);
 		/*
@@ -170,8 +171,8 @@ xfs_growfs_rt_alloc(
 			/*
 			 * Lock the bitmap inode.
 			 */
-			if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL,
-					&ip)))
+			if ((error = xfs_trans_iget(mp, tp, ino, 0,
+							XFS_ILOCK_EXCL, &ip)))
 				goto error_exit;
 			/*
 			 * Get a buffer for the block.
@@ -2023,8 +2024,8 @@ xfs_growfs_rt(
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino,
-				XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		ASSERT(ip == mp->m_rbmip);
 		/*
@@ -2037,8 +2038,8 @@ xfs_growfs_rt(
 		/*
 		 * Get the summary inode into the transaction.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
-				0, XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		ASSERT(ip == mp->m_rsumip);
 		/*
@@ -2158,10 +2159,9 @@ xfs_rtallocate_extent(
 	/*
 	 * Lock out other callers by grabbing the bitmap inode lock.
 	 */
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error) {
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
-	}
 	sumbp = NULL;
 	/*
 	 * Allocate by size, or near another block, or exactly at some block.
@@ -2221,10 +2221,9 @@ xfs_rtfree_extent(
 	/*
 	 * Synchronize by locking the bitmap inode.
 	 */
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error) {
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
-	}
 #if defined(__KERNEL__) && defined(DEBUG)
 	/*
 	 * Check to see that this whole range is currently allocated.
@@ -2365,8 +2364,8 @@ xfs_rtpick_extent(
 	__uint64_t	seq;		/* sequence number of file creation */
 	__uint64_t	*seqp;		/* pointer to seqno in inode */
 
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error)
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
 	ASSERT(ip == mp->m_rbmip);
 	seqp = (__uint64_t *)&ip->i_d.di_atime;
-- 
cgit v1.2.2


From dae81d4774ecbeb7d24bb9a6a4db9f9baee54d85 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 28 Feb 2006 12:30:13 +1100
Subject: [XFS] Reduce stack use during quota mounts (caused a panic).  This
 regressed recently via the fix for inherited quota inode attributes.

SGI-PV: 947312
SGI-Modid: xfs-linux-melb:xfs-kern:25318a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 53a00fb217fa..7c0e39dc6189 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -68,6 +68,9 @@ kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
 STATIC kmem_shaker_t	xfs_qm_shaker;
 
+STATIC cred_t	xfs_zerocr;
+STATIC xfs_inode_t	xfs_zeroino;
+
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
@@ -1393,8 +1396,6 @@ xfs_qm_qino_alloc(
 	xfs_trans_t	*tp;
 	int		error;
 	unsigned long	s;
-	cred_t		zerocr;
-	xfs_inode_t	zeroino;
 	int		committed;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
@@ -1406,11 +1407,9 @@ xfs_qm_qino_alloc(
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
-	memset(&zerocr, 0, sizeof(zerocr));
-	memset(&zeroino, 0, sizeof(zeroino));
 
-	if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0,
-				   &zerocr, 0, 1, ip, &committed))) {
+	if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0,
+				   &xfs_zerocr, 0, 1, ip, &committed))) {
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
 				 XFS_TRANS_ABORT);
 		return error;
-- 
cgit v1.2.2


From 2353e8e9b6ae29aad77935f21735a30f5cc419b4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sgi.com>
Date: Tue, 28 Feb 2006 12:30:30 +1100
Subject: [XFS] Don't map non-uptodate buffers in xfs_probe_cluster; also fixes
 obscure corruption case

SGI-PV: 942658
SGI-Modid: xfs-linux-melb:xfs-kern:207119a

Signed-off-by: Eric Sandeen <sandeen@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8f2beec526cf..74d8be87f983 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -540,7 +540,7 @@ xfs_probe_cluster(
 
 	/* First sum forwards in this page */
 	do {
-		if (mapped != buffer_mapped(bh))
+		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
 			return total;
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);
-- 
cgit v1.2.2


From 50322fe7d46b544d5649edb58bdbe5c95dd44b98 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 28 Feb 2006 16:59:03 -0800
Subject: [PATCH] fuse: fix bug in negative lookup

If negative entries (nodeid == 0) were sent in reply to LOOKUP requests,
two bugs could be triggered:

- looking up a negative entry would return -EIO,

- revaildate on an entry which turned negative would send a FORGET
  request with zero nodeid, which would cause an abort() in the
  library.

The above would only happen if the 'negative_timeout=N' option was used,
otherwise lookups reply -ENOENT, which worked correctly.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 21fd59c7bc24..c72a8a97935c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -111,6 +111,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 
 		/* Doesn't hurt to "reset" the validity timeout */
 		fuse_invalidate_entry_cache(entry);
+
+		/* For negative dentries, always do a fresh lookup */
 		if (!inode)
 			return 0;
 
@@ -122,6 +124,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
 		request_send(fc, req);
 		err = req->out.h.error;
+		/* Zero nodeid is same as -ENOENT */
+		if (!err && !outarg.nodeid)
+			err = -ENOENT;
 		if (!err) {
 			struct fuse_inode *fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
@@ -190,8 +195,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
-		     !valid_mode(outarg.attr.mode)))
+	/* Zero nodeid is same as -ENOENT, but with valid timeout */
+	if (!err && outarg.nodeid &&
+	    (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode)))
 		err = -EIO;
 	if (!err && outarg.nodeid) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-- 
cgit v1.2.2


From 0551fbd29e16fccd46e41b7d01bf0f8f39b14212 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 28 Feb 2006 16:59:19 -0800
Subject: [PATCH] Add mm->task_size and fix powerpc vdso

This patch adds mm->task_size to keep track of the task size of a given mm
and uses that to fix the powerpc vdso so that it uses the mm task size to
decide what pages to fault in instead of the current thread flags (which
broke when ptracing).

(akpm: I expect that mm_struct.task_size will become the way in which we
finally sort out the confusion between 32-bit processes and 32-bit mm's.  It
may need tweaks, but at this stage this patch is powerpc-only.)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 0e1c95074d42..0b515ac53134 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 	current->flags &= ~PF_RANDOMIZE;
 	flush_thread();
 
+	/* Set the new mm task size. We have to do that late because it may
+	 * depend on TIF_32BIT which is only updated in flush_thread() on
+	 * some architectures like powerpc
+	 */
+	current->mm->task_size = TASK_SIZE;
+
 	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
 	    file_permission(bprm->file, MAY_READ) ||
 	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
-- 
cgit v1.2.2


From 6b7a6c94c9c15b2664b568ead83e6b3aaf60d65c Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 21 Feb 2006 11:57:30 -0500
Subject: [PATCH] ocfs2: fix -Wformat warnings when building UML on x86-64

 The check to determine which format string is appopriate for u64 and
 friends works in most cases, but UML on x86_64 doesn't define CONFIG_X86_64,
 so it results in screen fulls of compile-time warnings.

 This patch fixes it to handle that case.

 fs/ocfs2/cluster/masklog.h |    2 +-
 1 files changed, 1 insertion(+), 1 deletion(-)

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/masklog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index e8c56a3d9c64..2cadc3009c83 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -256,7 +256,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 	}								\
 } while (0)
 
-#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT))
 #define MLFi64 "lld"
 #define MLFu64 "llu"
 #define MLFx64 "llx"
-- 
cgit v1.2.2


From d3178bcdd41b050e221337d7f5e30b3c58d4015a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 24 Feb 2006 17:23:36 -0800
Subject: [PATCH] ocfs2: remove pointless max journal size limit

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ocfs2_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index dfb8a5bedfc8..c5b1ac547c15 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -138,7 +138,6 @@
 
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
-#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
 
 struct ocfs2_system_inode_info {
 	char	*si_name;
-- 
cgit v1.2.2


From d267a56c883b350a2fa80f1daf4636809e3f8e67 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 23 Feb 2006 13:23:39 -0800
Subject: [PATCH] ocfs2: remove unused code

Remove some #ifdef'd out code which was inadvertantly introduced in our
initial merge.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c  | 51 +--------------------------------------------------
 fs/ocfs2/ocfs2.h |  3 ---
 2 files changed, 1 insertion(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1715bc90e705..8a4048b55fdc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -933,9 +933,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	loff_t newsize, saved_pos;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -951,14 +948,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		return -EIO;
 	}
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	/* ugh, work around some applications which open everything O_DIRECT +
-	 * O_APPEND and really don't mean to use O_DIRECT. */
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 
-		filp->f_flags &= ~O_DIRECT;
-#endif
-
 	mutex_lock(&inode->i_mutex);
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
 	if (filp->f_flags & O_DIRECT) {
@@ -1079,27 +1068,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    filp->f_flags & O_DIRECT) {
-		unsigned int saved_flags = filp->f_flags;
-		int sector_size = 1 << osb->s_sectsize_bits;
-
-		if ((saved_pos & (sector_size - 1)) ||
-		    (count & (sector_size - 1)) ||
-		    ((unsigned long)buf & (sector_size - 1))) {
-			filp->f_flags |= O_SYNC;
-			filp->f_flags &= ~O_DIRECT;
-		}
-
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-		filp->f_flags = saved_flags;
-	} else
-#endif
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
+	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -1140,9 +1109,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	int ret = 0, rw_level = -1, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -1155,21 +1121,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		goto bail;
 	}
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		if (filp->f_flags & O_DIRECT) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((pos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1)) ||
-			    (i_size_read(inode) & (sector_size -1))) {
-				filp->f_flags &= ~O_DIRECT;
-			}
-		}
-	}
-#endif
-
 	/* 
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
 	 * need locks to protect pending reads from racing with truncate.
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8d8e4779df92..19360e3d842e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -174,9 +174,6 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
-#endif
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
-- 
cgit v1.2.2


From 362342f68e331f080d0438f08af1e2c570b0b5fe Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 21 Feb 2006 16:46:33 -0800
Subject: [PATCH] ocfs2: remove non existing function prototypes

Remove some prototypes from tcp.h for functions which have long been gone.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index a6f4585501c8..616ff2b8434a 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -85,13 +85,10 @@ enum {
 	O2NET_DRIVER_READY,
 };
 
-int o2net_init_tcp_sock(struct inode *inode);
 int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
 		       u8 target_node, int *status);
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
 			   size_t veclen, u8 target_node, int *status);
-int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
-			    struct inode *group);
 
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 			   o2net_msg_handler_func *func, void *data,
@@ -107,7 +104,5 @@ void o2net_disconnect_node(struct o2nm_node *node);
 
 int o2net_init(void);
 void o2net_exit(void);
-int o2net_proc_init(struct proc_dir_entry *parent);
-void o2net_proc_exit(struct proc_dir_entry *parent);
 
 #endif /* O2CLUSTER_TCP_H */
-- 
cgit v1.2.2


From 895928b8380cc697ac56e9732cedf549c0a4f79c Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 21 Feb 2006 16:54:00 -0800
Subject: [PATCH] ocfs2: complete failure recovery for nodemanager init

 This patch finishes cleaning up the node manager allocations if it fails
 to initialize.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/nodemanager.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cf7828f23361..e1fceb8aa32d 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -756,7 +756,7 @@ static int __init init_o2nm(void)
 	if (!ocfs2_table_header) {
 		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
 		ret = -ENOMEM; /* or something. */
-		goto out;
+		goto out_o2net;
 	}
 
 	ret = o2net_register_hb_callbacks();
@@ -780,6 +780,8 @@ out_callbacks:
 	o2net_unregister_hb_callbacks();
 out_sysctl:
 	unregister_sysctl_table(ocfs2_table_header);
+out_o2net:
+	o2net_exit();
 out:
 	return ret;
 }
-- 
cgit v1.2.2


From b4df6ed8db0c387d38292e31f00adc4cd297ed5a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 22 Feb 2006 17:35:08 -0800
Subject: [PATCH] ocfs2: fix orphan recovery deadlock

Orphan dir recovery can deadlock with another process in
ocfs2_delete_inode() in some corner cases. Fix this by tracking recovery
state more closely and allowing it to handle inode wipes which might
deadlock.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/heartbeat.c |   1 +
 fs/ocfs2/inode.c     |  46 ++++++++++++++++++-
 fs/ocfs2/journal.c   | 124 ++++++++++++++++++++++++++++++++++++++-------------
 fs/ocfs2/ocfs2.h     |   4 ++
 fs/ocfs2/super.c     |  11 +++++
 5 files changed, 154 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0bbd22f46c80..cbfd45a97a63 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb)
 	ocfs2_node_map_init(&osb->mounted_map);
 	ocfs2_node_map_init(&osb->recovery_map);
 	ocfs2_node_map_init(&osb->umount_map);
+	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
 static void ocfs2_do_node_down(int node_num,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8122489c5762..315472a5c192 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -41,6 +41,7 @@
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
+#include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
@@ -544,6 +545,42 @@ bail:
 	return status;
 }
 
+/* 
+ * Serialize with orphan dir recovery. If the process doing
+ * recovery on this orphan dir does an iget() with the dir
+ * i_mutex held, we'll deadlock here. Instead we detect this
+ * and exit early - recovery will wipe this inode for us.
+ */
+static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
+					     int slot)
+{
+	int ret = 0;
+
+	spin_lock(&osb->osb_lock);
+	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
+		mlog(0, "Recovery is happening on orphan dir %d, will skip "
+		     "this inode\n", slot);
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* This signals to the orphan recovery process that it should
+	 * wait for us to handle the wipe. */
+	osb->osb_orphan_wipes[slot]++;
+out:
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
+					 int slot)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_orphan_wipes[slot]--;
+	spin_unlock(&osb->osb_lock);
+
+	wake_up(&osb->osb_wipe_event);
+}
+
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
@@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	/* We've already voted on this so it should be readonly - no
 	 * spinlock needed. */
 	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+
+	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+	if (status)
+		return status;
+
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
 						       orphaned_slot);
@@ -597,6 +639,7 @@ bail_unlock_dir:
 	brelse(orphan_dir_bh);
 bail:
 	iput(orphan_dir_inode);
+	ocfs2_signal_wipe_completion(osb, orphaned_slot);
 
 	return status;
 }
@@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode)
 
 	status = ocfs2_wipe_inode(inode, di_bh);
 	if (status < 0) {
-		mlog_errno(status);
+		if (status != -EDEADLK)
+			mlog_errno(status);
 		goto bail_unlock_inode;
 	}
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d329c9df90ae..4be801f4559b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1408,21 +1408,17 @@ bail:
 	return status;
 }
 
-static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot)
+static int ocfs2_queue_orphans(struct ocfs2_super *osb,
+			       int slot,
+			       struct inode **head)
 {
-	int status = 0;
-	int have_disk_lock = 0;
-	struct inode *inode = NULL;
-	struct inode *iter;
+	int status;
 	struct inode *orphan_dir_inode = NULL;
+	struct inode *iter;
 	unsigned long offset, blk, local;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = osb->sb;
-	struct ocfs2_inode_info *oi;
-
-	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
@@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	if  (!orphan_dir_inode) {
 		status = -ENOENT;
 		mlog_errno(status);
-		goto out;
-	}
+		return status;
+	}	
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
 	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
 		mlog_errno(status);
 		goto out;
 	}
-	have_disk_lock = 1;
 
 	offset = 0;
 	iter = NULL;
@@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		if (!bh)
 			status = -EINVAL;
 		if (status < 0) {
-			mutex_unlock(&orphan_dir_inode->i_mutex);
 			if (bh)
 				brelse(bh);
 			mlog_errno(status);
-			goto out;
+			goto out_unlock;
 		}
 
 		local = 0;
@@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 			if (!ocfs2_check_dir_entry(orphan_dir_inode,
 						  de, bh, local)) {
-				mutex_unlock(&orphan_dir_inode->i_mutex);
 				status = -EINVAL;
 				mlog_errno(status);
 				brelse(bh);
-				goto out;
+				goto out_unlock;
 			}
 
 			local += le16_to_cpu(de->rec_len);
@@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 			mlog(0, "queue orphan %"MLFu64"\n",
 			     OCFS2_I(iter)->ip_blkno);
-			OCFS2_I(iter)->ip_next_orphan = inode;
-			inode = iter;
+			/* No locking is required for the next_orphan
+			 * queue as there is only ever a single
+			 * process doing orphan recovery. */
+			OCFS2_I(iter)->ip_next_orphan = *head;
+			*head = iter;
 		}
 		brelse(bh);
 	}
-	mutex_unlock(&orphan_dir_inode->i_mutex);
 
+out_unlock:
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
-	have_disk_lock = 0;
-
+out:
+	mutex_unlock(&orphan_dir_inode->i_mutex);
 	iput(orphan_dir_inode);
-	orphan_dir_inode = NULL;
+	return status;
+}
+
+static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
+					      int slot)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = !osb->osb_orphan_wipes[slot];
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
+					     int slot)
+{
+	spin_lock(&osb->osb_lock);
+	/* Mark ourselves such that new processes in delete_inode()
+	 * know to quit early. */
+	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+	while (osb->osb_orphan_wipes[slot]) {
+		/* If any processes are already in the middle of an
+		 * orphan wipe on this dir, then we need to wait for
+		 * them. */
+		spin_unlock(&osb->osb_lock);
+		wait_event_interruptible(osb->osb_wipe_event,
+					 ocfs2_orphan_recovery_can_continue(osb, slot));
+		spin_lock(&osb->osb_lock);
+	}
+	spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
+					      int slot)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+}
+
+/*
+ * Orphan recovery. Each mounted node has it's own orphan dir which we
+ * must run during recovery. Our strategy here is to build a list of
+ * the inodes in the orphan dir and iget/iput them. The VFS does
+ * (most) of the rest of the work.
+ *
+ * Orphan recovery can happen at any time, not just mount so we have a
+ * couple of extra considerations.
+ *
+ * - We grab as many inodes as we can under the orphan dir lock -
+ *   doing iget() outside the orphan dir risks getting a reference on
+ *   an invalid inode.
+ * - We must be sure not to deadlock with other processes on the
+ *   system wanting to run delete_inode(). This can happen when they go
+ *   to lock the orphan dir and the orphan recovery process attempts to
+ *   iget() inside the orphan dir lock. This can be avoided by
+ *   advertising our state to ocfs2_delete_inode().
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	ocfs2_mark_recovering_orphan_dir(osb, slot);
+	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ocfs2_clear_recovering_orphan_dir(osb, slot);
+
+	/* Error here should be noted, but we want to continue with as
+	 * many queued inodes as we've got. */
+	if (ret)
+		mlog_errno(ret);
 
 	while (inode) {
 		oi = OCFS2_I(inode);
@@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		inode = iter;
 	}
 
-out:
-	if (have_disk_lock)
-		ocfs2_meta_unlock(orphan_dir_inode, 0);
-
-	if (orphan_dir_inode)
-		iput(orphan_dir_inode);
-
-	return status;
+	return ret;
 }
 
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 19360e3d842e..e89de9b6e491 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -287,6 +287,10 @@ struct ocfs2_super
 	struct inode			*osb_tl_inode;
 	struct buffer_head		*osb_tl_bh;
 	struct work_struct		osb_truncate_log_wq;
+
+	struct ocfs2_node_map		osb_recovering_orphan_dirs;
+	unsigned int			*osb_orphan_wipes;
+	wait_queue_head_t		osb_wipe_event;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 046824b6b625..8dd3aafec499 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1325,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
 
+	init_waitqueue_head(&osb->osb_wipe_event);
+	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
+					sizeof(*osb->osb_orphan_wipes),
+					GFP_KERNEL);
+	if (!osb->osb_orphan_wipes) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->s_feature_compat =
 		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat =
@@ -1638,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	if (osb->slot_info)
 		ocfs2_free_slot_info(osb->slot_info);
 
+	kfree(osb->osb_orphan_wipes);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
-- 
cgit v1.2.2


From 93cc9ac4555a9b95c78b2f5dfe536fe8196002a7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Feb 2006 16:53:05 -0800
Subject: ocfs2: Set .owner on masklog sysfs attributes.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/masklog.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index fd741cea5705..636593bf4d17 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,6 +74,7 @@ struct mlog_attribute {
 #define define_mask(_name) {			\
 	.attr = {				\
 		.name = #_name,			\
+		.owner = THIS_MODULE,		\
 		.mode = S_IRUGO | S_IWUSR,	\
 	},					\
 	.mask = ML_##_name,			\
-- 
cgit v1.2.2


From 110ba90858a7f619ff26c6b9b43c27b3c0872335 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 28 Feb 2006 17:58:36 -0800
Subject: ocfs2: Respond to on-disk corruption in the extent map code.

The extent map code has long noticed when the on-disk extent information
is corrupt.  However, so far it has only returned an error.  We should
take the filesystem read-only, as it is corrupt.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index b6ba292e9544..e6f207eebab4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -181,6 +181,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 			ret = -EBADR;
 			if (rec_end > OCFS2_I(inode)->ip_clusters) {
 				mlog_errno(ret);
+				ocfs2_error(inode->i_sb,
+					    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+					    i,
+					    le64_to_cpu(rec->e_blkno),
+					    OCFS2_I(inode)->ip_blkno,
+					    OCFS2_I(inode)->ip_clusters);
 				goto out_free;
 			}
 
@@ -226,6 +232,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 			ret = -EBADR;
 			if (blkno) {
 				mlog_errno(ret);
+				ocfs2_error(inode->i_sb,
+					    "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
+					    cpos, clusters,
+					    OCFS2_I(inode)->ip_blkno,
+					    blkno, i,
+					    le64_to_cpu(rec->e_blkno));
 				goto out_free;
 			}
 
@@ -238,6 +250,10 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 		 */
 		ret = -EBADR;
 		if (!blkno) {
+			ocfs2_error(inode->i_sb,
+				    "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
+				    cpos, clusters,
+				    OCFS2_I(inode)->ip_blkno);
 			mlog_errno(ret);
 			goto out_free;
 		}
@@ -266,6 +282,20 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 
 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
+
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+		    OCFS2_I(inode)->ip_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+				    i,
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno,
+				    OCFS2_I(inode)->ip_clusters);
+			return ret;
+		}
+
 		ret = ocfs2_extent_map_insert(inode, rec,
 					      le16_to_cpu(el->l_tree_depth));
 		if (ret) {
@@ -526,6 +556,10 @@ static int ocfs2_extent_map_insert(struct inode *inode,
 		    OCFS2_I(inode)->ip_map.em_clusters) {
 			ret = -EBADR;
 			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n",
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno);
 			return ret;
 		}
 
@@ -588,12 +622,12 @@ static int ocfs2_extent_map_insert(struct inode *inode,
  * Existing record in the extent map:
  *
  *	cpos = 10, len = 10
- * 	|---------|
+ *	|---------|
  *
  * New Record:
  *
  *	cpos = 10, len = 20
- * 	|------------------|
+ *	|------------------|
  *
  * The passed record is the new on-disk record.  The new_clusters value
  * is how many clusters were added to the file.  If the append is a
-- 
cgit v1.2.2


From b7668c72d2ae004363fb0588600bfa942e1b245c Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 28 Feb 2006 23:28:01 -0800
Subject: [PATCH] ocfs2: added source addr to bind() in o2net_start_connect()

to prevent confusion when a virtual ip is created on the same interface

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d22d4cf08db1..0f60cc0d3985 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1318,7 +1318,7 @@ static void o2net_start_connect(void *arg)
 {
 	struct o2net_node *nn = arg;
 	struct o2net_sock_container *sc = NULL;
-	struct o2nm_node *node = NULL;
+	struct o2nm_node *node = NULL, *mynode = NULL;
 	struct socket *sock = NULL;
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0;
@@ -1334,6 +1334,12 @@ static void o2net_start_connect(void *arg)
 		goto out;
 	}
 
+	mynode = o2nm_get_node_by_num(o2nm_this_node());
+	if (mynode == NULL) {
+		ret = 0;
+		goto out;
+	}
+
 	spin_lock(&nn->nn_lock);
 	/* see if we already have one pending or have given up */
 	if (nn->nn_sc || nn->nn_persistent_error)
@@ -1361,12 +1367,14 @@ static void o2net_start_connect(void *arg)
 	sock->sk->sk_allocation = GFP_ATOMIC;
 
 	myaddr.sin_family = AF_INET;
+	myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
 	myaddr.sin_port = (__force u16)htons(0); /* any port */
 
 	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
 			      sizeof(myaddr));
 	if (ret) {
-		mlog(0, "bind failed: %d\n", ret);
+		mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n",
+		     ret, NIPQUAD(mynode->nd_ipv4_address));
 		goto out;
 	}
 
@@ -1407,6 +1415,8 @@ out:
 		sc_put(sc);
 	if (node)
 		o2nm_node_put(node);
+	if (mynode)
+		o2nm_node_put(mynode);
 
 	return;
 }
-- 
cgit v1.2.2


From 81f2094a631df1ba275f4d4bd7ea5bacfd8dbcfc Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 28 Feb 2006 17:31:22 -0800
Subject: [PATCH] ocfs2: use hlists for lockres hash

Switch from list_head to hlist_head. Make the size of the hash dependent
upon the allocated area, rather than a constant.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  8 +++-----
 fs/ocfs2/dlm/dlmdebug.c    | 12 +++++-------
 fs/ocfs2/dlm/dlmdomain.c   | 39 +++++++++++++++++++--------------------
 fs/ocfs2/dlm/dlmmaster.c   |  4 ++--
 fs/ocfs2/dlm/dlmrecovery.c | 23 ++++++++++++-----------
 5 files changed, 41 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 23ceaa7127b4..9c772583744a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,9 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_BITS     7
-#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
-#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+#define DLM_HASH_BUCKETS     (PAGE_SIZE / sizeof(struct hlist_head))
 
 enum dlm_ast_type {
 	DLM_AST = 0,
@@ -87,7 +85,7 @@ enum dlm_ctxt_state {
 struct dlm_ctxt
 {
 	struct list_head list;
-	struct list_head *resources;
+	struct hlist_head *lockres_hash;
 	struct list_head dirty_list;
 	struct list_head purge_list;
 	struct list_head pending_asts;
@@ -217,7 +215,7 @@ struct dlm_lock_resource
 {
 	/* WARNING: Please see the comment in dlm_init_lockres before
 	 * adding fields here. */
-	struct list_head list;
+	struct hlist_node hash_node;
 	struct kref      refs;
 
 	/* please keep these next 3 in this order
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index f339fe27975a..54f61b76ab51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
 	struct dlm_lock_resource *res;
-	struct list_head *iter;
-	struct list_head *bucket;
+	struct hlist_node *iter;
+	struct hlist_head *bucket;
 	int i;
 
 	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
@@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 	}
 
 	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry(iter, struct dlm_lock_resource, list);
+	for (i=0; i<DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, iter, bucket, hash_node)
 			dlm_print_one_lock_resource(res);
-		}
 	}
 	spin_unlock(&dlm->spinlock);
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ee30837389c..8f3a9e3106fd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
 
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-	list_del_init(&lockres->list);
+	hlist_del_init(&lockres->hash_node);
 	dlm_lockres_put(lockres);
 }
 
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 		       struct dlm_lock_resource *res)
 {
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct qstr *q;
 
 	assert_spin_locked(&dlm->spinlock);
 
 	q = &res->lockname;
 	q->hash = full_name_hash(q->name, q->len);
-	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
 
 	/* get a reference for our hashtable */
 	dlm_lockres_get(res);
 
-	list_add_tail(&res->list, bucket);
+	hlist_add_head(&res->hash_node, bucket);
 }
 
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					 unsigned int len)
 {
 	unsigned int hash;
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *tmpres=NULL;
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 
 	mlog_entry("%.*s\n", len, name);
 
@@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 
 	hash = full_name_hash(name, len);
 
-	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
 
 	/* check for pre-existing lock */
-	list_for_each(iter, bucket) {
-		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+	hlist_for_each(iter, bucket) {
+		tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
 		if (tmpres->lockname.len == len &&
 		    memcmp(tmpres->lockname.name, name, len) == 0) {
 			dlm_lockres_get(tmpres);
@@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
 
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
-	if (dlm->resources)
-		free_page((unsigned long) dlm->resources);
+	if (dlm->lockres_hash)
+		free_page((unsigned long) dlm->lockres_hash);
 
 	if (dlm->name)
 		kfree(dlm->name);
@@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 	mlog(0, "Migrating locks from domain %s\n", dlm->name);
 restart:
 	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		while (!list_empty(&dlm->resources[i])) {
-			res = list_entry(dlm->resources[i].next,
-				     struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		while (!hlist_empty(&dlm->lockres_hash[i])) {
+			res = hlist_entry(dlm->lockres_hash[i].first,
+					  struct dlm_lock_resource, hash_node);
 			/* need reference when manually grabbing lockres */
 			dlm_lockres_get(res);
 			/* this should unhash the lockres
@@ -1191,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
-	if (!dlm->resources) {
+	dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->lockres_hash) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm->name);
 		kfree(dlm);
 		dlm = NULL;
 		goto leave;
 	}
-	memset(dlm->resources, 0, PAGE_SIZE);
 
-	for (i=0; i<DLM_HASH_SIZE; i++)
-		INIT_LIST_HEAD(&dlm->resources[i]);
+	for (i=0; i<DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
 
 	strcpy(dlm->name, domain);
 	dlm->key = key;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 2e2e95e69499..847dd3cc4cf5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref)
 
 	/* By the time we're ready to blow this guy away, we shouldn't
 	 * be on any lists. */
-	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!hlist_unhashed(&res->hash_node));
 	BUG_ON(!list_empty(&res->granted));
 	BUG_ON(!list_empty(&res->converting));
 	BUG_ON(!list_empty(&res->blocked));
@@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	init_waitqueue_head(&res->wq);
 	spin_lock_init(&res->spinlock);
-	INIT_LIST_HEAD(&res->list);
+	INIT_HLIST_NODE(&res->hash_node);
 	INIT_LIST_HEAD(&res->granted);
 	INIT_LIST_HEAD(&res->converting);
 	INIT_LIST_HEAD(&res->blocked);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ed76bda1a534..1e232000f3f7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1693,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct list_head *iter, *iter2, *bucket;
+	struct list_head *iter, *iter2;
+	struct hlist_node *hash_iter;
+	struct hlist_head *bucket;
+
 	struct dlm_lock_resource *res;
 
 	mlog_entry_void();
@@ -1717,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * for now we need to run the whole hash, clear
 	 * the RECOVERING state and set the owner
 	 * if necessary */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
 					mlog(0, "(this=%u) res %.*s owner=%u "
@@ -1852,10 +1854,10 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *res;
 	int i;
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct dlm_lock *lock;
 
 
@@ -1876,10 +1878,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 *    can be kicked again to see if any ASTs or BASTs
 	 *    need to be fired as a result.
 	 */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, iter, bucket, hash_node) {
  			/* always prune any $RECOVERY entries for dead nodes,
  			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
-- 
cgit v1.2.2


From 6a3124a3946c16159c3faf83e62ffdb5d1134b3a Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Thu, 2 Mar 2006 02:54:30 -0800
Subject: [PATCH] v9fs: fix atomic create open

In order to assure atomic create+open v9fs stores the open fid produced by
v9fs_vfs_create in the dentry, from where v9fs_file_open retrieves it and
associates it with the open file.

This patch modifies v9fs to use nameidata.intent.open values to do the atomic
create+open.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/fid.c       |  73 ++-------
 fs/9p/fid.h       |   5 +-
 fs/9p/v9fs_vfs.h  |   1 +
 fs/9p/vfs_file.c  | 106 +++++-------
 fs/9p/vfs_inode.c | 478 +++++++++++++++++++++++++++++++++++-------------------
 fs/9p/vfs_super.c |  12 +-
 6 files changed, 379 insertions(+), 296 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index eda449778fa5..c27f546dd25b 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -40,7 +40,7 @@
  *
  */
 
-static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
+int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
 {
 	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
 	dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
@@ -68,14 +68,11 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
  *
  */
 
-struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
-	struct v9fs_session_info *v9ses, int fid, int create)
+struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
 {
 	struct v9fs_fid *new;
 
-	dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n",
-		dentry, fid, create);
-
+	dprintk(DEBUG_9P, "fid create fid %d\n", fid);
 	new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
 	if (new == NULL) {
 		dprintk(DEBUG_ERROR, "Out of Memory\n");
@@ -85,19 +82,13 @@ struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
 	new->fid = fid;
 	new->v9ses = v9ses;
 	new->fidopen = 0;
-	new->fidcreate = create;
 	new->fidclunked = 0;
 	new->iounit = 0;
 	new->rdir_pos = 0;
 	new->rdir_fcall = NULL;
+	INIT_LIST_HEAD(&new->list);
 
-	if (v9fs_fid_insert(new, dentry) == 0)
 		return new;
-	else {
-		dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
-		kfree(new);
-		return NULL;
-	}
 }
 
 /**
@@ -119,7 +110,7 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
 static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
 {
 	int fidnum, cfidnum, err;
-	struct v9fs_fid *cfid;
+	struct v9fs_fid *cfid, *fid;
 	struct dentry *cde;
 	struct v9fs_session_info *v9ses;
 
@@ -158,7 +149,16 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
 		cde = cde->d_parent;
 	}
 
-	return v9fs_fid_create(dentry, v9ses, fidnum, 0);
+	fid = v9fs_fid_create(v9ses, fidnum);
+	if (fid) {
+		err = v9fs_fid_insert(fid, dentry);
+		if (err < 0) {
+			kfree(fid);
+			goto clunk_fid;
+		}
+	}
+
+	return fid;
 
 clunk_fid:
 	v9fs_t_clunk(v9ses, fidnum);
@@ -179,29 +179,12 @@ clunk_fid:
 struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
-	struct v9fs_fid *current_fid = NULL;
-	struct v9fs_fid *temp = NULL;
 	struct v9fs_fid *return_fid = NULL;
 
 	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
 
-	if (fid_list) {
-		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
-			if (!current_fid->fidcreate) {
-				return_fid = current_fid;
-				break;
-			}
-		}
-
-		if (!return_fid)
-			return_fid = current_fid;
-	}
-
-	/* we are at the root but didn't match */
-	if ((!return_fid) && (dentry->d_parent == dentry)) {
-		/* TODO: clone attach with new uid */
-		return_fid = current_fid;
-	}
+	if (fid_list)
+		return_fid = list_entry(fid_list->next, struct v9fs_fid, list);
 
 	if (!return_fid) {
 		struct dentry *par = current->fs->pwd->d_parent;
@@ -228,25 +211,3 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 
 	return return_fid;
 }
-
-struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry)
-{
-	struct list_head *fid_list;
-	struct v9fs_fid *fid, *ftmp, *ret;
-
-	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
-	fid_list = (struct list_head *)dentry->d_fsdata;
-	ret = NULL;
-	if (fid_list) {
-		list_for_each_entry_safe(fid, ftmp, fid_list, list) {
-			if (fid->fidcreate && fid->pid == current->pid) {
-				list_del(&fid->list);
-				ret = fid;
-				break;
-			}
-		}
-	}
-
-	dprintk(DEBUG_9P, "return %p\n", ret);
-	return ret;
-}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 84c673a44c83..7ccf0d064e25 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -33,7 +33,6 @@ struct v9fs_fid {
 
 	u32 fid;
 	unsigned char fidopen;	  /* set when fid is opened */
-	unsigned char fidcreate;  /* set when fid was just created */
 	unsigned char fidclunked; /* set when fid has already been clunked */
 
 	struct v9fs_qid qid;
@@ -56,5 +55,5 @@ struct v9fs_fid {
 struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct v9fs_fid *v9fs_fid_get_created(struct dentry *);
 void v9fs_fid_destroy(struct v9fs_fid *fid);
-struct v9fs_fid *v9fs_fid_create(struct dentry *,
-	struct v9fs_session_info *v9ses, int fid, int create);
+struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid);
+int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 69cf2905dc90..a759278acaae 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,3 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat);
 void v9fs_dentry_release(struct dentry *);
+int v9fs_uflags2omode(int uflags);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c7e14d917215..de3a129698da 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -53,94 +53,70 @@
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-	struct v9fs_fid *v9fid, *fid;
+	struct v9fs_fid *vfid;
 	struct v9fs_fcall *fcall = NULL;
-	int open_mode = 0;
-	unsigned int iounit = 0;
-	int newfid = -1;
-	long result = -1;
+	int omode;
+	int fid = V9FS_NOFID;
+	int err;
 
 	dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file);
 
-	v9fid = v9fs_fid_get_created(file->f_dentry);
-	if (!v9fid)
-		v9fid = v9fs_fid_lookup(file->f_dentry);
-
-	if (!v9fid) {
+	vfid = v9fs_fid_lookup(file->f_dentry);
+	if (!vfid) {
 		dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
 		return -EBADF;
 	}
 
-	if (!v9fid->fidcreate) {
-		fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
-		if (fid == NULL) {
-			dprintk(DEBUG_ERROR, "Out of Memory\n");
-			return -ENOMEM;
-		}
-
-		fid->fidopen = 0;
-		fid->fidcreate = 0;
-		fid->fidclunked = 0;
-		fid->iounit = 0;
-		fid->v9ses = v9ses;
-
-		newfid = v9fs_get_idpool(&v9ses->fidpool);
-		if (newfid < 0) {
+	fid = v9fs_get_idpool(&v9ses->fidpool);
+	if (fid < 0) {
 			eprintk(KERN_WARNING, "newfid fails!\n");
 			return -ENOSPC;
 		}
 
-		result =
-		    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL);
-
-		if (result < 0) {
-			v9fs_put_idpool(newfid, &v9ses->fidpool);
+	err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL);
+	if (err < 0) {
 			dprintk(DEBUG_ERROR, "rewalk didn't work\n");
-			return -EBADF;
+		goto put_fid;
+	}
+
+	vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+	if (vfid == NULL) {
+		dprintk(DEBUG_ERROR, "out of memory\n");
+		goto clunk_fid;
 		}
 
-		fid->fid = newfid;
-		v9fid = fid;
 		/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
 		/* translate open mode appropriately */
-		open_mode = file->f_flags & 0x3;
+	omode = v9fs_uflags2omode(file->f_flags);
+	err = v9fs_t_open(v9ses, fid, omode, &fcall);
+	if (err < 0) {
+		PRINT_FCALL_ERROR("open failed", fcall);
+		goto destroy_vfid;
+	}
 
-		if (file->f_flags & O_EXCL)
-			open_mode |= V9FS_OEXCL;
+	file->private_data = vfid;
+	vfid->fid = fid;
+	vfid->fidopen = 1;
+	vfid->fidclunked = 0;
+	vfid->iounit = fcall->params.ropen.iounit;
+	vfid->rdir_pos = 0;
+	vfid->rdir_fcall = NULL;
+	vfid->filp = file;
+	kfree(fcall);
 
-		if (v9ses->extended) {
-			if (file->f_flags & O_TRUNC)
-				open_mode |= V9FS_OTRUNC;
+	return 0;
 
-			if (file->f_flags & O_APPEND)
-				open_mode |= V9FS_OAPPEND;
-		}
+destroy_vfid:
+	v9fs_fid_destroy(vfid);
 
-		result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
-		if (result < 0) {
-			PRINT_FCALL_ERROR("open failed", fcall);
-			kfree(fcall);
-			return result;
-		}
+clunk_fid:
+	v9fs_t_clunk(v9ses, fid);
 
-		iounit = fcall->params.ropen.iounit;
+put_fid:
+	v9fs_put_idpool(fid, &v9ses->fidpool);
 		kfree(fcall);
-	} else {
-		/* create case */
-		newfid = v9fid->fid;
-		iounit = v9fid->iounit;
-		v9fid->fidcreate = 0;
-	}
-
-	file->private_data = v9fid;
-
-	v9fid->rdir_pos = 0;
-	v9fid->rdir_fcall = NULL;
-	v9fid->fidopen = 1;
-	v9fid->filp = file;
-	v9fid->iounit = iounit;
 
-	return 0;
+	return err;
 }
 
 /**
@@ -289,9 +265,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		total += result;
 	} while (count);
 
-	if(inode->i_mapping->nrpages)
 		invalidate_inode_pages2(inode->i_mapping);
-
 	return total;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 63e5b0398e8b..dce729d42869 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -125,6 +125,38 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 	return res;
 }
 
+int v9fs_uflags2omode(int uflags)
+{
+	int ret;
+
+	ret = 0;
+	switch (uflags&3) {
+	default:
+	case O_RDONLY:
+		ret = V9FS_OREAD;
+		break;
+
+	case O_WRONLY:
+		ret = V9FS_OWRITE;
+		break;
+
+	case O_RDWR:
+		ret = V9FS_ORDWR;
+		break;
+	}
+
+	if (uflags & O_EXCL)
+		ret |= V9FS_OEXCL;
+
+	if (uflags & O_TRUNC)
+		ret |= V9FS_OTRUNC;
+
+	if (uflags & O_APPEND)
+		ret |= V9FS_OAPPEND;
+
+	return ret;
+}
+
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
  * @v9ses: 9P session info (for determining extended mode)
@@ -163,7 +195,7 @@ v9fs_blank_wstat(struct v9fs_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
@@ -222,171 +254,135 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
-/**
- * v9fs_create - helper function to create files and directories
- * @dir: directory inode file is being created in
- * @file_dentry: dentry file is being created in
- * @perm: permissions file is being created with
- * @open_mode: resulting open mode for file
- *
- */
-
 static int
-v9fs_create(struct inode *dir,
-	    struct dentry *file_dentry,
-	    unsigned int perm, unsigned int open_mode)
+v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name,
+	u32 perm, u8 mode, u32 *fidp, struct v9fs_qid *qid, u32 *iounit)
 {
-	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
-	struct super_block *sb = dir->i_sb;
-	struct v9fs_fid *dirfid =
-	    v9fs_fid_lookup(file_dentry->d_parent);
-	struct v9fs_fid *fid = NULL;
-	struct inode *file_inode = NULL;
-	struct v9fs_fcall *fcall = NULL;
-	struct v9fs_qid qid;
-	int dirfidnum = -1;
-	long newfid = -1;
-	int result = 0;
-	unsigned int iounit = 0;
-	int wfidno = -1;
+	u32 fid;
 	int err;
+	struct v9fs_fcall *fcall;
 
-	perm = unixmode2p9mode(v9ses, perm);
-
-	dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
-		file_dentry, perm, open_mode);
-
-	if (!dirfid)
-		return -EBADF;
-
-	dirfidnum = dirfid->fid;
-	if (dirfidnum < 0) {
-		dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
-			dir->i_ino);
-		return -EBADF;
-	}
-
-	if (file_dentry->d_inode) {
-		dprintk(DEBUG_ERROR,
-			"Odd. There is an inode for dir %lu, name :%s:\n",
-			dir->i_ino, file_dentry->d_name.name);
-		return -EEXIST;
-	}
-
-	newfid = v9fs_get_idpool(&v9ses->fidpool);
-	if (newfid < 0) {
+	fid = v9fs_get_idpool(&v9ses->fidpool);
+	if (fid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		return -ENOSPC;
+		err = -ENOSPC;
+		goto error;
 	}
 
-	result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
-	if (result < 0) {
+	err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
+	if (err < 0) {
 		PRINT_FCALL_ERROR("clone error", fcall);
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
-		newfid = -1;
-		goto CleanUpFid;
+		goto error;
 	}
-
 	kfree(fcall);
-	fcall = NULL;
 
-	result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
-			       perm, open_mode, &fcall);
-	if (result < 0) {
+	err = v9fs_t_create(v9ses, fid, name, perm, mode, &fcall);
+	if (err < 0) {
 		PRINT_FCALL_ERROR("create fails", fcall);
-		goto CleanUpFid;
+		goto error;
 	}
 
-	iounit = fcall->params.rcreate.iounit;
-	qid = fcall->params.rcreate.qid;
+	if (iounit)
+		*iounit = fcall->params.rcreate.iounit;
+
+	if (qid)
+		*qid = fcall->params.rcreate.qid;
+
+	if (fidp)
+		*fidp = fid;
+
 	kfree(fcall);
-	fcall = NULL;
+	return 0;
 
-	if (!(perm&V9FS_DMDIR)) {
-		fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1);
-		dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate);
-		if (!fid) {
-			result = -ENOMEM;
-			goto CleanUpFid;
-		}
+error:
+	if (fid >= 0)
+		v9fs_put_idpool(fid, &v9ses->fidpool);
 
-		fid->qid = qid;
-		fid->iounit = iounit;
-	} else {
-		err = v9fs_t_clunk(v9ses, newfid);
-		newfid = -1;
-		if (err < 0)
-			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err);
-	}
+	kfree(fcall);
+	return err;
+}
+
+static struct v9fs_fid*
+v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
+{
+	int err;
+	u32 nfid;
+	struct v9fs_fid *ret;
+	struct v9fs_fcall *fcall;
 
-	/* walk to the newly created file and put the fid in the dentry */
-	wfidno = v9fs_get_idpool(&v9ses->fidpool);
-	if (wfidno < 0) {
+	nfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (nfid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		return -ENOSPC;
+		err = -ENOSPC;
+		goto error;
 	}
 
-	result = v9fs_t_walk(v9ses, dirfidnum, wfidno,
-		(char *) file_dentry->d_name.name, &fcall);
-	if (result < 0) {
-		PRINT_FCALL_ERROR("clone error", fcall);
-		v9fs_put_idpool(wfidno, &v9ses->fidpool);
-		wfidno = -1;
-		goto CleanUpFid;
+	err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
+		&fcall);
+
+	if (err < 0) {
+		PRINT_FCALL_ERROR("walk error", fcall);
+		v9fs_put_idpool(nfid, &v9ses->fidpool);
+		goto error;
 	}
+
 	kfree(fcall);
 	fcall = NULL;
+	ret = v9fs_fid_create(v9ses, nfid);
+	if (!ret) {
+		err = -ENOMEM;
+		goto clunk_fid;
+	}
 
-	if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) {
-		v9fs_put_idpool(wfidno, &v9ses->fidpool);
-
-		goto CleanUpFid;
+	err = v9fs_fid_insert(ret, dentry);
+	if (err < 0) {
+		v9fs_fid_destroy(ret);
+		goto clunk_fid;
 	}
 
-	if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
-	    (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
-	    (perm & V9FS_DMDEVICE))
-		return 0;
+	return ret;
 
-	result = v9fs_t_stat(v9ses, wfidno, &fcall);
-	if (result < 0) {
-		PRINT_FCALL_ERROR("stat error", fcall);
-		goto CleanUpFid;
-	}
+clunk_fid:
+	v9fs_t_clunk(v9ses, nfid);
+
+error:
+	kfree(fcall);
+	return ERR_PTR(err);
+}
 
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid,
+	struct super_block *sb)
+{
+	int err, umode;
+	struct inode *ret;
+	struct v9fs_fcall *fcall;
 
-	file_inode = v9fs_get_inode(sb,
-		p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode));
+	ret = NULL;
+	err = v9fs_t_stat(v9ses, fid, &fcall);
+	if (err) {
+		PRINT_FCALL_ERROR("stat error", fcall);
+		goto error;
+	}
 
-	if ((!file_inode) || IS_ERR(file_inode)) {
-		dprintk(DEBUG_ERROR, "create inode failed\n");
-		result = -EBADF;
-		goto CleanUpFid;
+	umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode);
+	ret = v9fs_get_inode(sb, umode);
+	if (IS_ERR(ret)) {
+		err = PTR_ERR(ret);
+		ret = NULL;
+		goto error;
 	}
 
-	v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb);
+	v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb);
 	kfree(fcall);
-	fcall = NULL;
-	file_dentry->d_op = &v9fs_dentry_operations;
-	d_instantiate(file_dentry, file_inode);
-
-	return 0;
+	return ret;
 
-      CleanUpFid:
+error:
 	kfree(fcall);
-	fcall = NULL;
+	if (ret)
+		iput(ret);
 
-	if (newfid >= 0) {
- 		err = v9fs_t_clunk(v9ses, newfid);
- 		if (err < 0)
- 			dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-	}
-	if (wfidno >= 0) {
- 		err = v9fs_t_clunk(v9ses, wfidno);
- 		if (err < 0)
- 			dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-	}
-	return result;
+	return ERR_PTR(err);
 }
 
 /**
@@ -440,20 +436,97 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	return result;
 }
 
+static int
+v9fs_open_created(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
 /**
  * v9fs_vfs_create - VFS hook to create files
  * @inode: directory inode that is being deleted
  * @dentry:  dentry that is being deleted
- * @perm: create permissions
+ * @mode: create permissions
  * @nd: path information
  *
  */
 
 static int
-v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
+v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		struct nameidata *nd)
 {
-	return v9fs_create(inode, dentry, perm, O_RDWR);
+	int err;
+	u32 fid, perm, iounit;
+	int flags;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid, *ffid;
+	struct inode *inode;
+	struct v9fs_qid qid;
+	struct file *filp;
+
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode);
+
+	if (nd && nd->flags & LOOKUP_OPEN)
+		flags = nd->intent.open.flags - 1;
+	else
+		flags = O_RDWR;
+
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, v9fs_uflags2omode(flags), &fid, &qid, &iounit);
+
+	if (err)
+		goto error;
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
+	}
+
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+
+	if (nd && nd->flags & LOOKUP_OPEN) {
+		ffid = v9fs_fid_create(v9ses, fid);
+		if (!ffid)
+			return -ENOMEM;
+
+		filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+		if (IS_ERR(filp)) {
+			v9fs_fid_destroy(ffid);
+			return PTR_ERR(filp);
+		}
+
+		ffid->rdir_pos = 0;
+		ffid->rdir_fcall = NULL;
+		ffid->fidopen = 1;
+		ffid->iounit = iounit;
+		ffid->filp = filp;
+		filp->private_data = ffid;
+	}
+
+	return 0;
+
+error:
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	if (inode)
+		iput(inode);
+
+	return err;
 }
 
 /**
@@ -464,9 +537,57 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
  *
  */
 
-static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode)
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY);
+	int err;
+	u32 fid, perm;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid;
+	struct inode *inode;
+
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
+
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, V9FS_OREAD, &fid, NULL, NULL);
+
+	if (err) {
+		dprintk(DEBUG_ERROR, "create error %d\n", err);
+		goto error;
+	}
+
+	err = v9fs_t_clunk(v9ses, fid);
+	if (err) {
+		dprintk(DEBUG_ERROR, "clunk error %d\n", err);
+		goto error;
+	}
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
+	}
+
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+	return 0;
+
+error:
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	return err;
 }
 
 /**
@@ -516,9 +637,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(-ENOSPC);
 	}
 
-	result =
-	    v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name,
-			NULL);
+	result = v9fs_t_walk(v9ses, dirfidnum, newfid,
+		(char *)dentry->d_name.name, NULL);
 	if (result < 0) {
 		v9fs_put_idpool(newfid, &v9ses->fidpool);
 		if (result == -ENOENT) {
@@ -551,13 +671,17 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid);
 
-	fid = v9fs_fid_create(dentry, v9ses, newfid, 0);
+	fid = v9fs_fid_create(v9ses, newfid);
 	if (fid == NULL) {
 		dprintk(DEBUG_ERROR, "couldn't insert\n");
 		result = -ENOMEM;
 		goto FreeFcall;
 	}
 
+	result = v9fs_fid_insert(fid, dentry);
+	if (result < 0)
+		goto FreeFcall;
+
 	fid->qid = fcall->params.rstat.stat.qid;
 
 	dentry->d_op = &v9fs_dentry_operations;
@@ -886,8 +1010,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	if (fcall->params.rstat.stat.extension.len+1 < buflen)
-		buflen = fcall->params.rstat.stat.extension.len + 1;
+	if (fcall->params.rstat.stat.extension.len < buflen)
+		buflen = fcall->params.rstat.stat.extension.len;
 
 	memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1);
 	buffer[buflen-1] = 0;
@@ -951,7 +1075,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
 	else {
-		len = v9fs_readlink(dentry, link, PATH_MAX);
+		len = v9fs_readlink(dentry, link, strlen(link));
 
 		if (len < 0) {
 			__putname(link);
@@ -983,53 +1107,75 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
 static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	int mode, const char *extension)
 {
-	int err, retval;
+	int err;
+	u32 fid, perm;
 	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid;
+	struct inode *inode;
 	struct v9fs_fcall *fcall;
-	struct v9fs_fid *fid;
 	struct v9fs_wstat wstat;
 
-	v9ses = v9fs_inode2v9ses(dir);
-	retval = -EPERM;
 	fcall = NULL;
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode);
 
 	if (!v9ses->extended) {
 		dprintk(DEBUG_ERROR, "not extended\n");
-		goto free_mem;
+		return -EPERM;
 	}
 
-	/* issue a create */
-	retval = v9fs_create(dir, dentry, mode, 0);
-	if (retval != 0)
-		goto free_mem;
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, V9FS_OREAD, &fid, NULL, NULL);
 
-	fid = v9fs_fid_get_created(dentry);
-	if (!fid) {
-		dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
-		goto free_mem;
+	if (err)
+		goto error;
+
+	err = v9fs_t_clunk(v9ses, fid);
+	if (err)
+		goto error;
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
 	}
 
 	/* issue a Twstat */
 	v9fs_blank_wstat(&wstat);
 	wstat.muid = v9ses->name;
 	wstat.extension = (char *) extension;
-	retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall);
-	if (retval < 0) {
-		PRINT_FCALL_ERROR("wstat error", fcall);
-		goto free_mem;
-	}
-
-	err = v9fs_t_clunk(v9ses, fid->fid);
+	err = v9fs_t_wstat(v9ses, vfid->fid, &wstat, &fcall);
 	if (err < 0) {
-		dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-		goto free_mem;
+		PRINT_FCALL_ERROR("wstat error", fcall);
+		goto error;
 	}
 
-	d_drop(dentry);		/* FID - will this also clunk? */
+	kfree(fcall);
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+	return 0;
 
-free_mem:
+error:
 	kfree(fcall);
-	return retval;
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	if (inode)
+		iput(inode);
+
+	return err;
+
 }
 
 /**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2c4fa75be025..0c85872be51a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -146,7 +146,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	inode->i_gid = gid;
 
 	root = d_alloc_root(inode);
-
 	if (!root) {
 		retval = -ENOMEM;
 		goto put_back_sb;
@@ -157,24 +156,27 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (stat_result < 0) {
 		dprintk(DEBUG_ERROR, "stat error\n");
+		kfree(fcall);
 		v9fs_t_clunk(v9ses, newfid);
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
 	} else {
 		/* Setup the Root Inode */
-		root_fid = v9fs_fid_create(root, v9ses, newfid, 0);
+		kfree(fcall);
+		root_fid = v9fs_fid_create(v9ses, newfid);
 		if (root_fid == NULL) {
 			retval = -ENOMEM;
 			goto put_back_sb;
 		}
 
+		retval = v9fs_fid_insert(root_fid, root);
+		if (retval < 0)
+			goto put_back_sb;
+
 		root_fid->qid = fcall->params.rstat.stat.qid;
 		root->d_inode->i_ino =
 		    v9fs_qid2ino(&fcall->params.rstat.stat.qid);
 		v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
 	}
 
-	kfree(fcall);
-
 	if (stat_result < 0) {
 		retval = stat_result;
 		goto put_back_sb;
-- 
cgit v1.2.2


From 74b8054c730785cd9db093e48f53337e521b6270 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 2 Mar 2006 02:54:32 -0800
Subject: [PATCH] v9fs: fix bug in atomic create open fix

Lucho's atomic create+open fix had a bug in the super block initialization
causing all mounts to fail.  He was freeing an fcall too early.  This patch
fixes that oversight.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_super.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 0c85872be51a..cdf787ee08de 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -160,7 +160,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		v9fs_t_clunk(v9ses, newfid);
 	} else {
 		/* Setup the Root Inode */
-		kfree(fcall);
 		root_fid = v9fs_fid_create(v9ses, newfid);
 		if (root_fid == NULL) {
 			retval = -ENOMEM;
@@ -168,8 +167,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		}
 
 		retval = v9fs_fid_insert(root_fid, root);
-		if (retval < 0)
+		if (retval < 0) {
+			kfree(fcall);
 			goto put_back_sb;
+		}
 
 		root_fid->qid = fcall->params.rstat.stat.qid;
 		root->d_inode->i_ino =
@@ -177,6 +178,8 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
 	}
 
+	kfree(fcall);
+
 	if (stat_result < 0) {
 		retval = stat_result;
 		goto put_back_sb;
-- 
cgit v1.2.2


From 46f6dac259717551916405ee3388de89fb152bca Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 2 Mar 2006 02:54:33 -0800
Subject: [PATCH] v9fs: simplify fid mapping

v9fs has been plagued by an over-complicated approach trying to map Linux
dentry semantics to Plan 9 fid semantics.  Our previous approach called for
aggressive flushing of the dcache resulting in several problems (including
wierd cwd behavior when running /bin/pwd).

This patch dramatically simplifies our handling of this fid management.  Fids
will not be clunked as promptly, but the new approach is more functionally
correct.  We now clunk un-open fids only when their dentry ref_count reaches 0
(and d_delete is called).

Another simplification is we no longer seek to match fids to the process-id or
uid of the action initiator.  The uid-matching will need to be revisited when
we fix the security model.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/fid.c        | 94 ++++--------------------------------------------------
 fs/9p/fid.h        |  1 -
 fs/9p/v9fs.c       |  1 +
 fs/9p/vfs_dentry.c | 45 +++++---------------------
 4 files changed, 15 insertions(+), 126 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index c27f546dd25b..c4d13bf904d2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -1,7 +1,7 @@
 /*
  * V9FS FID Management
  *
- *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -57,7 +57,6 @@ int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
 	}
 
 	fid->uid = current->uid;
-	fid->pid = current->pid;
 	list_add(&fid->list, fid_list);
 	return 0;
 }
@@ -88,7 +87,7 @@ struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
 	new->rdir_fcall = NULL;
 	INIT_LIST_HEAD(&new->list);
 
-		return new;
+	return new;
 }
 
 /**
@@ -103,76 +102,14 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
 	kfree(fid);
 }
 
-/**
- * v9fs_fid_walk_up - walks from the process current directory
- * 	up to the specified dentry.
- */
-static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
-{
-	int fidnum, cfidnum, err;
-	struct v9fs_fid *cfid, *fid;
-	struct dentry *cde;
-	struct v9fs_session_info *v9ses;
-
-	v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode);
-	cfid = v9fs_fid_lookup(current->fs->pwd);
-	if (cfid == NULL) {
-		dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n");
-		return ERR_PTR(-ENOENT);
-	}
-
-	cfidnum = cfid->fid;
-	cde = current->fs->pwd;
-	/* TODO: take advantage of multiwalk */
-
-	fidnum = v9fs_get_idpool(&v9ses->fidpool);
-	if (fidnum < 0) {
-		dprintk(DEBUG_ERROR, "could not get a new fid num\n");
-		err = -ENOENT;
-		goto clunk_fid;
-	}
-
-	while (cde != dentry) {
-		if (cde == cde->d_parent) {
-			dprintk(DEBUG_ERROR, "can't find dentry\n");
-			err = -ENOENT;
-			goto clunk_fid;
-		}
-
-		err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL);
-		if (err < 0) {
-			dprintk(DEBUG_ERROR, "problem walking to parent\n");
-			goto clunk_fid;
-		}
-
-		cfidnum = fidnum;
-		cde = cde->d_parent;
-	}
-
-	fid = v9fs_fid_create(v9ses, fidnum);
-	if (fid) {
-		err = v9fs_fid_insert(fid, dentry);
-		if (err < 0) {
-			kfree(fid);
-			goto clunk_fid;
-		}
-	}
-
-	return fid;
-
-clunk_fid:
-	v9fs_t_clunk(v9ses, fidnum);
-	return ERR_PTR(err);
-}
-
 /**
  * v9fs_fid_lookup - retrieve the right fid from a  particular dentry
  * @dentry: dentry to look for fid in
  * @type: intent of lookup (operation or traversal)
  *
- * search list of fids associated with a dentry for a fid with a matching
- * thread id or uid.  If that fails, look up the dentry's parents to see if you
- * can find a matching fid.
+ * find a fid in the dentry
+ *
+ * TODO: only match fids that have the same uid as current user
  *
  */
 
@@ -187,26 +124,7 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 		return_fid = list_entry(fid_list->next, struct v9fs_fid, list);
 
 	if (!return_fid) {
-		struct dentry *par = current->fs->pwd->d_parent;
-		int count = 1;
-		while (par != NULL) {
-			if (par == dentry)
-				break;
-			count++;
-			if (par == par->d_parent) {
-				dprintk(DEBUG_ERROR,
-					"got to root without finding dentry\n");
-				break;
-			}
-			par = par->d_parent;
-		}
-
-/* XXX - there may be some duplication we can get rid of */
-		if (par == dentry) {
-			return_fid = v9fs_fid_walk_up(dentry);
-			if (IS_ERR(return_fid))
-				return_fid = NULL;
-		}
+		dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n");
 	}
 
 	return return_fid;
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 7ccf0d064e25..1fc2dd08d75a 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -44,7 +44,6 @@ struct v9fs_fid {
 	struct v9fs_fcall *rdir_fcall;
 
 	/* management stuff */
-	pid_t pid;		/* thread associated with this fid */
 	uid_t uid;		/* user associated with this fid */
 
 	/* private data */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef3386549140..61352491ba36 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -397,6 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 
 	if (v9ses->afid != ~0) {
+		dprintk(DEBUG_ERROR, "afid not equal to ~0\n");
 		if (v9fs_t_clunk(v9ses, v9ses->afid))
 			dprintk(DEBUG_ERROR, "clunk failed\n");
 	}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 2dd806dac9f1..12c9cc926b71 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,47 +43,18 @@
 #include "fid.h"
 
 /**
- * v9fs_dentry_validate - VFS dcache hook to validate cache
- * @dentry:  dentry that is being validated
- * @nd: path data
+ * v9fs_dentry_delete - called when dentry refcount equals 0
+ * @dentry:  dentry in question
  *
- * dcache really shouldn't be used for 9P2000 as at all due to
- * potential attached semantics to directory traversal (walk).
- *
- * FUTURE: look into how to use dcache to allow multi-stage
- * walks in Plan 9 & potential for better dcache operation which
- * would remain valid for Plan 9 semantics.  Older versions
- * had validation via stat for those interested.  However, since
- * stat has the same approximate overhead as walk there really
- * is no difference.  The only improvement would be from a
- * time-decay cache like NFS has and that undermines the
- * synchronous nature of 9P2000.
+ * By returning 1 here we should remove cacheing of unused
+ * dentry components.
  *
  */
 
-static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
+int v9fs_dentry_delete(struct dentry *dentry)
 {
-	struct dentry *dc = current->fs->pwd;
-
-	dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
-	if (v9fs_fid_lookup(dentry)) {
-		dprintk(DEBUG_VFS, "VALID\n");
-		return 1;
-	}
-
-	while (dc != NULL) {
-		if (dc == dentry) {
-			dprintk(DEBUG_VFS, "VALID\n");
-			return 1;
-		}
-		if (dc == dc->d_parent)
-			break;
-
-		dc = dc->d_parent;
-	}
-
-	dprintk(DEBUG_VFS, "INVALID\n");
-	return 0;
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	return 1;
 }
 
 /**
@@ -118,6 +89,6 @@ void v9fs_dentry_release(struct dentry *dentry)
 }
 
 struct dentry_operations v9fs_dentry_operations = {
-	.d_revalidate = v9fs_dentry_validate,
+	.d_delete = v9fs_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
-- 
cgit v1.2.2


From c499ec24c31edf270e777a868ffd0daddcfe7ebd Mon Sep 17 00:00:00 2001
From: "Vladimir V. Saveliev" <vs@namesys.com>
Date: Thu, 2 Mar 2006 02:54:39 -0800
Subject: [PATCH] reiserfs: do not check if unsigned < 0

This patch fixes bugs in reiserfs where unsigned integers were checked
whether they are less then 0.

Signed-off-by: Vladimir V. Saveliev <vs@namesys.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Hans Reiser <reiser@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c    | 14 +++++++-------
 fs/reiserfs/inode.c   |  8 ++------
 fs/reiserfs/journal.c |  3 +--
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index f3473176c83a..be12879bb179 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1464,13 +1464,11 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 		   partially overwritten pages, if needed. And lock the pages,
 		   so that nobody else can access these until we are done.
 		   We get number of actual blocks needed as a result. */
-		blocks_to_allocate =
-		    reiserfs_prepare_file_region_for_write(inode, pos,
-							   num_pages,
-							   write_bytes,
-							   prepared_pages);
-		if (blocks_to_allocate < 0) {
-			res = blocks_to_allocate;
+		res = reiserfs_prepare_file_region_for_write(inode, pos,
+							     num_pages,
+							     write_bytes,
+							     prepared_pages);
+		if (res < 0) {
 			reiserfs_release_claimed_blocks(inode->i_sb,
 							num_pages <<
 							(PAGE_CACHE_SHIFT -
@@ -1478,6 +1476,8 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 			break;
 		}
 
+		blocks_to_allocate = res;
+
 		/* First we correct our estimate of how many blocks we need */
 		reiserfs_release_claimed_blocks(inode->i_sb,
 						(num_pages <<
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index b33d67bba2fd..d60f6238c66a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -627,11 +627,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	reiserfs_write_lock(inode->i_sb);
 	version = get_inode_item_key_version(inode);
 
-	if (block < 0) {
-		reiserfs_write_unlock(inode->i_sb);
-		return -EIO;
-	}
-
 	if (!file_capable(inode, block)) {
 		reiserfs_write_unlock(inode->i_sb);
 		return -EFBIG;
@@ -934,12 +929,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 				     //pos_in_item * inode->i_sb->s_blocksize,
 				     TYPE_INDIRECT, 3);	// key type is unimportant
 
+			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
+			       "green-805: invalid offset");
 			blocks_needed =
 			    1 +
 			    ((cpu_key_k_offset(&key) -
 			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
 			     s_blocksize_bits);
-			RFALSE(blocks_needed < 0, "green-805: invalid offset");
 
 			if (blocks_needed == 1) {
 				un = &unf_single;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b7a179560ab4..5a9d2722fa0a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2319,8 +2319,7 @@ static int journal_read(struct super_block *p_s_sb)
 		return 1;
 	}
 	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
-	if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 &&
-	    le32_to_cpu(jh->j_first_unflushed_offset) <
+	if (le32_to_cpu(jh->j_first_unflushed_offset) <
 	    SB_ONDISK_JOURNAL_SIZE(p_s_sb)
 	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
 		oldest_start =
-- 
cgit v1.2.2


From 3af1efe8a301f5b1c813f5f761cb1e10d6175605 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Thu, 2 Mar 2006 13:25:26 -0500
Subject: [PATCH] reiserfs: fix unaligned bitmap usage

The bitmaps associated with generation numbers for directory entries
are declared as an array of ints. On some platforms, this causes alignment
exceptions.

The following patch uses the standard bitmap declaration macros to
declare the bitmaps, fixing the problem.

Originally from Takashi Iwai.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/namei.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c8123308e060..284f7852de8b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -247,7 +247,7 @@ static int linear_search_in_dir_item(struct cpu_key *key,
 		/* mark, that this generation number is used */
 		if (de->de_gen_number_bit_string)
 			set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
-				(unsigned long *)de->de_gen_number_bit_string);
+				de->de_gen_number_bit_string);
 
 		// calculate pointer to name and namelen
 		de->de_entry_num = i;
@@ -431,7 +431,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	struct reiserfs_de_head *deh;
 	INITIALIZE_PATH(path);
 	struct reiserfs_dir_entry de;
-	int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1];
+	DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
 	int gen_number;
 	char small_buf[32 + DEH_SIZE];	/* 48 bytes now and we avoid kmalloc
 					   if we create file with short name */
@@ -486,7 +486,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 
 	/* find the proper place for the new entry */
 	memset(bit_string, 0, sizeof(bit_string));
-	de.de_gen_number_bit_string = (char *)bit_string;
+	de.de_gen_number_bit_string = bit_string;
 	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
 	if (retval != NAME_NOT_FOUND) {
 		if (buffer != small_buf)
@@ -508,7 +508,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	}
 
 	gen_number =
-	    find_first_zero_bit((unsigned long *)bit_string,
+	    find_first_zero_bit(bit_string,
 				MAX_GENERATION_NUMBER + 1);
 	if (gen_number > MAX_GENERATION_NUMBER) {
 		/* there is no free generation number */
-- 
cgit v1.2.2


From e77e6f3be93763ef88ccbaa9e0ebda5360d92f7c Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 5 Mar 2006 03:39:55 +0000
Subject: [CIFS] Always match oplock break (cache notification) to the right
 tcp session when multiply mounted.

Fixes slow response when cifs client is mounted to shares on multiple
servers and oplock break occurs (usually due to attempt to multiply open a
file).  When treeids on mutiple mounted shares match and we find the wrong
match first, we searched for the wrong cached files to send oplock break
response for which usually meant that no matching file was found and thus
the server would have to timeout the notification.  Oplock break timeout is
about 20 seconds on some servers so this could cause significantly slower
performance on file open calls in a few cases (in particular when multiple
shares are mounted from multiple servers, tree ids match, and we have a
cached file which is later opened multiple times).  This was the most
important of the bugs that was found and fixed at Connectathon
(interoperability testing event) this week.

Acked-by:  Shaggy (shaggy@austin.ibm.com)
Signed-off-by: Steve French (sfrench@us.ibm.com)
---
 fs/cifs/cifsproto.h | 2 +-
 fs/cifs/connect.c   | 2 +-
 fs/cifs/misc.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3c03aadaff0c..7b25463d3c14 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -52,7 +52,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
 			int * /* type of buf returned */ , const int long_op);
 extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
 extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
-extern int is_valid_oplock_break(struct smb_hdr *smb);
+extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ef5ae6f93c75..2a0c1f4ca0ae 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -630,7 +630,7 @@ multi_t2_fnd:
 					smallbuf = NULL;
 			}
 			wake_up_process(task_to_wake);
-		} else if ((is_valid_oplock_break(smb_buffer) == FALSE)
+		} else if ((is_valid_oplock_break(smb_buffer, server) == FALSE)
 		    && (isMultiRsp == FALSE)) {                          
 			cERROR(1, ("No task to wake, unknown frame rcvd!"));
 			cifs_dump_mem("Received Data is: ",(char *)smb_buffer,
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 812c6bb0fe38..432ba15e2c2d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -475,7 +475,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
 	return 0;
 }
 int
-is_valid_oplock_break(struct smb_hdr *buf)
+is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {    
 	struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)buf;
 	struct list_head *tmp;
@@ -535,7 +535,7 @@ is_valid_oplock_break(struct smb_hdr *buf)
 	read_lock(&GlobalSMBSeslock);
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if (tcon->tid == buf->Tid) {
+		if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
 			cifs_stats_inc(&tcon->num_oplock_brks);
 			list_for_each(tmp1,&tcon->openFileList){
 				netfile = list_entry(tmp1,struct cifsFileInfo,
-- 
cgit v1.2.2


From ff3aea0e68bfd46120ce2d08bc1f8240fa2bd36a Mon Sep 17 00:00:00 2001
From: Dave Johnson <djohnson@sw.starentnetworks.com>
Date: Mon, 6 Mar 2006 15:42:36 -0800
Subject: [PATCH] cramfs mounts provide corrupted content since 2.6.15

Fix handling of cramfs images created by util-linux containing empty
regular files.  Images created by cramfstools 1.x were ok.

Fill out inode contents in cramfs_iget5_set() instead of get_cramfs_inode()
to prevent issues if cramfs_iget5_test() is called with I_LOCK|I_NEW still
set.

Signed-off-by: Dave Johnson <djohnson+linux-kernel@sw.starentnetworks.com>
Cc: Olaf Hering <olh@suse.de>
Cc: Chris Mason <mason@suse.com>
Cc: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cramfs/inode.c | 60 +++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 7fe85415ae7c..8ad52f5bf255 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -36,7 +36,7 @@ static DECLARE_MUTEX(read_mutex);
 
 /* These two macros may change in future, to provide better st_ino
    semantics. */
-#define CRAMINO(x)	((x)->offset?(x)->offset<<2:1)
+#define CRAMINO(x)	(((x)->offset && (x)->size)?(x)->offset<<2:1)
 #define OFFSET(x)	((x)->i_ino)
 
 
@@ -66,8 +66,36 @@ static int cramfs_iget5_test(struct inode *inode, void *opaque)
 
 static int cramfs_iget5_set(struct inode *inode, void *opaque)
 {
+	static struct timespec zerotime;
 	struct cramfs_inode *cramfs_inode = opaque;
+	inode->i_mode = cramfs_inode->mode;
+	inode->i_uid = cramfs_inode->uid;
+	inode->i_size = cramfs_inode->size;
+	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_gid = cramfs_inode->gid;
+	/* Struct copy intentional */
+	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
 	inode->i_ino = CRAMINO(cramfs_inode);
+	/* inode->i_nlink is left 1 - arguably wrong for directories,
+	   but it's the best we can do without reading the directory
+           contents.  1 yields the right result in GNU find, even
+	   without -noleaf option. */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_fop = &generic_ro_fops;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &cramfs_dir_inode_operations;
+		inode->i_fop = &cramfs_directory_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &cramfs_aops;
+	} else {
+		inode->i_size = 0;
+		inode->i_blocks = 0;
+		init_special_inode(inode, inode->i_mode,
+			old_decode_dev(cramfs_inode->size));
+	}
 	return 0;
 }
 
@@ -77,37 +105,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
 					    cramfs_iget5_test, cramfs_iget5_set,
 					    cramfs_inode);
-	static struct timespec zerotime;
-
 	if (inode && (inode->i_state & I_NEW)) {
-		inode->i_mode = cramfs_inode->mode;
-		inode->i_uid = cramfs_inode->uid;
-		inode->i_size = cramfs_inode->size;
-		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-		inode->i_blksize = PAGE_CACHE_SIZE;
-		inode->i_gid = cramfs_inode->gid;
-		/* Struct copy intentional */
-		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
-		inode->i_ino = CRAMINO(cramfs_inode);
-		/* inode->i_nlink is left 1 - arguably wrong for directories,
-		   but it's the best we can do without reading the directory
-	           contents.  1 yields the right result in GNU find, even
-		   without -noleaf option. */
-		if (S_ISREG(inode->i_mode)) {
-			inode->i_fop = &generic_ro_fops;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else if (S_ISDIR(inode->i_mode)) {
-			inode->i_op = &cramfs_dir_inode_operations;
-			inode->i_fop = &cramfs_directory_operations;
-		} else if (S_ISLNK(inode->i_mode)) {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_data.a_ops = &cramfs_aops;
-		} else {
-			inode->i_size = 0;
-			inode->i_blocks = 0;
-			init_special_inode(inode, inode->i_mode,
-				old_decode_dev(cramfs_inode->size));
-		}
 		unlock_new_inode(inode);
 	}
 	return inode;
-- 
cgit v1.2.2


From ecbd3a632c8198744655b769c5c2b5a1455c1fba Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Mon, 6 Mar 2006 15:42:56 -0800
Subject: [PATCH] ramfs needs to update directory m/ctime on symlink

ramfs neglects to update the directory mtime and ctime fields when creating
a new symbolic link.  Ramfs was modified in 2.6.15 to update these fields
when other types of entries are created.  The symlink support is separate
from that other support, so that change did not cover quite all of the
possibilities.

All of the directory content manipulation entry points now seem to be
covered with respect to these time field updates.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index cde5d48994ae..14bd2246fb6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -137,6 +137,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 				inode->i_gid = dir->i_gid;
 			d_instantiate(dentry, inode);
 			dget(dentry);
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 		} else
 			iput(inode);
 	}
-- 
cgit v1.2.2


From 5ddfae16bddb12104fff63c36fb5901f1a3729fc Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Mon, 6 Mar 2006 15:42:57 -0800
Subject: [PATCH] smaps: hugepages fix

smaps doesn't have a hugepage pagetable walker. Skip walking hugepage
vmas.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0eaad41f4658..35787f907f12 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -289,7 +289,7 @@ static int show_smap(struct seq_file *m, void *v)
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
-	if (vma->vm_mm)
+	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
 	return show_map_internal(m, v, &mss);
 }
-- 
cgit v1.2.2


From ad820c5dd47dff9397ef1e94388bc6577983f68b Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Mon, 6 Mar 2006 15:42:58 -0800
Subject: [PATCH] smaps: shared fix

The point of the smaps "shared" is to count the number of pages that are
mapped by more than one process, according to Mauricio Lin.  However, smaps
uses page_count for this, so it will return a false positive for every page
that is mapped by just that one process, which is also in pagecache or
swapcache.  There are false positive situations for anonymous pages not in
swapcache as well: - page reclaim, migration - get_user_pages (eg.
direct-io, ptrace)

Use page_mapcount instead, to count the number of mappings to the page.

Use vm_normal_page so that weird things like /dev/mem aren't counted either.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 35787f907f12..91b7c15ab373 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,7 +204,6 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -214,12 +213,12 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		mss->resident += PAGE_SIZE;
-		pfn = pte_pfn(ptent);
-		if (!pfn_valid(pfn))
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
 			continue;
 
-		page = pfn_to_page(pfn);
-		if (page_count(page) >= 2) {
+		if (page_mapcount(page) >= 2) {
 			if (pte_dirty(ptent))
 				mss->shared_dirty += PAGE_SIZE;
 			else
-- 
cgit v1.2.2


From d19e9974084b4024abcfcfc9d8676c90d26994bb Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 7 Mar 2006 09:16:35 -0800
Subject: Simplify fifo_open() locking logic

We don't do interruptible waits for the pipe mutex anywhere else any
more either, so don't do it in fifo_open() either.

Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fifo.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fifo.c b/fs/fifo.c
index 923371b753ab..d13fcd3ec803 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -34,10 +34,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 
-	ret = -ERESTARTSYS;
-	if (mutex_lock_interruptible(PIPE_MUTEX(*inode)))
-		goto err_nolock_nocleanup;
-
+	mutex_lock(PIPE_MUTEX(*inode));
 	if (!inode->i_pipe) {
 		ret = -ENOMEM;
 		if(!pipe_new(inode))
@@ -140,8 +137,6 @@ err:
 
 err_nocleanup:
 	mutex_unlock(PIPE_MUTEX(*inode));
-
-err_nolock_nocleanup:
 	return ret;
 }
 
-- 
cgit v1.2.2


From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 8 Mar 2006 14:03:09 -0800
Subject: Mark the pipe file operations static

They aren't used (nor even really usable) outside of pipe.c anyway

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/pipe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index d722579df79a..8aada8e426f4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = {
 	.fasync		= pipe_rdwr_fasync,
 };
 
-struct file_operations read_pipe_fops = {
+static struct file_operations read_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
@@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = {
 	.fasync		= pipe_read_fasync,
 };
 
-struct file_operations write_pipe_fops = {
+static struct file_operations write_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
 	.write		= pipe_write,
@@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = {
 	.fasync		= pipe_write_fasync,
 };
 
-struct file_operations rdwr_pipe_fops = {
+static struct file_operations rdwr_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
-- 
cgit v1.2.2


From 4d6660eb3665f22d16aff466eb9d45df6102b254 Mon Sep 17 00:00:00 2001
From: Phillip Susi <psusi@cfl.rr.com>
Date: Tue, 7 Mar 2006 21:55:24 -0800
Subject: [PATCH] udf: fix uid/gid options and add uid/gid=ignore and forget
 options

Fix a bug in udf where it would write uid/gid = 0 to the disk for files
owned by the id given with the uid=/gid= mount options.  It also adds 4 new
mount options: uid/gid=forget and uid/gid=ignore.  Without any options the
id in core and on disk always match.  Giving uid/gid=nnn specifies a
default ID to be used in core when the on disk ID is -1.  uid/gid=ignore
forces the in core ID to allways be used no matter what the on disk ID is.
uid/gid=forget forces the on disk ID to always be written out as -1.

The use of these options allows you to override ownerships on a disk or
disable ownwership information from being written, allowing the media to be
used portably between different computers and possibly different users
without permissions issues that would require root to correct.

Signed-off-by: Phillip Susi <psusi@cfl.rr.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/udf/inode.c  | 16 ++++++++++++----
 fs/udf/super.c  | 18 +++++++++++++++++-
 fs/udf/udf_sb.h |  4 ++++
 3 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 395e582ee542..d04cff2273b6 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	}
 
 	inode->i_uid = le32_to_cpu(fe->uid);
-	if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
+	if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb,
+					UDF_FLAG_UID_IGNORE))
+		inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
 
 	inode->i_gid = le32_to_cpu(fe->gid);
-	if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
+	if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb,
+					UDF_FLAG_GID_IGNORE))
+		inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
 
 	inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
 	if (!inode->i_nlink)
@@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync)
 		return err;
 	}
 
-	if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid)
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
+		fe->uid = cpu_to_le32(-1);
+	else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid)
 		fe->uid = cpu_to_le32(inode->i_uid);
 
-	if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid)
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
+		fe->gid = cpu_to_le32(-1);
+	else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid)
 		fe->gid = cpu_to_le32(inode->i_gid);
 
 	udfperms =	((inode->i_mode & S_IRWXO)     ) |
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a6f49adc609..368d8f81fe54 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -269,7 +269,7 @@ enum {
 	Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
 	Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
 	Opt_rootdir, Opt_utf8, Opt_iocharset,
-	Opt_err
+	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
 };
 
 static match_table_t tokens = {
@@ -282,6 +282,10 @@ static match_table_t tokens = {
 	{Opt_adinicb, "adinicb"},
 	{Opt_shortad, "shortad"},
 	{Opt_longad, "longad"},
+	{Opt_uforget, "uid=forget"},
+	{Opt_uignore, "uid=ignore"},
+	{Opt_gforget, "gid=forget"},
+	{Opt_gignore, "gid=ignore"},
 	{Opt_gid, "gid=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_umask, "umask=%o"},
@@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt)
 				uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
 				break;
 #endif
+			case Opt_uignore:
+				uopt->flags |= (1 << UDF_FLAG_UID_IGNORE);
+				break;
+			case Opt_uforget:
+				uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
+				break;
+			case Opt_gignore:
+			    uopt->flags |= (1 << UDF_FLAG_GID_IGNORE);
+				break;
+			case Opt_gforget:
+			    uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+				break;
 			default:
 				printk(KERN_ERR "udf: bad mount option \"%s\" "
 						"or missing value\n", p);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 663669810be6..110f8d62616f 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,6 +20,10 @@
 #define UDF_FLAG_VARCONV		8
 #define UDF_FLAG_NLS_MAP		9
 #define UDF_FLAG_UTF8			10
+#define UDF_FLAG_UID_FORGET     11    /* save -1 for uid to disk */
+#define UDF_FLAG_UID_IGNORE     12    /* use sb uid instead of on disk uid */
+#define UDF_FLAG_GID_FORGET     13
+#define UDF_FLAG_GID_IGNORE     14
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
-- 
cgit v1.2.2


From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Tue, 7 Mar 2006 21:55:35 -0800
Subject: [PATCH] fix file counting

I have benchmarked this on an x86_64 NUMA system and see no significant
performance difference on kernbench.  Tested on both x86_64 and powerpc.

The way we do file struct accounting is not very suitable for batched
freeing.  For scalability reasons, file accounting was
constructor/destructor based.  This meant that nr_files was decremented
only when the object was removed from the slab cache.  This is susceptible
to slab fragmentation.  With RCU based file structure, consequent batched
freeing and a test program like Serge's, we just speed this up and end up
with a very fragmented slab -

llm22:~ # cat /proc/sys/fs/file-nr
587730  0       758844

At the same time, I see only a 2000+ objects in filp cache.  The following
patch I fixes this problem.

This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate percpu counter, nr_files, for now and all
accesses to it are through get_nr_files() api.  In the sysctl handler for
nr_files, we populate files_stat.nr_files before returning to user.

Counting files as an when they are created and destroyed (as opposed to
inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c     |  2 +-
 fs/file_table.c | 87 ++++++++++++++++++++++++++++++++++++---------------------
 2 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a173bba32666..11dc83092d4a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages)
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 768b58167543..44fabeaa9415 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,6 +5,7 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
+#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -19,52 +20,67 @@
 #include <linux/capability.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
+#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
+
+#include <asm/atomic.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-
 /* public. Not pretty! */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
+static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
- */
-void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags)
+static inline void file_free_rcu(struct rcu_head *head)
 {
-	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
-	}
+	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
+	kmem_cache_free(filp_cachep, f);
 }
 
-void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags)
+static inline void file_free(struct file *f)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	percpu_counter_dec(&nr_files);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
-static inline void file_free_rcu(struct rcu_head *head)
+/*
+ * Return the total number of open files in the system
+ */
+static int get_nr_files(void)
 {
-	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
-	kmem_cache_free(filp_cachep, f);
+	return percpu_counter_read_positive(&nr_files);
 }
 
-static inline void file_free(struct file *f)
+/*
+ * Return the maximum number of open files in the system
+ */
+int get_max_files(void)
 {
-	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+	return files_stat.max_files;
 }
+EXPORT_SYMBOL_GPL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	files_stat.nr_files = get_nr_files();
+	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+#else
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+#endif
 
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
@@ -78,14 +94,20 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
-				!capable(CAP_SYS_ADMIN))
-		goto over;
+	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+		/*
+		 * percpu_counters are inaccurate.  Do an expensive check before
+		 * we go and fail.
+		 */
+		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+			goto over;
+	}
 
 	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (f == NULL)
 		goto fail;
 
+	percpu_counter_inc(&nr_files);
 	memset(f, 0, sizeof(*f));
 	if (security_file_alloc(f))
 		goto fail_sec;
@@ -101,10 +123,10 @@ struct file *get_empty_filp(void)
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (get_nr_files() > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					files_stat.max_files);
-		old_max = files_stat.nr_files;
+					get_max_files());
+		old_max = get_nr_files();
 	}
 	goto fail;
 
@@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	percpu_counter_init(&nr_files);
 } 
-- 
cgit v1.2.2


From e96fb230cc97760e448327c0de612cfba94ca7bf Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 7 Mar 2006 21:55:36 -0800
Subject: [PATCH] jffs2: avoid divide-by-zero

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs2/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3e51dd1da8aa..cf55b221fc2b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		c->nextblock->dirty_size = 0;
 	}
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-	if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
+	if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
 		/* If we're going to start writing into a block which already
 		   contains data, and the end of the data isn't page-aligned,
 		   skip a little and align it. */
-- 
cgit v1.2.2


From 90f0094dc607abe384a412bfb7199fb667ab0735 Mon Sep 17 00:00:00 2001
From: Horst Hummel <horst.hummel@de.ibm.com>
Date: Tue, 7 Mar 2006 21:55:39 -0800
Subject: [PATCH] s390: dasd partition detection

DASD allows to open a device as soon as gendisk is registered, which means the
device is a fake device (capacity=0) and we do know nothing about blocksize
and partitions at that point of time.  In case the device is opened by
someone, the bdev and inode creation is done with the fake device info and the
following partition detection code is just using the wrong data.

To avoid this modify the DASD state machine to make sure that the open is
rejected until the device analysis is either finished or an unformatted device
was detected.

Signed-off-by: Horst Hummel <horst.hummel@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/ibm.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 78010ad60e47..1e4a93835fed 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -52,6 +52,7 @@ int
 ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
 	int blocksize, offset, size;
+	loff_t i_size;
 	dasd_information_t *info;
 	struct hd_geometry *geo;
 	char type[5] = {0,};
@@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char *data;
 	Sector sect;
 
+	blocksize = bdev_hardsect_size(bdev);
+	if (blocksize <= 0)
+		return 0;
+	i_size = i_size_read(bdev->bd_inode);
+	if (i_size == 0)
+		return 0;
+
 	if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL)
 		goto out_noinfo;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
@@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
 		goto out_noioctl;
-	
-	if ((blocksize = bdev_hardsect_size(bdev)) <= 0)
-		goto out_badsect;
 
 	/*
 	 * Get volume label, extract name and type.
@@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		} else {
 			printk("CMS1/%8s:", name);
 			offset = (info->label_block + 1);
-			size = bdev->bd_inode->i_size >> 9;
+			size = i_size >> 9;
 		}
 		put_partition(state, 1, offset*(blocksize >> 9),
 				 size-offset*(blocksize >> 9));
@@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		else
 			printk("(nonl)/%8s:", name);
 		offset = (info->label_block + 1);
-		size = (bdev->bd_inode->i_size >> 9);
+		size = i_size >> 9;
 		put_partition(state, 1, offset*(blocksize >> 9),
 				 size-offset*(blocksize >> 9));
 	}
@@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	return 1;
 	
 out_readerr:
-out_badsect:
 out_noioctl:
 	kfree(label);
 out_nolab:
-- 
cgit v1.2.2


From 731805b49489055c1548f7ccfbd44c9b84013264 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@advancedsolutions.com>
Date: Tue, 7 Mar 2006 21:55:42 -0800
Subject: [PATCH] v9fs: fix for access to unitialized variables or freed memory

Miscellaneous fixes related to accessing uninitialized variables or memory
that was already freed.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/9p.c        | 1 -
 fs/9p/trans_fd.c  | 1 +
 fs/9p/vfs_inode.c | 8 +++-----
 fs/9p/vfs_super.c | 1 -
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/9p.c b/fs/9p/9p.c
index 1a6d08761f39..f86a28d1d6a6 100644
--- a/fs/9p/9p.c
+++ b/fs/9p/9p.c
@@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
 	if (!rc)
 		return;
 
-	dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id);
 	v9ses = a;
 	if (rc->id == RCLUNK)
 		v9fs_put_idpool(fid, &v9ses->fidpool);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 1a28ef97a3d1..5b2ce21b10fa 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
 	if (!trans || trans->status != Connected || !ts)
 		return -EIO;
 
+	oldfs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
 	ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index dce729d42869..3ad8455f8577 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -265,8 +265,7 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name,
 	fid = v9fs_get_idpool(&v9ses->fidpool);
 	if (fid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		err = -ENOSPC;
-		goto error;
+		return -ENOSPC;
 	}
 
 	err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
@@ -313,8 +312,7 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
 	nfid = v9fs_get_idpool(&v9ses->fidpool);
 	if (nfid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		err = -ENOSPC;
-		goto error;
+		return ERR_PTR(-ENOSPC);
 	}
 
 	err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
@@ -612,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	int result = 0;
 
 	dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
-		dir, dentry->d_iname, dentry, nameidata);
+		dir, dentry->d_name.name, dentry, nameidata);
 
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index cdf787ee08de..d05318fa684e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -156,7 +156,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (stat_result < 0) {
 		dprintk(DEBUG_ERROR, "stat error\n");
-		kfree(fcall);
 		v9fs_t_clunk(v9ses, newfid);
 	} else {
 		/* Setup the Root Inode */
-- 
cgit v1.2.2


From 1efa3c05f8640c37ba89d54dfaa18504d21986ce Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 8 Mar 2006 16:46:08 -0800
Subject: [NET] compat ifconf: fix limits

A recent change to compat. dev_ifconf() in fs/compat_ioctl.c
causes ifconf data to be truncated 1 entry too early when copying it
to userspace.  The correct amount of data (length) is returned,
but the final entry is empty (zero, not filled in).
The for-loop 'i' check should use <= to allow the final struct
ifreq32 to be copied.  I also used the ifconf-corruption program
in kernel bugzilla #4746 to make sure that this change does not
re-introduce the corruption.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 537ac70edfe5..c666769a875d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
 	ifr = ifc.ifc_req;
 	ifr32 = compat_ptr(ifc32.ifcbuf);
 	for (i = 0, j = 0;
-             i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len;
+             i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
 	     i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
 		if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
 			return -EFAULT;
-- 
cgit v1.2.2


From 0ef675d491bd65028fa838015ebc6ce8abefab6f Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Thu, 9 Mar 2006 17:33:38 -0800
Subject: [PATCH] mtd: 64 bit fixes

Fix some bugs in mtd/jffs2 on 64bit platform.

The MEMGETBADBLOCK/MEMSETBADBLOCK ioctl are not listed in compat_ioctl.h.

And some variables in jffs2 are declared as uint32_t but used to hold
size_t values.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs2/nodelist.c  | 3 ++-
 fs/jffs2/readinode.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index b635e167a3fa..d4d0c41490cd 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 	int err = 0, pointed = 0;
 	struct jffs2_eraseblock *jeb;
 	unsigned char *buffer;
-	uint32_t crc, ofs, retlen, len;
+	uint32_t crc, ofs, len;
+	size_t retlen;
 
 	BUG_ON(tn->csize == 0);
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 5f0652df5d47..f1695642d0f7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
  * 	    negative error code on failure.
  */
 static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
-				struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp,
+				struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp,
 				uint32_t *latest_mctime, uint32_t *mctime_ver)
 {
 	struct jffs2_full_dirent *fd;
-- 
cgit v1.2.2


From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Sat, 11 Mar 2006 03:27:13 -0800
Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside

This patch fixes illegal __GFP_FS allocation inside ext3 transaction in
ext3_symlink().  Such allocation may re-enter ext3 code from
try_to_free_pages.  But JBD/ext3 code keeps a pointer to current journal
handle in task_struct and, hence, is not reentrable.

This bug led to "Assertion failure in journal_dirty_metadata()" messages.

http://bugzilla.openvz.org/show_bug.cgi?id=115

Signed-off-by: Andrey Savochkin <saw@saw.sw.com.sg>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/namei.c |  3 ++-
 fs/namei.c      | 13 +++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8bd8ac077704..b8f5cd1e540d 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2141,7 +2141,8 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = page_symlink(inode, symname, l);
+		err = __page_symlink(inode, symname, l,
+				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			ext3_dec_count(handle, inode);
 			ext3_mark_inode_dirty(handle, inode);
diff --git a/fs/namei.c b/fs/namei.c
index 557dcf395ca1..8dc2b038d5d9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 	}
 }
 
-int page_symlink(struct inode *inode, const char *symname, int len)
+int __page_symlink(struct inode *inode, const char *symname, int len,
+		gfp_t gfp_mask)
 {
 	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page;
 	int err = -ENOMEM;
 	char *kaddr;
 
+	page = find_or_create_page(mapping, 0, gfp_mask);
 	if (!page)
 		goto fail;
 	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
@@ -2654,6 +2656,12 @@ fail:
 	return err;
 }
 
+int page_symlink(struct inode *inode, const char *symname, int len)
+{
+	return __page_symlink(inode, symname, len,
+			mapping_gfp_mask(inode->i_mapping));
+}
+
 struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
@@ -2672,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len);
 EXPORT_SYMBOL(page_follow_link_light);
 EXPORT_SYMBOL(page_put_link);
 EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
-- 
cgit v1.2.2


From cd6ef84e6ac9454080707f2f338360f5d7e556fc Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sat, 11 Mar 2006 03:27:14 -0800
Subject: [PATCH] ext3: fix nobh mode for chattr +j inodes

One can do "chattr +j" on a file to change its journalling mode.  Fix
writeback mode with "nobh" handling for it.

Even though, we mount ext3 filesystem in writeback mode with "nobh" option,
some one can do "chattr +j" on a single file to force it to do journalled
mode.  In order to do journaling, ext3_block_truncate_page() need to
fallback to default case of creating buffers and adding them to transaction
etc.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/inode.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 3fc4238e9703..0384e539b88f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1624,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	 * For "nobh" option,  we can only work if we don't need to
 	 * read-in the page - otherwise we create buffers to do the IO.
 	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
-		if (PageUptodate(page)) {
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + offset, 0, length);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_page_dirty(page);
-			goto unlock;
-		}
+	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
+	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, 0, length);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+		goto unlock;
 	}
 
 	if (!page_has_buffers(page))
-- 
cgit v1.2.2


From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:46 -0800
Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT

Based on an original patch by Mike O'Connor and Greg Banks of SGI.

Mike states:

A normal user can panic an NFS client and cause a local DoS with
'judicious'(?) use of O_DIRECT.  Any O_DIRECT write to an NFS file where the
user buffer starts with a valid mapped page and contains an unmapped page,
will crash in this way.  I haven't followed the code, but O_DIRECT reads with
similar user buffers will probably also crash albeit in different ways.

Details: when nfs_get_user_pages() calls get_user_pages(), it detects and
correctly handles get_user_pages() returning an error, which happens if the
first page covered by the user buffer's address range is unmapped.  However,
if the first page is mapped but some subsequent page isn't, get_user_pages()
will return a positive number which is less than the number of pages requested
(this behaviour is sort of analagous to a short write() call and appears to be
intentional).  nfs_get_user_pages() doesn't detect this and hands off the
array of pages (whose last few elements are random rubbish from the newly
allocated array memory) to it's caller, whence they go to
nfs_direct_write_seg(), which then totally ignores the nr_pages it's given,
and calculates its own idea of how many pages are in the array from the user
buffer length.  Needless to say, when it comes to transmit those uninitialised
page* pointers, we see a crash in the network stack.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 04ab2fc360e7..4e9b3a1b36c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -57,6 +57,7 @@
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
 
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
 static kmem_cache_t *nfs_direct_cachep;
 
 /*
@@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
 					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
+		/*
+		 * If we got fewer pages than expected from get_user_pages(),
+		 * the user buffer runs off the end of a mapping; return EFAULT.
+		 */
+		if (result >= 0 && result < page_count) {
+			nfs_free_user_pages(*pages, result, 0);
+			*pages = NULL;
+			result = -EFAULT;
+		}
 	}
 	return result;
 }
-- 
cgit v1.2.2


From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:47 -0800
Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000

It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of
mapping them to kernel errors.  Problem spotted by Neil Horman
<nhorman@tuxdriver.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 984ca3454d04..f8c0066e02e1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	if (status == 0)
 		status = nfs4_do_fsinfo(server, fhandle, info);
 out:
-	return status;
+	return nfs4_map_errors(status);
 }
 
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
-- 
cgit v1.2.2


From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:49 -0800
Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock

In theory, NLM specs assure us that the server will only reply LCK_GRANTED or
LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request.

In practice, we should not assume this to be the case, and the code will
currently Oops if we do.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/lockd/clntproc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 220058d8616d..970b6a6aa337 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * reclaimed while we're stuck in the unlock call. */
 	fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED;
 
+	/*
+	 * Note: the server is supposed to either grant us the unlock
+	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
+	 * case, we want to unlock.
+	 */
+	do_vfs_lock(fl);
+
 	if (req->a_flags & RPC_TASK_ASYNC) {
 		status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
 					&nlmclnt_unlock_ops);
 		/* Hrmf... Do the unlock early since locks_remove_posix()
 		 * really expects us to free the lock synchronously */
-		do_vfs_lock(fl);
 		if (status < 0) {
 			nlmclnt_release_lockargs(req);
 			kfree(req);
@@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	if (status < 0)
 		return status;
 
-	do_vfs_lock(fl);
 	if (resp->status == NLM_LCK_GRANTED)
 		return 0;
 
-- 
cgit v1.2.2