From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 8 Mar 2006 14:03:09 -0800
Subject: Mark the pipe file operations static

They aren't used (nor even really usable) outside of pipe.c anyway

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/pipe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index d722579df79a..8aada8e426f4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = {
 	.fasync		= pipe_rdwr_fasync,
 };
 
-struct file_operations read_pipe_fops = {
+static struct file_operations read_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
@@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = {
 	.fasync		= pipe_read_fasync,
 };
 
-struct file_operations write_pipe_fops = {
+static struct file_operations write_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
 	.write		= pipe_write,
@@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = {
 	.fasync		= pipe_write_fasync,
 };
 
-struct file_operations rdwr_pipe_fops = {
+static struct file_operations rdwr_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
-- 
cgit v1.2.2


From 4d6660eb3665f22d16aff466eb9d45df6102b254 Mon Sep 17 00:00:00 2001
From: Phillip Susi <psusi@cfl.rr.com>
Date: Tue, 7 Mar 2006 21:55:24 -0800
Subject: [PATCH] udf: fix uid/gid options and add uid/gid=ignore and forget
 options

Fix a bug in udf where it would write uid/gid = 0 to the disk for files
owned by the id given with the uid=/gid= mount options.  It also adds 4 new
mount options: uid/gid=forget and uid/gid=ignore.  Without any options the
id in core and on disk always match.  Giving uid/gid=nnn specifies a
default ID to be used in core when the on disk ID is -1.  uid/gid=ignore
forces the in core ID to allways be used no matter what the on disk ID is.
uid/gid=forget forces the on disk ID to always be written out as -1.

The use of these options allows you to override ownerships on a disk or
disable ownwership information from being written, allowing the media to be
used portably between different computers and possibly different users
without permissions issues that would require root to correct.

Signed-off-by: Phillip Susi <psusi@cfl.rr.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/udf/inode.c  | 16 ++++++++++++----
 fs/udf/super.c  | 18 +++++++++++++++++-
 fs/udf/udf_sb.h |  4 ++++
 3 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 395e582ee542..d04cff2273b6 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	}
 
 	inode->i_uid = le32_to_cpu(fe->uid);
-	if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
+	if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb,
+					UDF_FLAG_UID_IGNORE))
+		inode->i_uid = UDF_SB(inode->i_sb)->s_uid;
 
 	inode->i_gid = le32_to_cpu(fe->gid);
-	if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
+	if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb,
+					UDF_FLAG_GID_IGNORE))
+		inode->i_gid = UDF_SB(inode->i_sb)->s_gid;
 
 	inode->i_nlink = le16_to_cpu(fe->fileLinkCount);
 	if (!inode->i_nlink)
@@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync)
 		return err;
 	}
 
-	if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid)
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
+		fe->uid = cpu_to_le32(-1);
+	else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid)
 		fe->uid = cpu_to_le32(inode->i_uid);
 
-	if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid)
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
+		fe->gid = cpu_to_le32(-1);
+	else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid)
 		fe->gid = cpu_to_le32(inode->i_gid);
 
 	udfperms =	((inode->i_mode & S_IRWXO)     ) |
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a6f49adc609..368d8f81fe54 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -269,7 +269,7 @@ enum {
 	Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
 	Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
 	Opt_rootdir, Opt_utf8, Opt_iocharset,
-	Opt_err
+	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
 };
 
 static match_table_t tokens = {
@@ -282,6 +282,10 @@ static match_table_t tokens = {
 	{Opt_adinicb, "adinicb"},
 	{Opt_shortad, "shortad"},
 	{Opt_longad, "longad"},
+	{Opt_uforget, "uid=forget"},
+	{Opt_uignore, "uid=ignore"},
+	{Opt_gforget, "gid=forget"},
+	{Opt_gignore, "gid=ignore"},
 	{Opt_gid, "gid=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_umask, "umask=%o"},
@@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt)
 				uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
 				break;
 #endif
+			case Opt_uignore:
+				uopt->flags |= (1 << UDF_FLAG_UID_IGNORE);
+				break;
+			case Opt_uforget:
+				uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
+				break;
+			case Opt_gignore:
+			    uopt->flags |= (1 << UDF_FLAG_GID_IGNORE);
+				break;
+			case Opt_gforget:
+			    uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+				break;
 			default:
 				printk(KERN_ERR "udf: bad mount option \"%s\" "
 						"or missing value\n", p);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 663669810be6..110f8d62616f 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -20,6 +20,10 @@
 #define UDF_FLAG_VARCONV		8
 #define UDF_FLAG_NLS_MAP		9
 #define UDF_FLAG_UTF8			10
+#define UDF_FLAG_UID_FORGET     11    /* save -1 for uid to disk */
+#define UDF_FLAG_UID_IGNORE     12    /* use sb uid instead of on disk uid */
+#define UDF_FLAG_GID_FORGET     13
+#define UDF_FLAG_GID_IGNORE     14
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
-- 
cgit v1.2.2


From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Tue, 7 Mar 2006 21:55:35 -0800
Subject: [PATCH] fix file counting

I have benchmarked this on an x86_64 NUMA system and see no significant
performance difference on kernbench.  Tested on both x86_64 and powerpc.

The way we do file struct accounting is not very suitable for batched
freeing.  For scalability reasons, file accounting was
constructor/destructor based.  This meant that nr_files was decremented
only when the object was removed from the slab cache.  This is susceptible
to slab fragmentation.  With RCU based file structure, consequent batched
freeing and a test program like Serge's, we just speed this up and end up
with a very fragmented slab -

llm22:~ # cat /proc/sys/fs/file-nr
587730  0       758844

At the same time, I see only a 2000+ objects in filp cache.  The following
patch I fixes this problem.

This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate percpu counter, nr_files, for now and all
accesses to it are through get_nr_files() api.  In the sysctl handler for
nr_files, we populate files_stat.nr_files before returning to user.

Counting files as an when they are created and destroyed (as opposed to
inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c     |  2 +-
 fs/file_table.c | 87 ++++++++++++++++++++++++++++++++++++---------------------
 2 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a173bba32666..11dc83092d4a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages)
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 768b58167543..44fabeaa9415 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,6 +5,7 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
+#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -19,52 +20,67 @@
 #include <linux/capability.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
+#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
+
+#include <asm/atomic.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-
 /* public. Not pretty! */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
+static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
- */
-void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags)
+static inline void file_free_rcu(struct rcu_head *head)
 {
-	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
-	}
+	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
+	kmem_cache_free(filp_cachep, f);
 }
 
-void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags)
+static inline void file_free(struct file *f)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	percpu_counter_dec(&nr_files);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
-static inline void file_free_rcu(struct rcu_head *head)
+/*
+ * Return the total number of open files in the system
+ */
+static int get_nr_files(void)
 {
-	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
-	kmem_cache_free(filp_cachep, f);
+	return percpu_counter_read_positive(&nr_files);
 }
 
-static inline void file_free(struct file *f)
+/*
+ * Return the maximum number of open files in the system
+ */
+int get_max_files(void)
 {
-	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+	return files_stat.max_files;
 }
+EXPORT_SYMBOL_GPL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	files_stat.nr_files = get_nr_files();
+	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+#else
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+#endif
 
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
@@ -78,14 +94,20 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
-				!capable(CAP_SYS_ADMIN))
-		goto over;
+	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+		/*
+		 * percpu_counters are inaccurate.  Do an expensive check before
+		 * we go and fail.
+		 */
+		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+			goto over;
+	}
 
 	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (f == NULL)
 		goto fail;
 
+	percpu_counter_inc(&nr_files);
 	memset(f, 0, sizeof(*f));
 	if (security_file_alloc(f))
 		goto fail_sec;
@@ -101,10 +123,10 @@ struct file *get_empty_filp(void)
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (get_nr_files() > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					files_stat.max_files);
-		old_max = files_stat.nr_files;
+					get_max_files());
+		old_max = get_nr_files();
 	}
 	goto fail;
 
@@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	percpu_counter_init(&nr_files);
 } 
-- 
cgit v1.2.2


From e96fb230cc97760e448327c0de612cfba94ca7bf Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 7 Mar 2006 21:55:36 -0800
Subject: [PATCH] jffs2: avoid divide-by-zero

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs2/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3e51dd1da8aa..cf55b221fc2b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		c->nextblock->dirty_size = 0;
 	}
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-	if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
+	if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) {
 		/* If we're going to start writing into a block which already
 		   contains data, and the end of the data isn't page-aligned,
 		   skip a little and align it. */
-- 
cgit v1.2.2


From 90f0094dc607abe384a412bfb7199fb667ab0735 Mon Sep 17 00:00:00 2001
From: Horst Hummel <horst.hummel@de.ibm.com>
Date: Tue, 7 Mar 2006 21:55:39 -0800
Subject: [PATCH] s390: dasd partition detection

DASD allows to open a device as soon as gendisk is registered, which means the
device is a fake device (capacity=0) and we do know nothing about blocksize
and partitions at that point of time.  In case the device is opened by
someone, the bdev and inode creation is done with the fake device info and the
following partition detection code is just using the wrong data.

To avoid this modify the DASD state machine to make sure that the open is
rejected until the device analysis is either finished or an unformatted device
was detected.

Signed-off-by: Horst Hummel <horst.hummel@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/ibm.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 78010ad60e47..1e4a93835fed 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -52,6 +52,7 @@ int
 ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
 	int blocksize, offset, size;
+	loff_t i_size;
 	dasd_information_t *info;
 	struct hd_geometry *geo;
 	char type[5] = {0,};
@@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char *data;
 	Sector sect;
 
+	blocksize = bdev_hardsect_size(bdev);
+	if (blocksize <= 0)
+		return 0;
+	i_size = i_size_read(bdev->bd_inode);
+	if (i_size == 0)
+		return 0;
+
 	if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL)
 		goto out_noinfo;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
@@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
 		goto out_noioctl;
-	
-	if ((blocksize = bdev_hardsect_size(bdev)) <= 0)
-		goto out_badsect;
 
 	/*
 	 * Get volume label, extract name and type.
@@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		} else {
 			printk("CMS1/%8s:", name);
 			offset = (info->label_block + 1);
-			size = bdev->bd_inode->i_size >> 9;
+			size = i_size >> 9;
 		}
 		put_partition(state, 1, offset*(blocksize >> 9),
 				 size-offset*(blocksize >> 9));
@@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		else
 			printk("(nonl)/%8s:", name);
 		offset = (info->label_block + 1);
-		size = (bdev->bd_inode->i_size >> 9);
+		size = i_size >> 9;
 		put_partition(state, 1, offset*(blocksize >> 9),
 				 size-offset*(blocksize >> 9));
 	}
@@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	return 1;
 	
 out_readerr:
-out_badsect:
 out_noioctl:
 	kfree(label);
 out_nolab:
-- 
cgit v1.2.2


From 731805b49489055c1548f7ccfbd44c9b84013264 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@advancedsolutions.com>
Date: Tue, 7 Mar 2006 21:55:42 -0800
Subject: [PATCH] v9fs: fix for access to unitialized variables or freed memory

Miscellaneous fixes related to accessing uninitialized variables or memory
that was already freed.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/9p.c        | 1 -
 fs/9p/trans_fd.c  | 1 +
 fs/9p/vfs_inode.c | 8 +++-----
 fs/9p/vfs_super.c | 1 -
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/9p.c b/fs/9p/9p.c
index 1a6d08761f39..f86a28d1d6a6 100644
--- a/fs/9p/9p.c
+++ b/fs/9p/9p.c
@@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
 	if (!rc)
 		return;
 
-	dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id);
 	v9ses = a;
 	if (rc->id == RCLUNK)
 		v9fs_put_idpool(fid, &v9ses->fidpool);
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 1a28ef97a3d1..5b2ce21b10fa 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len)
 	if (!trans || trans->status != Connected || !ts)
 		return -EIO;
 
+	oldfs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
 	ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index dce729d42869..3ad8455f8577 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -265,8 +265,7 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name,
 	fid = v9fs_get_idpool(&v9ses->fidpool);
 	if (fid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		err = -ENOSPC;
-		goto error;
+		return -ENOSPC;
 	}
 
 	err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
@@ -313,8 +312,7 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
 	nfid = v9fs_get_idpool(&v9ses->fidpool);
 	if (nfid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		err = -ENOSPC;
-		goto error;
+		return ERR_PTR(-ENOSPC);
 	}
 
 	err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
@@ -612,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	int result = 0;
 
 	dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
-		dir, dentry->d_iname, dentry, nameidata);
+		dir, dentry->d_name.name, dentry, nameidata);
 
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index cdf787ee08de..d05318fa684e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -156,7 +156,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (stat_result < 0) {
 		dprintk(DEBUG_ERROR, "stat error\n");
-		kfree(fcall);
 		v9fs_t_clunk(v9ses, newfid);
 	} else {
 		/* Setup the Root Inode */
-- 
cgit v1.2.2


From 1efa3c05f8640c37ba89d54dfaa18504d21986ce Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 8 Mar 2006 16:46:08 -0800
Subject: [NET] compat ifconf: fix limits

A recent change to compat. dev_ifconf() in fs/compat_ioctl.c
causes ifconf data to be truncated 1 entry too early when copying it
to userspace.  The correct amount of data (length) is returned,
but the final entry is empty (zero, not filled in).
The for-loop 'i' check should use <= to allow the final struct
ifreq32 to be copied.  I also used the ifconf-corruption program
in kernel bugzilla #4746 to make sure that this change does not
re-introduce the corruption.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 537ac70edfe5..c666769a875d 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
 	ifr = ifc.ifc_req;
 	ifr32 = compat_ptr(ifc32.ifcbuf);
 	for (i = 0, j = 0;
-             i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len;
+             i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len;
 	     i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) {
 		if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32)))
 			return -EFAULT;
-- 
cgit v1.2.2


From 0ef675d491bd65028fa838015ebc6ce8abefab6f Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Thu, 9 Mar 2006 17:33:38 -0800
Subject: [PATCH] mtd: 64 bit fixes

Fix some bugs in mtd/jffs2 on 64bit platform.

The MEMGETBADBLOCK/MEMSETBADBLOCK ioctl are not listed in compat_ioctl.h.

And some variables in jffs2 are declared as uint32_t but used to hold
size_t values.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs2/nodelist.c  | 3 ++-
 fs/jffs2/readinode.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index b635e167a3fa..d4d0c41490cd 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 	int err = 0, pointed = 0;
 	struct jffs2_eraseblock *jeb;
 	unsigned char *buffer;
-	uint32_t crc, ofs, retlen, len;
+	uint32_t crc, ofs, len;
+	size_t retlen;
 
 	BUG_ON(tn->csize == 0);
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 5f0652df5d47..f1695642d0f7 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r
  * 	    negative error code on failure.
  */
 static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
-				struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp,
+				struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp,
 				uint32_t *latest_mctime, uint32_t *mctime_ver)
 {
 	struct jffs2_full_dirent *fd;
-- 
cgit v1.2.2


From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@openvz.org>
Date: Sat, 11 Mar 2006 03:27:13 -0800
Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside

This patch fixes illegal __GFP_FS allocation inside ext3 transaction in
ext3_symlink().  Such allocation may re-enter ext3 code from
try_to_free_pages.  But JBD/ext3 code keeps a pointer to current journal
handle in task_struct and, hence, is not reentrable.

This bug led to "Assertion failure in journal_dirty_metadata()" messages.

http://bugzilla.openvz.org/show_bug.cgi?id=115

Signed-off-by: Andrey Savochkin <saw@saw.sw.com.sg>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/namei.c |  3 ++-
 fs/namei.c      | 13 +++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 8bd8ac077704..b8f5cd1e540d 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2141,7 +2141,8 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = page_symlink(inode, symname, l);
+		err = __page_symlink(inode, symname, l,
+				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			ext3_dec_count(handle, inode);
 			ext3_mark_inode_dirty(handle, inode);
diff --git a/fs/namei.c b/fs/namei.c
index 557dcf395ca1..8dc2b038d5d9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 	}
 }
 
-int page_symlink(struct inode *inode, const char *symname, int len)
+int __page_symlink(struct inode *inode, const char *symname, int len,
+		gfp_t gfp_mask)
 {
 	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page;
 	int err = -ENOMEM;
 	char *kaddr;
 
+	page = find_or_create_page(mapping, 0, gfp_mask);
 	if (!page)
 		goto fail;
 	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
@@ -2654,6 +2656,12 @@ fail:
 	return err;
 }
 
+int page_symlink(struct inode *inode, const char *symname, int len)
+{
+	return __page_symlink(inode, symname, len,
+			mapping_gfp_mask(inode->i_mapping));
+}
+
 struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
@@ -2672,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len);
 EXPORT_SYMBOL(page_follow_link_light);
 EXPORT_SYMBOL(page_put_link);
 EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
-- 
cgit v1.2.2


From cd6ef84e6ac9454080707f2f338360f5d7e556fc Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sat, 11 Mar 2006 03:27:14 -0800
Subject: [PATCH] ext3: fix nobh mode for chattr +j inodes

One can do "chattr +j" on a file to change its journalling mode.  Fix
writeback mode with "nobh" handling for it.

Even though, we mount ext3 filesystem in writeback mode with "nobh" option,
some one can do "chattr +j" on a single file to force it to do journalled
mode.  In order to do journaling, ext3_block_truncate_page() need to
fallback to default case of creating buffers and adding them to transaction
etc.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/inode.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 3fc4238e9703..0384e539b88f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1624,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 	 * For "nobh" option,  we can only work if we don't need to
 	 * read-in the page - otherwise we create buffers to do the IO.
 	 */
-	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
-		if (PageUptodate(page)) {
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + offset, 0, length);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_page_dirty(page);
-			goto unlock;
-		}
+	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
+	     ext3_should_writeback_data(inode) && PageUptodate(page)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, 0, length);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+		goto unlock;
 	}
 
 	if (!page_has_buffers(page))
-- 
cgit v1.2.2


From 143f412eb4c7cc48b9eb4381f9133b7d36c68075 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:46 -0800
Subject: [PATCH] NFS: Fix a potential panic in O_DIRECT

Based on an original patch by Mike O'Connor and Greg Banks of SGI.

Mike states:

A normal user can panic an NFS client and cause a local DoS with
'judicious'(?) use of O_DIRECT.  Any O_DIRECT write to an NFS file where the
user buffer starts with a valid mapped page and contains an unmapped page,
will crash in this way.  I haven't followed the code, but O_DIRECT reads with
similar user buffers will probably also crash albeit in different ways.

Details: when nfs_get_user_pages() calls get_user_pages(), it detects and
correctly handles get_user_pages() returning an error, which happens if the
first page covered by the user buffer's address range is unmapped.  However,
if the first page is mapped but some subsequent page isn't, get_user_pages()
will return a positive number which is less than the number of pages requested
(this behaviour is sort of analagous to a short write() call and appears to be
intentional).  nfs_get_user_pages() doesn't detect this and hands off the
array of pages (whose last few elements are random rubbish from the newly
allocated array memory) to it's caller, whence they go to
nfs_direct_write_seg(), which then totally ignores the nr_pages it's given,
and calculates its own idea of how many pages are in the array from the user
buffer length.  Needless to say, when it comes to transmit those uninitialised
page* pointers, we see a crash in the network stack.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 04ab2fc360e7..4e9b3a1b36c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -57,6 +57,7 @@
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
 
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
 static kmem_cache_t *nfs_direct_cachep;
 
 /*
@@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size,
 					page_count, (rw == READ), 0,
 					*pages, NULL);
 		up_read(&current->mm->mmap_sem);
+		/*
+		 * If we got fewer pages than expected from get_user_pages(),
+		 * the user buffer runs off the end of a mapping; return EFAULT.
+		 */
+		if (result >= 0 && result < page_count) {
+			nfs_free_user_pages(*pages, result, 0);
+			*pages = NULL;
+			result = -EFAULT;
+		}
 	}
 	return result;
 }
-- 
cgit v1.2.2


From c12e87f4652b1ba3be168b4f63a440399b941928 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:47 -0800
Subject: [PATCH] NFSv4: fix mount segfault on errors returned that are < -1000

It turns out that nfs4_proc_get_root() may return raw NFSv4 errors instead of
mapping them to kernel errors.  Problem spotted by Neil Horman
<nhorman@tuxdriver.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 984ca3454d04..f8c0066e02e1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	if (status == 0)
 		status = nfs4_do_fsinfo(server, fhandle, info);
 out:
-	return status;
+	return nfs4_map_errors(status);
 }
 
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
-- 
cgit v1.2.2


From 30f4e20a0d3492668f5065af582b5af2d1e4256b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 13 Mar 2006 21:20:49 -0800
Subject: [PATCH] NLM: Ensure we do not Oops in the case of an unlock

In theory, NLM specs assure us that the server will only reply LCK_GRANTED or
LCK_DENIED_GRACE_PERIOD to our NLM_UNLOCK request.

In practice, we should not assume this to be the case, and the code will
currently Oops if we do.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/lockd/clntproc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 220058d8616d..970b6a6aa337 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -662,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * reclaimed while we're stuck in the unlock call. */
 	fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED;
 
+	/*
+	 * Note: the server is supposed to either grant us the unlock
+	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
+	 * case, we want to unlock.
+	 */
+	do_vfs_lock(fl);
+
 	if (req->a_flags & RPC_TASK_ASYNC) {
 		status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
 					&nlmclnt_unlock_ops);
 		/* Hrmf... Do the unlock early since locks_remove_posix()
 		 * really expects us to free the lock synchronously */
-		do_vfs_lock(fl);
 		if (status < 0) {
 			nlmclnt_release_lockargs(req);
 			kfree(req);
@@ -680,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	if (status < 0)
 		return status;
 
-	do_vfs_lock(fl);
 	if (resp->status == NLM_LCK_GRANTED)
 		return 0;
 
-- 
cgit v1.2.2


From a488edc914aa1d766a4e2c982b5ae03d5657ec1b Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Tue, 14 Mar 2006 13:44:00 -0600
Subject: [PATCH] JFS: Take logsync lock before testing mp->lsn

This fixes a race where lsn could be cleared before taking the lock

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jfs/jfs_dmap.c | 7 ++-----
 fs/jfs/jfs_imap.c | 6 ++----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2967b7393415..79b5404db100 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap,
 
 		lastlblkno = lblkno;
 
+		LOGSYNC_LOCK(log, flags);
 		if (mp->lsn != 0) {
 			/* inherit older/smaller lsn */
 			logdiff(diffp, mp->lsn, log);
-			LOGSYNC_LOCK(log, flags);
 			if (difft < diffp) {
 				mp->lsn = lsn;
 
@@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap,
 			logdiff(diffp, mp->clsn, log);
 			if (difft > diffp)
 				mp->clsn = tblk->clsn;
-			LOGSYNC_UNLOCK(log, flags);
 		} else {
 			mp->log = log;
 			mp->lsn = lsn;
 
 			/* insert bp after tblock in logsync list */
-			LOGSYNC_LOCK(log, flags);
-
 			log->count++;
 			list_add(&mp->synclist, &tblk->synclist);
 
 			mp->clsn = tblk->clsn;
-			LOGSYNC_UNLOCK(log, flags);
 		}
+		LOGSYNC_UNLOCK(log, flags);
 	}
 
 	/* write the last buffer. */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 31b4aa13dd4b..4efa0d0eec39 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap,
 	 */
 	lsn = tblk->lsn;
 	log = JFS_SBI(tblk->sb)->log;
+	LOGSYNC_LOCK(log, flags);
 	if (mp->lsn != 0) {
 		/* inherit older/smaller lsn */
 		logdiff(difft, lsn, log);
 		logdiff(diffp, mp->lsn, log);
-		LOGSYNC_LOCK(log, flags);
 		if (difft < diffp) {
 			mp->lsn = lsn;
 			/* move mp after tblock in logsync list */
@@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap,
 		logdiff(diffp, mp->clsn, log);
 		if (difft > diffp)
 			mp->clsn = tblk->clsn;
-		LOGSYNC_UNLOCK(log, flags);
 	} else {
 		mp->log = log;
 		mp->lsn = lsn;
 		/* insert mp after tblock in logsync list */
-		LOGSYNC_LOCK(log, flags);
 		log->count++;
 		list_add(&mp->synclist, &tblk->synclist);
 		mp->clsn = tblk->clsn;
-		LOGSYNC_UNLOCK(log, flags);
 	}
+	LOGSYNC_UNLOCK(log, flags);
 	write_metapage(mp);
 	return (0);
 }
-- 
cgit v1.2.2


From 3fb962bde48c413bfa419ec4413037e87955dcb6 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@bruce>
Date: Wed, 15 Mar 2006 15:14:45 +1100
Subject: Fix a direct I/O locking issue revealed by the new mutex code.
 Affects only XFS (i.e. DIO_OWN_LOCKING case) - currently it is not possible
 to get i_mutex locking correct when using DIO_OWN direct I/O locking in a
 filesystem due to indeterminism in the possible return code/lock/unlock
 combinations.  This can cause a direct read to attempt a double i_mutex
 unlock inside XFS.

We're now ensuring __blockdev_direct_IO always exits with the
inode i_mutex (still) held for a direct reader.

Tested with the three different locking modes (via direct block
device access, ext3 and XFS) - both reading and writing; cannot
find any regressions resulting from this change, and it clearly
fixes the mutex_unlock warning originally reported here:
http://marc.theaimsgroup.com/?l=linux-kernel&m=114189068126253&w=2

Signed-off-by: Nathan Scott <nathans@sgi.com>
Acked-by: Christoph Hellwig <hch@lst.de>
---
 fs/direct-io.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 848044af7e16..27f3e787faca 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1155,15 +1155,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
  * For writes, i_mutex is not held on entry; it is never taken.
  *
  * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even though
- * it is internally dropped.
+ * For writes we are called under i_mutex and return with i_mutex held, even
+ * though it is internally dropped.
  * For reads, i_mutex is not held on entry, but it is taken and dropped before
  * returning.
  *
  * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
  *	uninitialised data, allowing parallel direct readers and writers)
  * For writes we are called without i_mutex, return without it, never touch it.
- * For reads, i_mutex is held on entry and will be released before returning.
+ * For reads we are called under i_mutex and return with i_mutex held, even
+ * though it may be internally dropped.
  *
  * Additional i_alloc_sem locking requirements described inline below.
  */
@@ -1182,7 +1183,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING);
+	int release_i_mutex = 0;
+	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
 		current->flags |= PF_SYNCWRITE;
@@ -1225,7 +1227,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	neither readers nor writers take any locks here
-	 *	(i_mutex is already held and release for writers here)
 	 */
 	dio->lock_type = dio_lock_type;
 	if (dio_lock_type != DIO_NO_LOCKING) {
@@ -1236,7 +1237,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 			mapping = iocb->ki_filp->f_mapping;
 			if (dio_lock_type != DIO_OWN_LOCKING) {
 				mutex_lock(&inode->i_mutex);
-				reader_with_isem = 1;
+				release_i_mutex = 1;
 			}
 
 			retval = filemap_write_and_wait_range(mapping, offset,
@@ -1248,7 +1249,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 			if (dio_lock_type == DIO_OWN_LOCKING) {
 				mutex_unlock(&inode->i_mutex);
-				reader_with_isem = 0;
+				acquire_i_mutex = 1;
 			}
 		}
 
@@ -1269,11 +1270,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 				nr_segs, blkbits, get_blocks, end_io, dio);
 
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		reader_with_isem = 0;
+		release_i_mutex = 0;
 
 out:
-	if (reader_with_isem)
+	if (release_i_mutex)
 		mutex_unlock(&inode->i_mutex);
+	else if (acquire_i_mutex)
+		mutex_lock(&inode->i_mutex);
 	if (rw & WRITE)
 		current->flags &= ~PF_SYNCWRITE;
 	return retval;
-- 
cgit v1.2.2


From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Tue, 14 Mar 2006 19:50:19 -0800
Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED

page migration currently simply retries a couple of times if try_to_unmap()
fails without inspecting the return code.

However, SWAP_FAIL indicates that the page is in a vma that has the
VM_LOCKED flag set (if ignore_refs ==1).  We can check for that return code
and avoid retrying the migration.

migrate_page_remove_references() now needs to return a reason why the
failure occured.  So switch migrate_page_remove_references to use -Exx
style error messages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 62cfd17dc5fe..a9b399402007 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,6 +3060,7 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct buffer_head *bh, *head;
+	int rc;
 
 	if (!mapping)
 		return -EAGAIN;
@@ -3069,8 +3070,9 @@ int buffer_migrate_page(struct page *newpage, struct page *page)
 
 	head = page_buffers(page);
 
-	if (migrate_page_remove_references(newpage, page, 3))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 3);
+	if (rc)
+		return rc;
 
 	bh = head;
 	do {
-- 
cgit v1.2.2


From f13b83580acef03a36c785dccc534ccdd7e43084 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 15 Mar 2006 17:37:32 +0100
Subject: [PATCH] fs/namespace.c:dup_namespace(): fix a use after free

The Coverity checker spotted the following bug in dup_namespace():

<--  snip  -->

        if (!new_ns->root) {
                up_write(&namespace_sem);
                kfree(new_ns);
                goto out;
        }
...
out:
        return new_ns;

<--  snip  -->

Callers expect a non-NULL result to not be freed.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 058a44865beb..39c81a8d6316 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1338,7 +1338,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 
 	new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
 	if (!new_ns)
-		goto out;
+		return NULL;
 
 	atomic_set(&new_ns->count, 1);
 	INIT_LIST_HEAD(&new_ns->list);
@@ -1352,7 +1352,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		goto out;
+		return NULL;
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -1393,7 +1393,6 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 	if (altrootmnt)
 		mntput(altrootmnt);
 
-out:
 	return new_ns;
 }
 
-- 
cgit v1.2.2


From 2d7f2ea9c989853310c7f6e8be52cc090cc8e66b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 15 Mar 2006 21:41:59 +0000
Subject: [PATCH] Fix ext2 readdir f_pos re-validation logic

This fixes not one, but _two_, silly (but admittedly hard to hit) bugs
in the ext2 filesystem "readdir()" function.  It also cleans up the code
to avoid the unnecessary goto mess.

The bugs were related to re-valiating the f_pos value after somebody had
either done an "lseek()" on the directory to an invalid offset, or when
the offset had become invalid due to a file being unlinked in the
directory.  The code would not only set the f_version too eagerly, it
would also not update f_pos appropriately for when the offset fixup took
place.

When that happened, we'd occasionally subsequently fail the readdir()
even when we shouldn't (no real harm done, but an ugly printk, and
obviously you would end up not necessarily seeing all entries).

Thanks to Masoud Sharbiani <masouds@google.com> who noticed the problem
and had a test-case for it, and also fixed up a thinko in the first
version of this patch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Masoud Sharbiani <masouds@google.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/dir.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7442bdd1267a..b3dbd716cd3a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -256,11 +256,10 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
-	int ret;
+	int need_revalidate = filp->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
-		goto success;
+		return 0;
 
 	if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
 		types = ext2_filetype_table;
@@ -275,12 +274,15 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 				   "bad page in #%lu",
 				   inode->i_ino);
 			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			return -EIO;
 		}
 		kaddr = page_address(page);
-		if (need_revalidate) {
-			offset = ext2_validate_entry(kaddr, offset, chunk_mask);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -289,9 +291,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 			if (de->rec_len == 0) {
 				ext2_error(sb, __FUNCTION__,
 					"zero-length directory entry");
-				ret = -EIO;
 				ext2_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
 				int over;
@@ -306,19 +307,14 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 						le32_to_cpu(de->inode), d_type);
 				if (over) {
 					ext2_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
 			filp->f_pos += le16_to_cpu(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	filp->f_version = inode->i_version;
-	return ret;
+	return 0;
 }
 
 /*
-- 
cgit v1.2.2


From 8532159f5521ba24e697f0d25970ae89ff62a1f2 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@hera.kernel.org>
Date: Thu, 16 Mar 2006 23:04:04 -0800
Subject: [PATCH] v9fs: fix overzealous dropping of dentry which breaks dcache

There is a d_drop in dir_release which caused problems as it invalidates
dcache entries too soon.  This was likely a part of the wierd cwd behavior
folks were seeing.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_dir.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index ae6d032b9b59..cd5eeb032d64 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -202,7 +202,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 		filp->private_data = NULL;
 	}
 
-	d_drop(filp->f_dentry);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 85c6932ef0c7a82c309f8728ddf29768001d794e Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Thu, 16 Mar 2006 23:04:02 -0800
Subject: [PATCH] nfsservctl(): remove user-triggerable printk

A user can use nfsservctl() to spam the logs.

This can happen because the arguments to the nfsservctl() system call are
versioned.  This is a good thing.  However, when a bad version is detected,
the kernel prints a message and then returns an error.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index 0d4cf9486068..1c72c7f85ddc 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -98,10 +98,8 @@ asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *r
 	if (copy_from_user(&version, &arg->ca_version, sizeof(int)))
 		return -EFAULT;
 
-	if (version != NFSCTL_VERSION) {
-		printk(KERN_WARNING "nfsd: incompatible version in syscall.\n");
+	if (version != NFSCTL_VERSION)
 		return -EINVAL;
-	}
 
 	if (cmd < 0 || cmd >= sizeof(map)/sizeof(map[0]) || !map[cmd].name)
 		return -EINVAL;
-- 
cgit v1.2.2