From f04de505e3fa322728d1a851e08bf7060b117743 Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Tue, 21 Oct 2008 13:25:51 +0100
Subject: [JFFS2] Fix build failure with !CONFIG_JFFS2_FS_WRITEBUFFER

Build failure introduced by 5bf1723723487ddb0b9c9641b6559da96b27cc93
[JFFS2] Write buffer offset adjustment for NOR-ECC (Sibley) flash

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/nodemgmt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0875b60b4bf7..21a052915aa9 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,9 +261,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* adjust write buffer offset, else we get a non contiguous write bug */
 	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
 		c->wbuf_ofs = 0xffffffff;
+#endif
 
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
-- 
cgit v1.2.2


From 6c87df37dcb9c6c33923707fa5191e0a65874d60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 27 Oct 2008 22:38:27 +0300
Subject: proc: revert /proc/uptime to ->read_proc hook

Turned out some VMware userspace does pread(2) on /proc/uptime, but
seqfiles currently don't allow pread() resulting in -ESPIPE.

Seqfiles in theory can do pread(), but this can be a long story,
so revert to ->read_proc until then.

http://bugzilla.kernel.org/show_bug.cgi?id=11856

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/uptime.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..df26aa88fa47 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,43 +1,45 @@
-#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
-#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static int proc_calc_metrics(char *page, char **start, off_t off,
+				 int count, int *eof, int len)
+{
+	if (len <= off + count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+static int uptime_read_proc(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
 {
 	struct timespec uptime;
 	struct timespec idle;
+	int len;
 	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
 	cputime_to_timespec(idletime, &idle);
-	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+	len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
 			(unsigned long) idle.tv_sec,
 			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-	return 0;
+	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
-	.open		= uptime_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
 	return 0;
 }
 module_init(proc_uptime_init);
-- 
cgit v1.2.2


From 44d6f78756560e95903de239e10f8a40a6eae444 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:51:46 -0400
Subject: ext3: fix a bug accessing freed memory in ext3_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext3_put_super() calls
ext3_abort() after freeing the journal_t object, and then ext3_abort()
accesses it.  This patch fix it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext3/super.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18eaa78ecb4e..e5717a4fae67 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -281,7 +281,8 @@ void ext3_abort (struct super_block * sb, const char * function,
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
-	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+	if (EXT3_SB(sb)->s_journal)
+		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 
 void ext3_warning (struct super_block * sb, const char * function,
@@ -390,11 +391,14 @@ static void ext3_put_super (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext3_xattr_put_super(sb);
-	if (journal_destroy(sbi->s_journal) < 0)
+	err = journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
+	if (err < 0)
 		ext3_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.2


From ef2cabf7c6d838eb0ee2b4fb8ef84f7c06ce16d9 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:53:05 -0400
Subject: ext4: fix a bug accessing freed memory in ext4_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext4_put_super() calls
ext4_abort() after freeing the journal_t object, and then ext4_abort()
accesses it.  This patch fix it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bdddea14e782..994859df010e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,8 @@ void ext4_abort(struct super_block *sb, const char *function,
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
-	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+	if (EXT4_SB(sb)->s_journal)
+		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
 void ext4_warning(struct super_block *sb, const char *function,
@@ -442,14 +443,16 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	if (jbd2_journal_destroy(sbi->s_journal) < 0)
-		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+	err = jbd2_journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
+	if (err < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.2


From 6c20ec850360bc6e5c66a787f0523a80450d65ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 28 Oct 2008 21:08:20 -0400
Subject: jbd2: Call the commit callback before the transaction could get
 dropped

The transaction can potentially get dropped if there are no buffers
that need to be written.  Make sure we call the commit callback before
potentially deciding to drop the transaction.  Also avoid
dereferencing the commit_transaction pointer in the marker for the
same reason.

This patch fixes the bug reported by Eric Paris at:
http://bugzilla.kernel.org/show_bug.cgi?id=11838

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Eric Paris <eparis@redhat.com>
---
 fs/jbd2/commit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8b119e16aa36..ebc667bc54a8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -974,6 +974,9 @@ restart_loop:
 	journal->j_committing_transaction = NULL;
 	spin_unlock(&journal->j_state_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -995,11 +998,8 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (journal->j_commit_callback)
-		journal->j_commit_callback(journal, commit_transaction);
-
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_devname, journal->j_commit_sequence,
 		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
-- 
cgit v1.2.2


From 8c3bf8a01c005385e9be0bc992e10abfb355278c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:12 -0400
Subject: merge ext4_claim_free_blocks & ext4_has_free_blocks

Mingming pointed out that ext4_claim_free_blocks & ext4_has_free_blocks
are largely cut & pasted; they can be collapsed/merged as follows.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 57 ++++++++++++++++----------------------------------------
 fs/ext4/ext4.h   |  3 +--
 2 files changed, 17 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b9821be709bd..e28203ec45bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -589,8 +589,15 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	return;
 }
 
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						s64 nblocks)
+/**
+ * ext4_has_free_blocks()
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
+ *
+ * Check if filesystem has nblocks free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks;
 	s64 root_blocks = 0;
@@ -620,53 +627,21 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 	 */
 	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
-		return -ENOSPC;
+		return 0;
 
-	/* Add the blocks to nblocks */
-	percpu_counter_add(dbc, nblocks);
-	return 0;
+	return 1;
 }
 
-/**
- * ext4_has_free_blocks()
- * @sbi:	in-core super block structure.
- * @nblocks:	number of neeed blocks
- *
- * Check if filesystem has free blocks available for allocation.
- * Return the number of blocks avaible for allocation for this request
- * On success, return nblocks
- */
-ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
-	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
-	free_blocks  = percpu_counter_read_positive(fbc);
-	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
-
-	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
-	}
-	if (free_blocks <= (root_blocks + dirty_blocks))
-		/* we don't have free space */
+	if (ext4_has_free_blocks(sbi, nblocks)) {
+		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
-
-	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
-		return free_blocks - (root_blocks + dirty_blocks);
-	return nblocks;
+	} else
+		return -ENOSPC;
 }
 
-
 /**
  * ext4_should_retry_alloc()
  * @sb:			super block
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4880cc3e6727..b0537c827024 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1003,8 +1003,7 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-					 s64 nblocks);
+extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-- 
cgit v1.2.2


From a996031c87e093017c0763326a08896a3a4817f4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:17 -0400
Subject: delay capable() check in ext4_has_free_blocks()

As reported by Eric Paris, the capable() check in ext4_has_free_blocks()
sometimes causes SELinux denials.

We can rearrange the logic so that we only try to use the root-reserved
blocks when necessary, and even then we can move the capable() test
to last, to avoid the check most of the time.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e28203ec45bd..d2003cdc36aa 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -599,18 +599,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
  */
 int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
+	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
 	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
 	free_blocks  = percpu_counter_read_positive(fbc);
 	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
+	root_blocks = ext4_r_blocks_count(sbi->s_es);
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
@@ -623,13 +618,20 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 		}
 	}
 	/* Check whether we have space after
-	 * accounting for current dirty blocks
+	 * accounting for current dirty blocks & root reserved blocks.
 	 */
-	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
-		/* we don't have free space */
-		return 0;
+	if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+		return 1;
 
-	return 1;
+	/* Hm, nope.  Are (enough) root reserved blocks available? */
+	if (sbi->s_resuid == current->fsuid ||
+	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE)) {
+		if (free_blocks >= (nblocks + dirty_blocks))
+			return 1;
+	}
+
+	return 0;
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-- 
cgit v1.2.2


From ae05f269400533cbb32bfba131ab528d78dffd16 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 28 Oct 2008 15:21:40 -0400
Subject: NFS: Convert nfs_attr_generation_counter into an atomic_long

The most important property we need from nfs_attr_generation_counter is
monotonicity, which is not guaranteed by the current system of smp memory
barriers. We should convert it to an atomic_long_t, and drop the memory
barriers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dc52793ff8f8..d22eb383e1cf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -908,21 +908,16 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
-static unsigned long nfs_attr_generation_counter;
+static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
 {
-	smp_rmb();
-	return nfs_attr_generation_counter;
+	return atomic_long_read(&nfs_attr_generation_counter);
 }
 
 unsigned long nfs_inc_attr_generation_counter(void)
 {
-	unsigned long ret;
-	smp_rmb();
-	ret = ++nfs_attr_generation_counter;
-	smp_wmb();
-	return ret;
+	return atomic_long_inc_return(&nfs_attr_generation_counter);
 }
 
 void nfs_fattr_init(struct nfs_fattr *fattr)
-- 
cgit v1.2.2


From edf1ae403896cb7750800508b14996ba6be39a53 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 29 Oct 2008 00:47:57 +0000
Subject: [CIFS] Reduce number of socket retries in large write path

CIFS in some heavy stress conditions cifs could get EAGAIN
repeatedly in smb_send2 which led to repeated retries and eventually
failure of large writes which could lead to data corruption.

There are three changes that were suggested by various network
developers:

1) convert cifs from non-blocking to blocking tcp sendmsg
(we left in the retry on failure)
2) change cifs to not set sendbuf and rcvbuf size for the socket
(let tcp autotune the buffer sizes since that works much better
in the TCP stack now)
3) if we have a partial frame sent in smb_send2, mark the tcp
session as invalid (close the socket and reconnect) so we do
not corrupt the remaining part of the SMB with the beginning
of the next SMB.

This does not appear to hurt performance measurably and has
been run in various scenarios, but it definately removes
a corruption that we were seeing in some high stress
test cases.

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES     |  6 +++++-
 fs/cifs/cifsglob.h  |  2 ++
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/connect.c   | 50 +++++++++++++++++++++++++++++++++++++-------------
 fs/cifs/transport.c | 41 +++++++++++++++++++++++++++++++----------
 5 files changed, 76 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8f528ea24c48..8855331b2fba 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,11 @@ Various fixes to make delete of open files behavior more predictable
 (when delete of an open file fails we mark the file as "delete-on-close"
 in a way that more servers accept, but only if we can first rename the
 file to a temporary name).  Add experimental support for more safely
-handling fcntl(F_SETLEASE).
+handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests).
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c791e5b5a914..1cb1189f24e0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -141,6 +141,8 @@ struct TCP_Server_Info {
 	char versionMajor;
 	char versionMinor;
 	bool svlocal:1;			/* local server or remote */
+	bool noblocksnd;		/* use blocking sendmsg */
+	bool noautotune;		/* do not autotune send buf sizes */
 	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0cff7fe986e8..6f21ecb85ce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,7 +36,7 @@ extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
 extern int smb_send(struct socket *, struct smb_hdr *,
-			unsigned int /* length */ , struct sockaddr *);
+			unsigned int /* length */ , struct sockaddr *, bool);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71b7661e2260..e9f9248cb3fe 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,6 +92,8 @@ struct smb_vol {
 	bool seal:1;       /* request transport encryption on share */
 	bool nodfs:1;      /* Do not request DFS, even if available */
 	bool local_lease:1; /* check leases only on local system, not remote */
+	bool noblocksnd:1;
+	bool noautotune:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -102,9 +104,11 @@ struct smb_vol {
 static int ipv4_connect(struct sockaddr_in *psin_server,
 			struct socket **csocket,
 			char *netb_name,
-			char *server_netb_name);
+			char *server_netb_name,
+			bool noblocksnd,
+			bool nosndbuf); /* ipv6 never set sndbuf size */
 static int ipv6_connect(struct sockaddr_in6 *psin_server,
-			struct socket **csocket);
+			struct socket **csocket, bool noblocksnd);
 
 
 	/*
@@ -191,12 +195,13 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		try_to_freeze();
 		if (server->protocolType == IPV6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
-					  &server->ssocket);
+					  &server->ssocket, server->noautotune);
 		} else {
 			rc = ipv4_connect(&server->addr.sockAddr,
 					&server->ssocket,
 					server->workstation_RFC1001_name,
-					server->server_RFC1001_name);
+					server->server_RFC1001_name,
+					server->noblocksnd, server->noautotune);
 		}
 		if (rc) {
 			cFYI(1, ("reconnect error %d", rc));
@@ -1192,6 +1197,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			/* ignore */
 		} else if (strnicmp(data, "rw", 2) == 0) {
 			vol->rw = true;
+		} else if (strnicmp(data, "noblocksend", 11) == 0) {
+			vol->noblocksnd = 1;
+		} else if (strnicmp(data, "noautotune", 10) == 0) {
+			vol->noautotune = 1;
 		} else if ((strnicmp(data, "suid", 4) == 0) ||
 				   (strnicmp(data, "nosuid", 6) == 0) ||
 				   (strnicmp(data, "exec", 4) == 0) ||
@@ -1518,7 +1527,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 
 static int
 ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
-	     char *netbios_name, char *target_name)
+	     char *netbios_name, char *target_name,
+	     bool noblocksnd, bool noautotune)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1590,11 +1600,16 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 		 (*csocket)->sk->sk_sndbuf,
 		 (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo));
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 	/* make the bufsizes depend on wsize/rsize and max requests */
-	if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
-		(*csocket)->sk->sk_sndbuf = 200 * 1024;
-	if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
-		(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	if (noautotune) {
+		if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
+			(*csocket)->sk->sk_sndbuf = 200 * 1024;
+		if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
+			(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	}
 
 	/* send RFC1001 sessinit */
 	if (psin_server->sin_port == htons(RFC1001_PORT)) {
@@ -1631,7 +1646,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 			/* sizeof RFC1002_SESSION_REQUEST with no scope */
 			smb_buf->smb_buf_length = 0x81000044;
 			rc = smb_send(*csocket, smb_buf, 0x44,
-				(struct sockaddr *)psin_server);
+				(struct sockaddr *)psin_server, noblocksnd);
 			kfree(ses_init_buf);
 			msleep(1); /* RFC1001 layer in at least one server
 				      requires very short break before negprot
@@ -1651,7 +1666,8 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 }
 
 static int
-ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
+ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket,
+	     bool noblocksnd)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1720,6 +1736,9 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
 		the default. sock_setsockopt not used because it expects
 		user space buffer */
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 
 	return rc;
 }
@@ -1983,11 +2002,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket);
+			rc = ipv6_connect(&sin_server6, &csocket,
+					volume_info.noblocksnd);
 		} else
 			rc = ipv4_connect(&sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
-				  volume_info.target_rfc1001_name);
+				  volume_info.target_rfc1001_name,
+				  volume_info.noblocksnd,
+				  volume_info.noautotune);
 		if (rc < 0) {
 			cERROR(1, ("Error connecting to IPv4 socket. "
 				   "Aborting operation"));
@@ -2002,6 +2024,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			sock_release(csocket);
 			goto out;
 		} else {
+			srvTcp->noblocksnd = volume_info.noblocksnd;
+			srvTcp->noautotune = volume_info.noautotune;
 			memcpy(&srvTcp->addr.sockAddr, &sin_server,
 				sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bf0e6d8e382a..ba4d66644ebf 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -161,7 +161,7 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 
 int
 smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-	 unsigned int smb_buf_length, struct sockaddr *sin)
+	 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -178,7 +178,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -229,8 +232,8 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 }
 
 static int
-smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
-	  struct sockaddr *sin)
+smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
+	  struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -240,6 +243,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	unsigned int total_len;
 	int first_vec = 0;
 	unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+	struct socket *ssocket = server->ssocket;
 
 	if (ssocket == NULL)
 		return -ENOTSOCK; /* BB eventually add reconnect code here */
@@ -248,7 +252,10 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -312,6 +319,16 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 		i = 0; /* in case we get ENOSPC on the next send */
 	}
 
+	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
+		cFYI(1, ("partial send (%d remaining), terminating session",
+			total_len));
+		/* If we have only sent part of an SMB then the next SMB
+		   could be taken as the remainder of this one.  We need
+		   to kill the socket so the server throws away the partial
+		   SMB */
+		server->tcpStatus = CifsNeedReconnect;
+	}
+
 	if (rc < 0) {
 		cERROR(1, ("Error %d sending data on socket to server", rc));
 	} else
@@ -518,8 +535,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send2(ses->server->ssocket, iov, n_vec,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	rc = smb_send2(ses->server, iov, n_vec,
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		       ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -711,7 +729,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -851,7 +870,8 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
 		return rc;
 	}
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-	      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	      (struct sockaddr *) &(ses->server->addr.sockAddr),
+	      ses->server->noblocksnd);
 	up(&ses->server->tcpSem);
 	return rc;
 }
@@ -941,7 +961,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
-- 
cgit v1.2.2


From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 29 Oct 2008 14:00:55 -0700
Subject: fs: remove prepare_write/commit_write

Nothing uses prepare_write or commit_write. Remove them from the tree
completely.

[akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c  | 2 +-
 fs/libfs.c      | 2 +-
 fs/ocfs2/file.c | 3 +--
 fs/splice.c     | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c379..2b2eec1283bf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 	if (rw == WRITE) {
 		/*
-		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
 		 * so we need to update the ->mmu_private to block boundary.
 		 *
 		 * But we must fill the remaining area or hole by nul for
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf7..e960a8321902 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a78073..7efe937a415f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -679,8 +679,7 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c27156..1abab5cee4ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	};
 
 	/*
-	 * The actor worker might be calling ->prepare_write and
-	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * The actor worker might be calling ->write_begin and
+	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
 	 */
-- 
cgit v1.2.2


From 87b811c3f96559e466403e22b1fa99d472571625 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 29 Oct 2008 14:01:08 -0700
Subject: ecryptfs: fix memory corruption when storing crypto info in xattrs

When ecryptfs allocates space to write crypto headers into, before copying
it out to file headers or to xattrs, it looks at the value of
crypt_stat->num_header_bytes_at_front to determine how much space it
needs.  This is also used as the file offset to the actual encrypted data,
so for xattr-stored crypto info, the value was zero.

So, we kzalloc'd 0 bytes, and then ran off to write to that memory.
(Which returned as ZERO_SIZE_PTR, so we explode quickly).

The right answer is to always allocate a page to write into; the current
code won't ever write more than that (this is enforced by the
(PAGE_CACHE_SIZE - offset) length in the call to
ecryptfs_generate_key_packet_set).  To be explicit about this, we now send
in a "max" parameter, rather than magically using PAGE_CACHE_SIZE there.

Also, since the pointer we pass down the callchain eventually gets the
virt_to_page() treatment, we should be using a alloc_page variant, not
kzalloc (see also 7fcba054373d5dfc43d26e243a5c9b92069972ee)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 06db79d05c12..6046239465a1 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1251,6 +1251,7 @@ struct kmem_cache *ecryptfs_header_cache_2;
 /**
  * ecryptfs_write_headers_virt
  * @page_virt: The virtual address to write the headers to
+ * @max: The size of memory allocated at page_virt
  * @size: Set to the number of bytes written by this function
  * @crypt_stat: The cryptographic context
  * @ecryptfs_dentry: The eCryptfs dentry
@@ -1278,7 +1279,8 @@ struct kmem_cache *ecryptfs_header_cache_2;
  *
  * Returns zero on success
  */
-static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
+static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
+				       size_t *size,
 				       struct ecryptfs_crypt_stat *crypt_stat,
 				       struct dentry *ecryptfs_dentry)
 {
@@ -1296,7 +1298,7 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
 	offset += written;
 	rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
 					      ecryptfs_dentry, &written,
-					      PAGE_CACHE_SIZE - offset);
+					      max - offset);
 	if (rc)
 		ecryptfs_printk(KERN_WARNING, "Error generating key packet "
 				"set; rc = [%d]\n", rc);
@@ -1368,14 +1370,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out;
 	}
 	/* Released in this function */
-	virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL);
+	virt = (char *)get_zeroed_page(GFP_KERNEL);
 	if (!virt) {
 		printk(KERN_ERR "%s: Out of memory\n", __func__);
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = ecryptfs_write_headers_virt(virt, &size, crypt_stat,
-					 ecryptfs_dentry);
+	rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
+					 crypt_stat, ecryptfs_dentry);
 	if (unlikely(rc)) {
 		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
 		       __func__, rc);
@@ -1393,8 +1395,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out_free;
 	}
 out_free:
-	memset(virt, 0, crypt_stat->num_header_bytes_at_front);
-	kfree(virt);
+	free_page((unsigned long)virt);
 out:
 	return rc;
 }
-- 
cgit v1.2.2


From e74481e23283fb080d4591c258de20785cc3b6c3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 29 Oct 2008 14:01:10 -0700
Subject: fs: remove excess kernel-doc

Delete excess kernel-doc notation in fs/ subdirectory:

Warning(linux-2.6.27-git10//fs/jbd/transaction.c:886): Excess function parameter or struct member 'credits' description in 'journal_get_undo_access'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d15cd6e7251e..60d4c32c8808 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -860,7 +860,6 @@ out:
  * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
  * @handle: transaction
  * @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
  *
  * Sometimes there is a need to distinguish between metadata which has
  * been committed to disk and that which has not.  The ext3fs code uses
-- 
cgit v1.2.2


From 61de800d33af585cb7e6f27b5cdd51029c6855cb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 30 Oct 2008 20:15:22 +0000
Subject: [CIFS] fix error in smb_send2

smb_send2 exit logic was strange, and with the previous change
could cause us to fail large
smb writes when all of the smb was not sent as one chunk.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c   | 2 +-
 fs/cifs/file.c      | 2 +-
 fs/cifs/transport.c | 7 +++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 843a85fb8b9a..d5eac48fc415 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1536,7 +1536,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
-	/* cFYI(1,("write at %lld %d bytes",offset,count));*/
+	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 62d8bd8f14c0..ead1a3bb0256 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1824,7 +1824,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	pTcon = cifs_sb->tcon;
 
 	pagevec_init(&lru_pvec, 0);
-		cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, ("rpages: num pages %d", num_pages));
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ba4d66644ebf..ff8243a8fe3e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -290,8 +290,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 		if (rc < 0)
 			break;
 
-		if (rc >= total_len) {
-			WARN_ON(rc > total_len);
+		if (rc == total_len) {
+			total_len = 0;
+			break;
+		} else if (rc > total_len) {
+			cERROR(1, ("sent %d requested %d", rc, total_len));
 			break;
 		}
 		if (rc == 0) {
-- 
cgit v1.2.2


From 8d7c4203c681a3ec359eccff4e53bc8c0ccf403b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 30 Oct 2008 13:48:33 -0400
Subject: nfsd: fix failure to set eof in readdir in some situations

Before 14f7dd632011bb89c035722edd6ea0d90ca6b078 "[PATCH] Copy XFS
readdir hack into nfsd code", readdir_cd->err was reset to eof before
each call to vfs_readdir; afterwards, it is set only once.  Similarly,
c002a6c7977320f95b5edede5ce4e0eeecf291ff "[PATCH] Optimise NFS readdir
hack slightly", can cause us to exit without nfserr_eof set.  Fix this.

This ensures the "eof" bit is set when needed in readdir replies.  (The
particular case I saw was an nfsv4 readdir of an empty directory, which
returned with no entries (the protocol requires "." and ".." to be
filtered out), but with eof unset.)

Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0bc56f6d9276..848a03e83a42 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,6 +1912,7 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
+		cdp->err = nfserr_eof;
 		if (!buf.full)
 			break;
 	}
-- 
cgit v1.2.2


From d7dc61d0a70371b1c6557ea8ffbc60fff94c8168 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@ORACLE.COM>
Date: Thu, 23 Oct 2008 00:50:35 -0400
Subject: NLM: Set address family before calling nlm_host_rebooted()

The nlm_host_rebooted() function uses nlm_cmp_addr() to find an
nsm_handle that matches the rebooted peer.  In order for this to work,
the passed-in address must have a proper address family.

This fixes a post-2.6.28 regression introduced by commit 781b61a6, which
added AF_INET6 support to nlm_cmp_addr().  Before that commit,
nlm_cmp_addr() didn't care about the address family; it compared only
the sin_addr.s_addr field for equality.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc4proc.c | 1 +
 fs/lockd/svcproc.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 014f6ce48172..4dfdcbc6bf68 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -434,6 +434,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 548b0bb2b84d..3ca89e2a9381 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -466,6 +466,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
-- 
cgit v1.2.2


From b27cf88e9592953ae292d05324887f2f44979433 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Fri, 31 Oct 2008 14:52:24 +0000
Subject: [JFFS2] Fix lack of locking in thread_should_wake()

The thread_should_wake() function trawls through the list of 'very
dirty' eraseblocks, determining whether the background GC thread should
wake. Doing this without holding the appropriate locks is a bad idea.

OLPC Trac #8615

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: stable@kernel.org
---
 fs/jffs2/background.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 8adebd3e43c6..3cceef4ad2b7 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -85,15 +85,15 @@ static int jffs2_garbage_collect_thread(void *_c)
 	for (;;) {
 		allow_signal(SIGHUP);
 	again:
+		spin_lock(&c->erase_completion_lock);
 		if (!jffs2_thread_should_wake(c)) {
 			set_current_state (TASK_INTERRUPTIBLE);
+			spin_unlock(&c->erase_completion_lock);
 			D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n"));
-			/* Yes, there's a race here; we checked jffs2_thread_should_wake()
-			   before setting current->state to TASK_INTERRUPTIBLE. But it doesn't
-			   matter - We don't care if we miss a wakeup, because the GC thread
-			   is only an optimisation anyway. */
 			schedule();
-		}
+		} else
+			spin_unlock(&c->erase_completion_lock);
+			
 
 		/* This thread is purely an optimisation. But if it runs when
 		   other things could be running, it actually makes things a
-- 
cgit v1.2.2


From 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 31 Oct 2008 23:28:30 +0000
Subject: saner FASYNC handling on file close

As it is, all instances of ->release() for files that have ->fasync()
need to remember to evict file from fasync lists; forgetting that
creates a hole and we actually have a bunch that *does* forget.

So let's keep our lives simple - let __fput() check FASYNC in
file->f_flags and call ->fasync() there if it's been set.  And lose that
crap in ->release() instances - leaving it there is still valid, but we
don't have to bother anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c   | 4 ++++
 fs/fuse/dev.c     | 1 -
 fs/inotify_user.c | 3 ---
 fs/pipe.c         | 3 ---
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index efc06faede6c..5ad0eca6eea2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -269,6 +269,10 @@ void __fput(struct file *file)
 	eventpoll_release(file);
 	locks_remove_flock(file);
 
+	if (unlikely(file->f_flags & FASYNC)) {
+		if (file->f_op && file->f_op->fasync)
+			file->f_op->fasync(-1, file, 0);
+	}
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 87250b6a8682..b72361479be2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1056,7 +1056,6 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		spin_unlock(&fc->lock);
-		fasync_helper(-1, file, 0, &fc->fasync);
 		fuse_conn_put(fc);
 	}
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index d85c7d931cdf..d367e9b92862 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -537,9 +537,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
 		inotify_dev_event_dequeue(dev);
 	mutex_unlock(&dev->ev_mutex);
 
-	if (file->f_flags & FASYNC)
-		inotify_fasync(-1, file, 0);
-
 	/* free this device: the put matching the get in inotify_init() */
 	put_inotify_dev(dev);
 
diff --git a/fs/pipe.c b/fs/pipe.c
index fcba6542b8d0..7aea8b89baac 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -717,14 +717,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 static int
 pipe_read_release(struct inode *inode, struct file *filp)
 {
-	pipe_read_fasync(-1, filp, 0);
 	return pipe_release(inode, 1, 0);
 }
 
 static int
 pipe_write_release(struct inode *inode, struct file *filp)
 {
-	pipe_write_fasync(-1, filp, 0);
 	return pipe_release(inode, 0, 1);
 }
 
@@ -733,7 +731,6 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
 {
 	int decr, decw;
 
-	pipe_rdwr_fasync(-1, filp, 0);
 	decr = (filp->f_mode & FMODE_READ) != 0;
 	decw = (filp->f_mode & FMODE_WRITE) != 0;
 	return pipe_release(inode, decr, decw);
-- 
cgit v1.2.2


From e219cca082f52e7dfea41f3be264b7b5eb204227 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:37:59 -0500
Subject: jbd: don't give up looking for space so easily in
 __log_wait_for_space

Commit be07c4ed introducd a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes the bug reported by Meelis Roos at:
http://bugzilla.kernel.org/show_bug.cgi?id=11937

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd/checkpoint.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 1bd8d4acc6f2..61f32f3868cd 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__log_space_left(journal) < nblocks) {
+		space_left = __log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				log_do_checkpoint(journal);
+			} else if (cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space\n", __func__);
+				WARN_ON(1);
 				journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.2


From 8c3f25d8950c3e9fe6c9849f88679b3f2a071550 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:38:07 -0500
Subject: jbd2: don't give up looking for space so easily in
 __jbd2_log_wait_for_space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 23f8b79e introducd a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling jbd2_cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes a bug reported by Mihai Harpau at:
https://bugzilla.redhat.com/show_bug.cgi?id=469582

This patch fixes a bug reported by François Valenduc at:
http://bugzilla.kernel.org/show_bug.cgi?id=11840

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd2/checkpoint.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9203c3332f17..9497718fe920 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__jbd2_log_space_left(journal) < nblocks) {
+		space_left = __jbd2_log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				jbd2_log_do_checkpoint(journal);
+			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				jbd2_log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space in %s\n", __func__,
+				       journal->j_devname);
+				WARN_ON(1);
 				jbd2_journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.2


From 2423840ded13e6d3b52d88aff8d033bb78fafd08 Mon Sep 17 00:00:00 2001
From: Sami Liedes <sliedes@cc.hut.fi>
Date: Sun, 2 Nov 2008 19:23:30 -0500
Subject: jbd2: deregister proc on failure in jbd2_journal_init_inode

jbd2_journal_init_inode() does not call jbd2_stats_proc_exit() on all
failure paths after calling jbd2_stats_proc_init(). This leaves
dangling references to the fs in proc.

This patch fixes a bug reported by Sami Leides at:
http://bugzilla.kernel.org/show_bug.cgi?id=11493

Signed-off-by: Sami Liedes <sliedes@cc.hut.fi>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 783de118de92..e70d657a19f8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
@@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
-- 
cgit v1.2.2


From ae6884a9da56f8921e432e663b4ccb4a1851b2ea Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 3 Nov 2008 14:05:08 -0500
Subject: cifs: fix renaming one hardlink on top of another

cifs: fix renaming one hardlink on top of another

POSIX says that renaming one hardlink on top of another to the same
inode is a no-op. We had the logic mostly right, but forgot to clear
the return code.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d54fa8aeaea9..ff8c68de4a92 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1361,9 +1361,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (tmprc == 0 && (info_buf_source->UniqueId ==
-				   info_buf_target->UniqueId))
+				   info_buf_target->UniqueId)) {
 			/* same file, POSIX says that this is a noop */
+			rc = 0;
 			goto cifs_rename_exit;
+		}
 	} /* else ... BB we could add the same check for Windows by
 		     checking the UniqueId via FILE_INTERNAL_INFO */
 
-- 
cgit v1.2.2


From c527c8a7ffa18400c2c1488f7ab5aff5e83f3c8e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 3 Nov 2008 20:46:21 +0000
Subject: [CIFS] Can't rely on iov length and base when kernel_recvmsg returns
 error

When retrying kernel_recvmsg, reset iov_base and iov_len.

Note comment from Sridhar: "In the normal path, iov.iov_len is clearly set to 4. But i think you are
running into a case where kernel_recvmsg() is called via 'goto incomplete_rcv'
It happens if the previous call fails with EAGAIN.
If you want to call recvmsg() after EAGAIN failure, you need to reset iov."

Signed-off-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9f9248cb3fe..c682be8f2984 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -417,9 +417,14 @@ incomplete_rcv:
 			msleep(1); /* minimum sleep to prevent looping
 				allowing socket to clear and app threads to set
 				tcpStatus CifsNeedReconnect if server hung */
-			if (pdu_length < 4)
+			if (pdu_length < 4) {
+				iov.iov_base = (4 - pdu_length) +
+							(char *)smb_buffer;
+				iov.iov_len = pdu_length;
+				smb_msg.msg_control = NULL;
+				smb_msg.msg_controllen = 0;
 				goto incomplete_rcv;
-			else
+			} else
 				continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-- 
cgit v1.2.2


From ae2d9fb18e575ed37ffc241ece4bf68f0be4ae32 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:10:50 -0500
Subject: ext4: fix missing ext4_unlock_group in error path

If we try to free a block which is already freed, the code was
returning without first unlocking the group.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dfe17a134052..444ad998f72e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
+			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
 			    "Double free of blocks %d (%d %d)\n",
 			    block, entry->start_blk, entry->count);
-- 
cgit v1.2.2


From d94e99a64c3beece22dbfb2b335771a59184eb0a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:11:26 -0500
Subject: ext4: Convert to host order before using the values.

Use le16_to_cpu to read the s_reserved_gdt_blocks values
from super block.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 994859df010e..e27acd18b4b0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1458,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 
 	/* We allocate both existing and potentially added groups */
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
-			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
-			   groups_per_flex;
+			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-- 
cgit v1.2.2


From 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 3 Nov 2008 18:10:55 -0500
Subject: ext4: wait on all pending commits in ext4_sync_fs()

In ext4_sync_fs, we only wait for a commit to finish if we started it,
but there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which
are delayed due to a long list of other I/O on the backing block
device, this causes the buffer associated with the long symlinks to
not be moved to the inode dirty list in the second phase of
fsync_super.  Then, before they can be dirtied again, kjournald exits,
seeing the UMOUNT flag and the dirty pages are never written to the
backing block device, causing long symlink corruption and exposing new
or previously freed block data to userspace.

To ensure all commits are synced, we flush all journal commits now
when sync_fs'ing ext4.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
---
 fs/ext4/super.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e27acd18b4b0..e4a241c65dbe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2884,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb)
 /*
  * Ext4 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
  */
-
 static void ext4_write_super(struct super_block *sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
@@ -2899,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb)
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
+	int ret = 0;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
-	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
-		if (wait)
-			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
-	}
-	return 0;
+	if (wait)
+		ret = ext4_force_commit(sb);
+	else
+		jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.2


From dc8a0843a435b2c0891e7eaea64faaf1ebec9b11 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Wed, 5 Nov 2008 23:21:16 +0100
Subject: [JFFS2] fix race condition in jffs2_lzo_compress()

deflate_mutex protects the globals lzo_mem and lzo_compress_buf.  However,
jffs2_lzo_compress() unlocks deflate_mutex _before_ it has copied out the
compressed data from lzo_compress_buf.  Correct this by moving the mutex
unlock after the copy.

In addition, document what deflate_mutex actually protects.

Cc: stable@kernel.org
Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Acked-by: Richard Purdie <rpurdie@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr_lzo.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 47b045797e42..90cb60d09787 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -19,7 +19,7 @@
 
 static void *lzo_mem;
 static void *lzo_compress_buf;
-static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(deflate_mutex);	/* for lzo_mem and lzo_compress_buf */
 
 static void free_workspace(void)
 {
@@ -49,18 +49,21 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 
 	mutex_lock(&deflate_mutex);
 	ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem);
-	mutex_unlock(&deflate_mutex);
-
 	if (ret != LZO_E_OK)
-		return -1;
+		goto fail;
 
 	if (compress_size > *dstlen)
-		return -1;
+		goto fail;
 
 	memcpy(cpage_out, lzo_compress_buf, compress_size);
-	*dstlen = compress_size;
+	mutex_unlock(&deflate_mutex);
 
+	*dstlen = compress_size;
 	return 0;
+
+ fail:
+	mutex_unlock(&deflate_mutex);
+	return -1;
 }
 
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-- 
cgit v1.2.2


From 89f97496e81d2112b5e41416fe3020688c443818 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 5 Nov 2008 10:21:06 +0100
Subject: block: fix __blkdev_get() for removable devices

Commit 0762b8bde9729f10f8e6249809660ff2ec3ad735 moved disk_get_part()
in front of recursive get on the whole disk, which caused removable
devices to try disk_get_part() before rescanning after a new media is
inserted, which might fail legit open attempts or give the old
partition.

This patch fixes the problem by moving disk_get_part() after
__blkdev_get() on the whole disk.

This problem was spotted by Borislav Petkov.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88a776fa0ef6..db831efbdbbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -986,7 +986,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
-	struct hd_struct *part = NULL;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1004,24 +1003,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		return ret;
 	}
 
-	ret = -ENXIO;
-
 	lock_kernel();
 
+	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out_unlock_kernel;
-	part = disk_get_part(disk, partno);
-	if (!part)
-		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
-		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
+
+			ret = -ENXIO;
+			bdev->bd_part = disk_get_part(disk, partno);
+			if (!bdev->bd_part)
+				goto out_clear;
+
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, mode);
 				if (ret)
@@ -1049,18 +1049,17 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdev->bd_contains = whole;
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
+			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
-			    !part || !part->nr_sects) {
+			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
 				ret = -ENXIO;
 				goto out_clear;
 			}
-			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
+			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		disk_put_part(part);
 		put_disk(disk);
 		module_put(disk->fops->owner);
-		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
@@ -1080,6 +1079,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return 0;
 
  out_clear:
+	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
@@ -1091,7 +1091,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_kernel:
 	unlock_kernel();
 
-	disk_put_part(part);
 	if (disk)
 		module_put(disk->fops->owner);
 	put_disk(disk);
-- 
cgit v1.2.2


From ac51d83705c2a38c71f39cde99708b14e6212a60 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 16:49:36 -0500
Subject: ext4: calculate journal credits correctly

This fixes a 2.6.27 regression which was introduced in commit a02908f1.

We weren't passing the chunk parameter down to the two subections,
ext4_indirect_trans_blocks() and ext4_ext_index_trans_blocks(), with
the result that massively overestimate the amount of credits needed by
ext4_da_writepages, especially in the non-extents case.  This causes
failures especially on /boot partitions, which tend to be small and
non-extent using since GRUB doesn't handle extents.

This patch fixes the bug reported by Joseph Fannin at:
http://bugzilla.kernel.org/show_bug.cgi?id=11964

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbf6953845b..5a130b56f1cf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4580,9 +4580,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
-		return ext4_indirect_trans_blocks(inode, nrblocks, 0);
-	return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
+
 /*
  * Account for index blocks, block groups bitmaps and block group
  * descriptor blocks if modify datablocks and index blocks
-- 
cgit v1.2.2


From bc9c4068388eea01d3b5da31016879f2341ecec5 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:22 -0800
Subject: autofs4: correct offset mount expire check

When checking a directory tree in autofs_tree_busy() we can incorrectly
decide that the tree isn't busy.  This happens for the case of an active
offset mount as autofs4_follow_mount() follows past the active offset
mount, which has an open file handle used for expires, causing the file
handle not to count toward the busyness check.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/expire.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cde2f8e8935a..4b6fb3f628c0 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -56,12 +56,23 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	mntget(mnt);
 	dget(dentry);
 
-	if (!autofs4_follow_mount(&mnt, &dentry))
+	if (!follow_down(&mnt, &dentry))
 		goto done;
 
-	/* This is an autofs submount, we can't expire it */
-	if (is_autofs4_dentry(dentry))
-		goto done;
+	if (is_autofs4_dentry(dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+		/* This is an autofs submount, we can't expire it */
+		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+			goto done;
+
+		/*
+		 * Otherwise it's an offset mount and we need to check
+		 * if we can umount its mount, if there is one.
+		 */
+		if (!d_mountpoint(dentry))
+			goto done;
+	}
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(mnt)) {
-- 
cgit v1.2.2


From 96b0317906690997c16c7efffbc4c0fafcd6f7f2 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:23 -0800
Subject: autofs4: collect version check return

The function check_dev_ioctl_version() returns an error code upon fail but
it isn't captured and returned in validate_dev_ioctl() as it should be.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 625abf5422e2..33bf8cbfd051 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -128,9 +128,10 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
-	int err = -EINVAL;
+	int err;
 
-	if (check_dev_ioctl_version(cmd, param)) {
+	err = check_dev_ioctl_version(cmd, param);
+	if (err) {
 		AUTOFS_WARN("invalid device control module version "
 		     "supplied for cmd(0x%08x)", cmd);
 		goto out;
-- 
cgit v1.2.2


From c87591b719737b4e91eb1a9fa8fd55a4ff1886d6 Mon Sep 17 00:00:00 2001
From: Arthur Jones <ajones@riverbed.com>
Date: Thu, 6 Nov 2008 12:53:35 -0800
Subject: ext3: wait on all pending commits in ext3_sync_fs

In ext3_sync_fs, we only wait for a commit to finish if we started it, but
there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which are
delayed due to a long list of other I/O on the backing block device, this
causes the buffer associated with the long symlinks to not be moved to the
inode dirty list in the second phase of fsync_super.  Then, before they
can be dirtied again, kjournald exits, seeing the UMOUNT flag and the
dirty pages are never written to the backing block device, causing long
symlink corruption and exposing new or previously freed block data to
userspace.

This can be reproduced with a script created
by Eric Sandeen <sandeen@redhat.com>:

	#!/bin/bash

	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	rm -f /mnt/test2/*
	dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512
	touch
	/mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
	ln -s
	/mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
	/mnt/test2/link
	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	ls /mnt/test2/
	umount /mnt/test2

To ensure all commits are synced, we flush all journal commits now when
sync_fs'ing ext3.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>		[2.6.everything]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e5717a4fae67..5dec6d1356c4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2390,13 +2390,12 @@ static void ext3_write_super (struct super_block * sb)
 
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
-
 	sb->s_dirt = 0;
-	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
-		if (wait)
-			log_wait_commit(EXT3_SB(sb)->s_journal, target);
-	}
+	if (wait)
+		ext3_force_commit(sb);
+	else
+		journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
 	return 0;
 }
 
-- 
cgit v1.2.2


From 990e194e69009028e029b7d25da68c38241ec4f0 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:45 -0800
Subject: fat: move fs/vfat/* and fs/msdos/* to fs/fat

This just moves those files, but change link order from MSDOS, VFAT to
VFAT, MSDOS.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Makefile          |    2 -
 fs/fat/Makefile      |    6 +-
 fs/fat/namei_msdos.c |  702 +++++++++++++++++++++++++++++++++
 fs/fat/namei_vfat.c  | 1055 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/msdos/Makefile    |    7 -
 fs/msdos/namei.c     |  702 ---------------------------------
 fs/vfat/Makefile     |    7 -
 fs/vfat/namei.c      | 1055 --------------------------------------------------
 8 files changed, 1762 insertions(+), 1774 deletions(-)
 create mode 100644 fs/fat/namei_msdos.c
 create mode 100644 fs/fat/namei_vfat.c
 delete mode 100644 fs/msdos/Makefile
 delete mode 100644 fs/msdos/namei.c
 delete mode 100644 fs/vfat/Makefile
 delete mode 100644 fs/vfat/namei.c

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 2168c902d5ca..d9f8afe6f0c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -81,8 +81,6 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
-obj-$(CONFIG_MSDOS_FS)		+= msdos/
-obj-$(CONFIG_VFAT_FS)		+= vfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index bfb5f06cf2c8..e06190322c1c 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -3,5 +3,9 @@
 #
 
 obj-$(CONFIG_FAT_FS) += fat.o
+obj-$(CONFIG_VFAT_FS) += vfat.o
+obj-$(CONFIG_MSDOS_FS) += msdos.o
 
-fat-objs := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+vfat-y := namei_vfat.o
+msdos-y := namei_msdos.o
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
new file mode 100644
index 000000000000..e844b9809d27
--- /dev/null
+++ b/fs/fat/namei_msdos.c
@@ -0,0 +1,702 @@
+/*
+ *  linux/fs/msdos/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
+ *  Rewritten for constant inumbers 1999 by Al Viro
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/buffer_head.h>
+#include <linux/msdos_fs.h>
+#include <linux/smp_lock.h>
+
+/* Characters that are undesirable in an MS-DOS file name */
+static unsigned char bad_chars[] = "*?<>|\"";
+static unsigned char bad_if_strict[] = "+=,; ";
+
+/***** Formats an MS-DOS file name. Rejects invalid names. */
+static int msdos_format_name(const unsigned char *name, int len,
+			     unsigned char *res, struct fat_mount_options *opts)
+	/*
+	 * name is the proposed name, len is its length, res is
+	 * the resulting name, opts->name_check is either (r)elaxed,
+	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
+	 * beginning of name (for hidden files)
+	 */
+{
+	unsigned char *walk;
+	unsigned char c;
+	int space;
+
+	if (name[0] == '.') {	/* dotfile because . and .. already done */
+		if (opts->dotsOK) {
+			/* Get rid of dot - test for it elsewhere */
+			name++;
+			len--;
+		} else
+			return -EINVAL;
+	}
+	/*
+	 * disallow names that _really_ start with a dot
+	 */
+	space = 1;
+	c = 0;
+	for (walk = res; len && walk - res < 8; walk++) {
+		c = *name++;
+		len--;
+		if (opts->name_check != 'r' && strchr(bad_chars, c))
+			return -EINVAL;
+		if (opts->name_check == 's' && strchr(bad_if_strict, c))
+			return -EINVAL;
+		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+			return -EINVAL;
+		if (c < ' ' || c == ':' || c == '\\')
+			return -EINVAL;
+	/*
+	 * 0xE5 is legal as a first character, but we must substitute
+	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
+	 * does this.
+	 * It seems that Microsoft hacked DOS to support non-US
+	 * characters after the 0xE5 character was already in use to
+	 * mark deleted files.
+	 */
+		if ((res == walk) && (c == 0xE5))
+			c = 0x05;
+		if (c == '.')
+			break;
+		space = (c == ' ');
+		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
+	}
+	if (space)
+		return -EINVAL;
+	if (opts->name_check == 's' && len && c != '.') {
+		c = *name++;
+		len--;
+		if (c != '.')
+			return -EINVAL;
+	}
+	while (c != '.' && len--)
+		c = *name++;
+	if (c == '.') {
+		while (walk - res < 8)
+			*walk++ = ' ';
+		while (len > 0 && walk - res < MSDOS_NAME) {
+			c = *name++;
+			len--;
+			if (opts->name_check != 'r' && strchr(bad_chars, c))
+				return -EINVAL;
+			if (opts->name_check == 's' &&
+			    strchr(bad_if_strict, c))
+				return -EINVAL;
+			if (c < ' ' || c == ':' || c == '\\')
+				return -EINVAL;
+			if (c == '.') {
+				if (opts->name_check == 's')
+					return -EINVAL;
+				break;
+			}
+			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+				return -EINVAL;
+			space = c == ' ';
+			if (!opts->nocase && c >= 'a' && c <= 'z')
+				*walk++ = c - 32;
+			else
+				*walk++ = c;
+		}
+		if (space)
+			return -EINVAL;
+		if (opts->name_check == 's' && len)
+			return -EINVAL;
+	}
+	while (walk - res < MSDOS_NAME)
+		*walk++ = ' ';
+
+	return 0;
+}
+
+/***** Locates a directory entry.  Uses unformatted name. */
+static int msdos_find(struct inode *dir, const unsigned char *name, int len,
+		      struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	unsigned char msdos_name[MSDOS_NAME];
+	int err;
+
+	err = msdos_format_name(name, len, msdos_name, &sbi->options);
+	if (err)
+		return -ENOENT;
+
+	err = fat_scan(dir, msdos_name, sinfo);
+	if (!err && sbi->options.dotsOK) {
+		if (name[0] == '.') {
+			if (!(sinfo->de->attr & ATTR_HIDDEN))
+				err = -ENOENT;
+		} else {
+			if (sinfo->de->attr & ATTR_HIDDEN)
+				err = -ENOENT;
+		}
+		if (err)
+			brelse(sinfo->bh);
+	}
+	return err;
+}
+
+/*
+ * Compute the hash for the msdos name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The msdos fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
+	if (!error)
+		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+	return 0;
+}
+
+/*
+ * Compare two msdos names. If either of the names are invalid,
+ * we fall back to doing the standard name comparison.
+ */
+static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
+out:
+	return error;
+
+old_compare:
+	error = 1;
+	if (a->len == b->len)
+		error = memcmp(a->name, b->name, a->len);
+	goto out;
+}
+
+static struct dentry_operations msdos_dentry_operations = {
+	.d_hash		= msdos_hash,
+	.d_compare	= msdos_cmp,
+};
+
+/*
+ * AV. Wrappers for FAT sb operations. Is it wise?
+ */
+
+/***** Get inode using directory and name */
+static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	int res;
+
+	dentry->d_op = &msdos_dentry_operations;
+
+	lock_super(sb);
+	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (res == -ENOENT)
+		goto add;
+	if (res < 0)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		res = PTR_ERR(inode);
+		goto out;
+	}
+add:
+	res = 0;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry)
+		dentry->d_op = &msdos_dentry_operations;
+out:
+	unlock_super(sb);
+	if (!res)
+		return dentry;
+	return ERR_PTR(res);
+}
+
+/***** Creates a directory entry (name is already formatted). */
+static int msdos_add_entry(struct inode *dir, const unsigned char *name,
+			   int is_dir, int is_hid, int cluster,
+			   struct timespec *ts, struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct msdos_dir_entry de;
+	__le16 time, date;
+	int err;
+
+	memcpy(de.name, name, MSDOS_NAME);
+	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	if (is_hid)
+		de.attr |= ATTR_HIDDEN;
+	de.lcase = 0;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de.cdate = de.adate = 0;
+	de.ctime = 0;
+	de.ctime_cs = 0;
+	de.time = time;
+	de.date = date;
+	de.start = cpu_to_le16(cluster);
+	de.starthi = cpu_to_le16(cluster >> 16);
+	de.size = 0;
+
+	err = fat_add_entries(dir, &de, 1, sinfo);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	return 0;
+}
+
+/***** Create a file */
+static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = NULL;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	unsigned char msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* Have to do it due to foo vs. .foo conflicts */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+	return err;
+}
+
+/***** Remove a directory */
+static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = dentry->d_inode;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	/*
+	 * Check whether the directory is not in use, then check
+	 * whether it is empty.
+	 */
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+/***** Make a directory */
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	unsigned char msdos_name[MSDOS_NAME];
+	struct timespec ts;
+	int err, is_hid, cluster;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* foo vs .foo situation */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	fat_flush_inodes(sb, dir, inode);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+/***** Unlink a file */
+static int msdos_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb= inode->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
+			   struct dentry *old_dentry,
+			   struct inode *new_dir, unsigned char *new_name,
+			   struct dentry *new_dentry, int is_hid)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+
+	err = fat_scan(old_dir, old_name, &old_sinfo);
+	if (err) {
+		err = -EIO;
+		goto out;
+	}
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	old_attrs = MSDOS_I(old_inode)->i_attrs;
+	err = fat_scan(new_dir, new_name, &sinfo);
+	if (!err) {
+		if (!new_inode) {
+			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
+			if (sinfo.de != old_sinfo.de) {
+				err = -EINVAL;
+				goto out;
+			}
+			if (is_hid)
+				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+			else
+				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+			if (IS_DIRSYNC(old_dir)) {
+				err = fat_sync_inode(old_inode);
+				if (err) {
+					MSDOS_I(old_inode)->i_attrs = old_attrs;
+					goto out;
+				}
+			} else
+				mark_inode_dirty(old_inode);
+
+			old_dir->i_version++;
+			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+			if (IS_DIRSYNC(old_dir))
+				(void)fat_sync_inode(old_dir);
+			else
+				mark_inode_dirty(old_dir);
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (err)
+			goto out;
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
+				      &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (is_hid)
+		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+	else
+		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	MSDOS_I(old_inode)->i_attrs = old_attrs;
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
+static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(old_dentry->d_name.name,
+				old_dentry->d_name.len, old_msdos_name,
+				&MSDOS_SB(old_dir->i_sb)->options);
+	if (err)
+		goto out;
+	err = msdos_format_name(new_dentry->d_name.name,
+				new_dentry->d_name.len, new_msdos_name,
+				&MSDOS_SB(new_dir->i_sb)->options);
+	if (err)
+		goto out;
+
+	is_hid =
+	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
+
+	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
+			      new_dir, new_msdos_name, new_dentry, is_hid);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, old_dir, new_dir);
+	return err;
+}
+
+static const struct inode_operations msdos_dir_inode_operations = {
+	.create		= msdos_create,
+	.lookup		= msdos_lookup,
+	.unlink		= msdos_unlink,
+	.mkdir		= msdos_mkdir,
+	.rmdir		= msdos_rmdir,
+	.rename		= msdos_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int res;
+
+	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
+	if (res)
+		return res;
+
+	sb->s_flags |= MS_NOATIME;
+	sb->s_root->d_op = &msdos_dentry_operations;
+	return 0;
+}
+
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
+}
+
+static struct file_system_type msdos_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "msdos",
+	.get_sb		= msdos_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_msdos_fs(void)
+{
+	return register_filesystem(&msdos_fs_type);
+}
+
+static void __exit exit_msdos_fs(void)
+{
+	unregister_filesystem(&msdos_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("MS-DOS filesystem support");
+
+module_init(init_msdos_fs)
+module_exit(exit_msdos_fs)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
new file mode 100644
index 000000000000..155c10b4adbd
--- /dev/null
+++ b/fs/fat/namei_vfat.c
@@ -0,0 +1,1055 @@
+/*
+ *  linux/fs/vfat/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Windows95/Windows NT compatible extended MSDOS filesystem
+ *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
+ *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
+ *    what file operation caused you trouble and if you can duplicate
+ *    the problem, send a script that demonstrates it.
+ *
+ *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
+ *
+ *  Support Multibyte characters and cleanup by
+ *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+ */
+
+#include <linux/module.h>
+
+#include <linux/jiffies.h>
+#include <linux/msdos_fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	int ret = 1;
+
+	if (!dentry->d_inode &&
+	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
+		/*
+		 * negative dentry is dropped, in order to make sure
+		 * to use the name which a user desires if this is
+		 * create path.
+		 */
+		ret = 0;
+	else {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
+			ret = 0;
+		spin_unlock(&dentry->d_lock);
+	}
+	return ret;
+}
+
+/* returns the length of a struct qstr, ignoring trailing dots */
+static unsigned int vfat_striptail_len(struct qstr *qstr)
+{
+	unsigned int len = qstr->len;
+
+	while (len && qstr->name[len - 1] == '.')
+		len--;
+	return len;
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
+	return 0;
+}
+
+/*
+ * Compute the hash for the vfat name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The vfat fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	const unsigned char *name;
+	unsigned int len;
+	unsigned long hash;
+
+	name = qstr->name;
+	len = vfat_striptail_len(qstr);
+
+	hash = init_name_hash();
+	while (len--)
+		hash = partial_name_hash(nls_tolower(t, *name++), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(a);
+	blen = vfat_striptail_len(b);
+	if (alen == blen) {
+		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Case sensitive compare of two vfat names.
+ */
+static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	unsigned int alen, blen;
+
+	/* A filename cannot end in '.' or we treat it like it has none */
+	alen = vfat_striptail_len(a);
+	blen = vfat_striptail_len(b);
+	if (alen == blen) {
+		if (strncmp(a->name, b->name, alen) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+static struct dentry_operations vfat_dentry_ops[4] = {
+	{
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	}
+};
+
+/* Characters that are undesirable in an MS-DOS file name */
+
+static inline wchar_t vfat_bad_char(wchar_t w)
+{
+	return (w < 0x0020)
+	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
+	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
+	    || (w == '\\');
+}
+
+static inline wchar_t vfat_replace_char(wchar_t w)
+{
+	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
+	    || (w == '+') || (w == '=');
+}
+
+static wchar_t vfat_skip_char(wchar_t w)
+{
+	return (w == '.') || (w == ' ');
+}
+
+static inline int vfat_is_used_badchars(const wchar_t *s, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (vfat_bad_char(s[i]))
+			return -EINVAL;
+
+	if (s[i - 1] == ' ') /* last character cannot be space */
+		return -EINVAL;
+
+	return 0;
+}
+
+static int vfat_find_form(struct inode *dir, unsigned char *name)
+{
+	struct fat_slot_info sinfo;
+	int err = fat_scan(dir, name, &sinfo);
+	if (err)
+		return -ENOENT;
+	brelse(sinfo.bh);
+	return 0;
+}
+
+/*
+ * 1) Valid characters for the 8.3 format alias are any combination of
+ * letters, uppercase alphabets, digits, any of the
+ * following special characters:
+ *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
+ * In this case Longfilename is not stored in disk.
+ *
+ * WinNT's Extension:
+ * File name and extension name is contain uppercase/lowercase
+ * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
+ *
+ * 2) File name is 8.3 format, but it contain the uppercase and
+ * lowercase char, muliti bytes char, etc. In this case numtail is not
+ * added, but Longfilename is stored.
+ *
+ * 3) When the one except for the above, or the following special
+ * character are contained:
+ *        .   [ ] ; , + =
+ * numtail is added, and Longfilename must be stored in disk .
+ */
+struct shortname_info {
+	unsigned char lower:1,
+		      upper:1,
+		      valid:1;
+};
+#define INIT_SHORTNAME_INFO(x)	do {		\
+	(x)->lower = 1;				\
+	(x)->upper = 1;				\
+	(x)->valid = 1;				\
+} while (0)
+
+static inline int to_shortname_char(struct nls_table *nls,
+				    unsigned char *buf, int buf_size,
+				    wchar_t *src, struct shortname_info *info)
+{
+	int len;
+
+	if (vfat_skip_char(*src)) {
+		info->valid = 0;
+		return 0;
+	}
+	if (vfat_replace_char(*src)) {
+		info->valid = 0;
+		buf[0] = '_';
+		return 1;
+	}
+
+	len = nls->uni2char(*src, buf, buf_size);
+	if (len <= 0) {
+		info->valid = 0;
+		buf[0] = '_';
+		len = 1;
+	} else if (len == 1) {
+		unsigned char prev = buf[0];
+
+		if (buf[0] >= 0x7F) {
+			info->lower = 0;
+			info->upper = 0;
+		}
+
+		buf[0] = nls_toupper(nls, buf[0]);
+		if (isalpha(buf[0])) {
+			if (buf[0] == prev)
+				info->lower = 0;
+			else
+				info->upper = 0;
+		}
+	} else {
+		info->lower = 0;
+		info->upper = 0;
+	}
+
+	return len;
+}
+
+/*
+ * Given a valid longname, create a unique shortname.  Make sure the
+ * shortname does not exist
+ * Returns negative number on error, 0 for a normal
+ * return, and 1 for valid shortname
+ */
+static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
+				 wchar_t *uname, int ulen,
+				 unsigned char *name_res, unsigned char *lcase)
+{
+	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
+	wchar_t *ip, *ext_start, *end, *name_start;
+	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
+	int chl, chi;
+	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
+	int is_shortname;
+	struct shortname_info base_info, ext_info;
+
+	is_shortname = 1;
+	INIT_SHORTNAME_INFO(&base_info);
+	INIT_SHORTNAME_INFO(&ext_info);
+
+	/* Now, we need to create a shortname from the long name */
+	ext_start = end = &uname[ulen];
+	while (--ext_start >= uname) {
+		if (*ext_start == 0x002E) {	/* is `.' */
+			if (ext_start == end - 1) {
+				sz = ulen;
+				ext_start = NULL;
+			}
+			break;
+		}
+	}
+
+	if (ext_start == uname - 1) {
+		sz = ulen;
+		ext_start = NULL;
+	} else if (ext_start) {
+		/*
+		 * Names which start with a dot could be just
+		 * an extension eg. "...test".  In this case Win95
+		 * uses the extension as the name and sets no extension.
+		 */
+		name_start = &uname[0];
+		while (name_start < ext_start) {
+			if (!vfat_skip_char(*name_start))
+				break;
+			name_start++;
+		}
+		if (name_start != ext_start) {
+			sz = ext_start - uname;
+			ext_start++;
+		} else {
+			sz = ulen;
+			ext_start = NULL;
+		}
+	}
+
+	numtail_baselen = 6;
+	numtail2_baselen = 2;
+	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
+		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+					ip, &base_info);
+		if (chl == 0)
+			continue;
+
+		if (baselen < 2 && (baselen + chl) > 2)
+			numtail2_baselen = baselen;
+		if (baselen < 6 && (baselen + chl) > 6)
+			numtail_baselen = baselen;
+		for (chi = 0; chi < chl; chi++) {
+			*p++ = charbuf[chi];
+			baselen++;
+			if (baselen >= 8)
+				break;
+		}
+		if (baselen >= 8) {
+			if ((chi < chl - 1) || (ip + 1) - uname < sz)
+				is_shortname = 0;
+			break;
+		}
+	}
+	if (baselen == 0) {
+		return -EINVAL;
+	}
+
+	extlen = 0;
+	if (ext_start) {
+		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
+			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+						ip, &ext_info);
+			if (chl == 0)
+				continue;
+
+			if ((extlen + chl) > 3) {
+				is_shortname = 0;
+				break;
+			}
+			for (chi = 0; chi < chl; chi++) {
+				*p++ = charbuf[chi];
+				extlen++;
+			}
+			if (extlen >= 3) {
+				if (ip + 1 != end)
+					is_shortname = 0;
+				break;
+			}
+		}
+	}
+	ext[extlen] = '\0';
+	base[baselen] = '\0';
+
+	/* Yes, it can happen. ".\xe5" would do it. */
+	if (base[0] == DELETED_FLAG)
+		base[0] = 0x05;
+
+	/* OK, at this point we know that base is not longer than 8 symbols,
+	 * ext is not longer than 3, base is nonempty, both don't contain
+	 * any bad symbols (lowercase transformed to uppercase).
+	 */
+
+	memset(name_res, ' ', MSDOS_NAME);
+	memcpy(name_res, base, baselen);
+	memcpy(name_res + 8, ext, extlen);
+	*lcase = 0;
+	if (is_shortname && base_info.valid && ext_info.valid) {
+		if (vfat_find_form(dir, name_res) == 0)
+			return -EEXIST;
+
+		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
+			return (base_info.upper && ext_info.upper);
+		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
+			if ((base_info.upper || base_info.lower) &&
+			    (ext_info.upper || ext_info.lower)) {
+				if (!base_info.upper && base_info.lower)
+					*lcase |= CASE_LOWER_BASE;
+				if (!ext_info.upper && ext_info.lower)
+					*lcase |= CASE_LOWER_EXT;
+				return 1;
+			}
+			return 0;
+		} else {
+			BUG();
+		}
+	}
+
+	if (opts->numtail == 0)
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+
+	/*
+	 * Try to find a unique extension.  This used to
+	 * iterate through all possibilities sequentially,
+	 * but that gave extremely bad performance.  Windows
+	 * only tries a few cases before using random
+	 * values for part of the base.
+	 */
+
+	if (baselen > 6) {
+		baselen = numtail_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen] = '~';
+	for (i = 1; i < 10; i++) {
+		name_res[baselen + 1] = i + '0';
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+	}
+
+	i = jiffies & 0xffff;
+	sz = (jiffies >> 16) & 0x7;
+	if (baselen > 2) {
+		baselen = numtail2_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen + 4] = '~';
+	name_res[baselen + 5] = '1' + sz;
+	while (1) {
+		sprintf(buf, "%04X", i);
+		memcpy(&name_res[baselen], buf, 4);
+		if (vfat_find_form(dir, name_res) < 0)
+			break;
+		i -= 11;
+	}
+	return 0;
+}
+
+/* Translate a string, including coded sequences into Unicode */
+static int
+xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
+	     int *longlen, int *outlen, int escape, int utf8,
+	     struct nls_table *nls)
+{
+	const unsigned char *ip;
+	unsigned char nc;
+	unsigned char *op;
+	unsigned int ec;
+	int i, k, fill;
+	int charlen;
+
+	if (utf8) {
+		int name_len = strlen(name);
+
+		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+
+		/*
+		 * We stripped '.'s before and set len appropriately,
+		 * but utf8_mbstowcs doesn't care about len
+		 */
+		*outlen -= (name_len - len);
+
+		if (*outlen > 255)
+			return -ENAMETOOLONG;
+
+		op = &outname[*outlen * sizeof(wchar_t)];
+	} else {
+		if (nls) {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     *outlen += 1)
+			{
+				if (escape && (*ip == ':')) {
+					if (i > len - 5)
+						return -EINVAL;
+					ec = 0;
+					for (k = 1; k < 5; k++) {
+						nc = ip[k];
+						ec <<= 4;
+						if (nc >= '0' && nc <= '9') {
+							ec |= nc - '0';
+							continue;
+						}
+						if (nc >= 'a' && nc <= 'f') {
+							ec |= nc - ('a' - 10);
+							continue;
+						}
+						if (nc >= 'A' && nc <= 'F') {
+							ec |= nc - ('A' - 10);
+							continue;
+						}
+						return -EINVAL;
+					}
+					*op++ = ec & 0xFF;
+					*op++ = ec >> 8;
+					ip += 5;
+					i += 5;
+				} else {
+					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
+						return -EINVAL;
+					ip += charlen;
+					i += charlen;
+					op += 2;
+				}
+			}
+			if (i < len)
+				return -ENAMETOOLONG;
+		} else {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     i++, *outlen += 1)
+			{
+				*op++ = *ip++;
+				*op++ = 0;
+			}
+			if (i < len)
+				return -ENAMETOOLONG;
+		}
+	}
+
+	*longlen = *outlen;
+	if (*outlen % 13) {
+		*op++ = 0;
+		*op++ = 0;
+		*outlen += 1;
+		if (*outlen % 13) {
+			fill = 13 - (*outlen % 13);
+			for (i = 0; i < fill; i++) {
+				*op++ = 0xff;
+				*op++ = 0xff;
+			}
+			*outlen += fill;
+		}
+	}
+
+	return 0;
+}
+
+static int vfat_build_slots(struct inode *dir, const unsigned char *name,
+			    int len, int is_dir, int cluster,
+			    struct timespec *ts,
+			    struct msdos_dir_slot *slots, int *nr_slots)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct fat_mount_options *opts = &sbi->options;
+	struct msdos_dir_slot *ps;
+	struct msdos_dir_entry *de;
+	unsigned char cksum, lcase;
+	unsigned char msdos_name[MSDOS_NAME];
+	wchar_t *uname;
+	__le16 time, date;
+	int err, ulen, usize, i;
+	loff_t offset;
+
+	*nr_slots = 0;
+
+	uname = __getname();
+	if (!uname)
+		return -ENOMEM;
+
+	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
+			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
+	if (err)
+		goto out_free;
+
+	err = vfat_is_used_badchars(uname, ulen);
+	if (err)
+		goto out_free;
+
+	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
+				    msdos_name, &lcase);
+	if (err < 0)
+		goto out_free;
+	else if (err == 1) {
+		de = (struct msdos_dir_entry *)slots;
+		err = 0;
+		goto shortname;
+	}
+
+	/* build the entry of long file name */
+	cksum = fat_checksum(msdos_name);
+
+	*nr_slots = usize / 13;
+	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
+		ps->id = i;
+		ps->attr = ATTR_EXT;
+		ps->reserved = 0;
+		ps->alias_checksum = cksum;
+		ps->start = 0;
+		offset = (i - 1) * 13;
+		fatwchar_to16(ps->name0_4, uname + offset, 5);
+		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
+		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
+	}
+	slots[0].id |= 0x40;
+	de = (struct msdos_dir_entry *)ps;
+
+shortname:
+	/* build the entry of 8.3 alias name */
+	(*nr_slots)++;
+	memcpy(de->name, msdos_name, MSDOS_NAME);
+	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	de->lcase = lcase;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de->time = de->ctime = time;
+	de->date = de->cdate = de->adate = date;
+	de->ctime_cs = 0;
+	de->start = cpu_to_le16(cluster);
+	de->starthi = cpu_to_le16(cluster >> 16);
+	de->size = 0;
+out_free:
+	__putname(uname);
+	return err;
+}
+
+static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
+			  int cluster, struct timespec *ts,
+			  struct fat_slot_info *sinfo)
+{
+	struct msdos_dir_slot *slots;
+	unsigned int len;
+	int err, nr_slots;
+
+	len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+
+	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
+	if (slots == NULL)
+		return -ENOMEM;
+
+	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
+			       slots, &nr_slots);
+	if (err)
+		goto cleanup;
+
+	err = fat_add_entries(dir, slots, nr_slots, sinfo);
+	if (err)
+		goto cleanup;
+
+	/* update timestamp */
+	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+cleanup:
+	kfree(slots);
+	return err;
+}
+
+static int vfat_find(struct inode *dir, struct qstr *qname,
+		     struct fat_slot_info *sinfo)
+{
+	unsigned int len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+	return fat_search_long(dir, qname->name, len, sinfo);
+}
+
+static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	struct dentry *alias;
+	int err, table;
+
+	lock_super(sb);
+	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
+	dentry->d_op = &vfat_dentry_ops[table];
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err) {
+		table++;
+		goto error;
+	}
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		unlock_super(sb);
+		return ERR_CAST(inode);
+	}
+	alias = d_find_alias(inode);
+	if (alias) {
+		if (d_invalidate(alias) == 0)
+			dput(alias);
+		else {
+			iput(inode);
+			unlock_super(sb);
+			return alias;
+		}
+
+	}
+error:
+	unlock_super(sb);
+	dentry->d_op = &vfat_dentry_ops[table];
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry) {
+		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+	}
+	return dentry;
+}
+
+static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	dir->i_version++;
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+static int vfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err, cluster;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	dir->i_version++;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, is_dir, update_dotdot, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+	lock_super(sb);
+	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
+	if (err)
+		goto out;
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
+				     &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+ 			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	unlock_super(sb);
+
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+static const struct inode_operations vfat_dir_inode_operations = {
+	.create		= vfat_create,
+	.lookup		= vfat_lookup,
+	.unlink		= vfat_unlink,
+	.mkdir		= vfat_mkdir,
+	.rmdir		= vfat_rmdir,
+	.rename		= vfat_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int res;
+
+	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
+	if (res)
+		return res;
+
+	if (MSDOS_SB(sb)->options.name_check != 's')
+		sb->s_root->d_op = &vfat_dentry_ops[0];
+	else
+		sb->s_root->d_op = &vfat_dentry_ops[2];
+
+	return 0;
+}
+
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
+}
+
+static struct file_system_type vfat_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "vfat",
+	.get_sb		= vfat_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_vfat_fs(void)
+{
+	return register_filesystem(&vfat_fs_type);
+}
+
+static void __exit exit_vfat_fs(void)
+{
+	unregister_filesystem(&vfat_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("VFAT filesystem support");
+MODULE_AUTHOR("Gordon Chaffee");
+
+module_init(init_vfat_fs)
+module_exit(exit_vfat_fs)
diff --git a/fs/msdos/Makefile b/fs/msdos/Makefile
deleted file mode 100644
index ea67646fcb95..000000000000
--- a/fs/msdos/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the Linux msdos filesystem routines.
-#
-
-obj-$(CONFIG_MSDOS_FS) += msdos.o
-
-msdos-y := namei.o
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
deleted file mode 100644
index e844b9809d27..000000000000
--- a/fs/msdos/namei.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  linux/fs/msdos/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
- *  Rewritten for constant inumbers 1999 by Al Viro
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
-
-/* Characters that are undesirable in an MS-DOS file name */
-static unsigned char bad_chars[] = "*?<>|\"";
-static unsigned char bad_if_strict[] = "+=,; ";
-
-/***** Formats an MS-DOS file name. Rejects invalid names. */
-static int msdos_format_name(const unsigned char *name, int len,
-			     unsigned char *res, struct fat_mount_options *opts)
-	/*
-	 * name is the proposed name, len is its length, res is
-	 * the resulting name, opts->name_check is either (r)elaxed,
-	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
-	 * beginning of name (for hidden files)
-	 */
-{
-	unsigned char *walk;
-	unsigned char c;
-	int space;
-
-	if (name[0] == '.') {	/* dotfile because . and .. already done */
-		if (opts->dotsOK) {
-			/* Get rid of dot - test for it elsewhere */
-			name++;
-			len--;
-		} else
-			return -EINVAL;
-	}
-	/*
-	 * disallow names that _really_ start with a dot
-	 */
-	space = 1;
-	c = 0;
-	for (walk = res; len && walk - res < 8; walk++) {
-		c = *name++;
-		len--;
-		if (opts->name_check != 'r' && strchr(bad_chars, c))
-			return -EINVAL;
-		if (opts->name_check == 's' && strchr(bad_if_strict, c))
-			return -EINVAL;
-		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-			return -EINVAL;
-		if (c < ' ' || c == ':' || c == '\\')
-			return -EINVAL;
-	/*
-	 * 0xE5 is legal as a first character, but we must substitute
-	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
-	 * does this.
-	 * It seems that Microsoft hacked DOS to support non-US
-	 * characters after the 0xE5 character was already in use to
-	 * mark deleted files.
-	 */
-		if ((res == walk) && (c == 0xE5))
-			c = 0x05;
-		if (c == '.')
-			break;
-		space = (c == ' ');
-		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
-	}
-	if (space)
-		return -EINVAL;
-	if (opts->name_check == 's' && len && c != '.') {
-		c = *name++;
-		len--;
-		if (c != '.')
-			return -EINVAL;
-	}
-	while (c != '.' && len--)
-		c = *name++;
-	if (c == '.') {
-		while (walk - res < 8)
-			*walk++ = ' ';
-		while (len > 0 && walk - res < MSDOS_NAME) {
-			c = *name++;
-			len--;
-			if (opts->name_check != 'r' && strchr(bad_chars, c))
-				return -EINVAL;
-			if (opts->name_check == 's' &&
-			    strchr(bad_if_strict, c))
-				return -EINVAL;
-			if (c < ' ' || c == ':' || c == '\\')
-				return -EINVAL;
-			if (c == '.') {
-				if (opts->name_check == 's')
-					return -EINVAL;
-				break;
-			}
-			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-				return -EINVAL;
-			space = c == ' ';
-			if (!opts->nocase && c >= 'a' && c <= 'z')
-				*walk++ = c - 32;
-			else
-				*walk++ = c;
-		}
-		if (space)
-			return -EINVAL;
-		if (opts->name_check == 's' && len)
-			return -EINVAL;
-	}
-	while (walk - res < MSDOS_NAME)
-		*walk++ = ' ';
-
-	return 0;
-}
-
-/***** Locates a directory entry.  Uses unformatted name. */
-static int msdos_find(struct inode *dir, const unsigned char *name, int len,
-		      struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	unsigned char msdos_name[MSDOS_NAME];
-	int err;
-
-	err = msdos_format_name(name, len, msdos_name, &sbi->options);
-	if (err)
-		return -ENOENT;
-
-	err = fat_scan(dir, msdos_name, sinfo);
-	if (!err && sbi->options.dotsOK) {
-		if (name[0] == '.') {
-			if (!(sinfo->de->attr & ATTR_HIDDEN))
-				err = -ENOENT;
-		} else {
-			if (sinfo->de->attr & ATTR_HIDDEN)
-				err = -ENOENT;
-		}
-		if (err)
-			brelse(sinfo->bh);
-	}
-	return err;
-}
-
-/*
- * Compute the hash for the msdos name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The msdos fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
-	if (!error)
-		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
-	return 0;
-}
-
-/*
- * Compare two msdos names. If either of the names are invalid,
- * we fall back to doing the standard name comparison.
- */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
-out:
-	return error;
-
-old_compare:
-	error = 1;
-	if (a->len == b->len)
-		error = memcmp(a->name, b->name, a->len);
-	goto out;
-}
-
-static struct dentry_operations msdos_dentry_operations = {
-	.d_hash		= msdos_hash,
-	.d_compare	= msdos_cmp,
-};
-
-/*
- * AV. Wrappers for FAT sb operations. Is it wise?
- */
-
-/***** Get inode using directory and name */
-static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
-
-	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
-	}
-add:
-	res = 0;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry)
-		dentry->d_op = &msdos_dentry_operations;
-out:
-	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
-}
-
-/***** Creates a directory entry (name is already formatted). */
-static int msdos_add_entry(struct inode *dir, const unsigned char *name,
-			   int is_dir, int is_hid, int cluster,
-			   struct timespec *ts, struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct msdos_dir_entry de;
-	__le16 time, date;
-	int err;
-
-	memcpy(de.name, name, MSDOS_NAME);
-	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	if (is_hid)
-		de.attr |= ATTR_HIDDEN;
-	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de.cdate = de.adate = 0;
-	de.ctime = 0;
-	de.ctime_cs = 0;
-	de.time = time;
-	de.date = date;
-	de.start = cpu_to_le16(cluster);
-	de.starthi = cpu_to_le16(cluster >> 16);
-	de.size = 0;
-
-	err = fat_add_entries(dir, &de, 1, sinfo);
-	if (err)
-		return err;
-
-	dir->i_ctime = dir->i_mtime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-
-	return 0;
-}
-
-/***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
-			struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = NULL;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	unsigned char msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* Have to do it due to foo vs. .foo conflicts */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-	return err;
-}
-
-/***** Remove a directory */
-static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = dentry->d_inode;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	/*
-	 * Check whether the directory is not in use, then check
-	 * whether it is empty.
-	 */
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-/***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode;
-	unsigned char msdos_name[MSDOS_NAME];
-	struct timespec ts;
-	int err, is_hid, cluster;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* foo vs .foo situation */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	fat_flush_inodes(sb, dir, inode);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-/***** Unlink a file */
-static int msdos_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb= inode->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
-			   struct dentry *old_dentry,
-			   struct inode *new_dir, unsigned char *new_name,
-			   struct dentry *new_dentry, int is_hid)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-
-	err = fat_scan(old_dir, old_name, &old_sinfo);
-	if (err) {
-		err = -EIO;
-		goto out;
-	}
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	old_attrs = MSDOS_I(old_inode)->i_attrs;
-	err = fat_scan(new_dir, new_name, &sinfo);
-	if (!err) {
-		if (!new_inode) {
-			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
-			if (sinfo.de != old_sinfo.de) {
-				err = -EINVAL;
-				goto out;
-			}
-			if (is_hid)
-				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-			else
-				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-			if (IS_DIRSYNC(old_dir)) {
-				err = fat_sync_inode(old_inode);
-				if (err) {
-					MSDOS_I(old_inode)->i_attrs = old_attrs;
-					goto out;
-				}
-			} else
-				mark_inode_dirty(old_inode);
-
-			old_dir->i_version++;
-			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-			if (IS_DIRSYNC(old_dir))
-				(void)fat_sync_inode(old_dir);
-			else
-				mark_inode_dirty(old_dir);
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (err)
-			goto out;
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
-				      &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (is_hid)
-		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-	else
-		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
-			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	MSDOS_I(old_inode)->i_attrs = old_attrs;
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
-static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct super_block *sb = old_dir->i_sb;
-	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(old_dentry->d_name.name,
-				old_dentry->d_name.len, old_msdos_name,
-				&MSDOS_SB(old_dir->i_sb)->options);
-	if (err)
-		goto out;
-	err = msdos_format_name(new_dentry->d_name.name,
-				new_dentry->d_name.len, new_msdos_name,
-				&MSDOS_SB(new_dir->i_sb)->options);
-	if (err)
-		goto out;
-
-	is_hid =
-	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
-
-	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
-			      new_dir, new_msdos_name, new_dentry, is_hid);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, old_dir, new_dir);
-	return err;
-}
-
-static const struct inode_operations msdos_dir_inode_operations = {
-	.create		= msdos_create,
-	.lookup		= msdos_lookup,
-	.unlink		= msdos_unlink,
-	.mkdir		= msdos_mkdir,
-	.rmdir		= msdos_rmdir,
-	.rename		= msdos_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-	if (res)
-		return res;
-
-	sb->s_flags |= MS_NOATIME;
-	sb->s_root->d_op = &msdos_dentry_operations;
-	return 0;
-}
-
-static int msdos_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
-			   mnt);
-}
-
-static struct file_system_type msdos_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "msdos",
-	.get_sb		= msdos_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_msdos_fs(void)
-{
-	return register_filesystem(&msdos_fs_type);
-}
-
-static void __exit exit_msdos_fs(void)
-{
-	unregister_filesystem(&msdos_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Werner Almesberger");
-MODULE_DESCRIPTION("MS-DOS filesystem support");
-
-module_init(init_msdos_fs)
-module_exit(exit_msdos_fs)
diff --git a/fs/vfat/Makefile b/fs/vfat/Makefile
deleted file mode 100644
index 40f2798a4f08..000000000000
--- a/fs/vfat/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux vfat-filesystem routines.
-#
-
-obj-$(CONFIG_VFAT_FS) += vfat.o
-
-vfat-y := namei.o
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
deleted file mode 100644
index 155c10b4adbd..000000000000
--- a/fs/vfat/namei.c
+++ /dev/null
@@ -1,1055 +0,0 @@
-/*
- *  linux/fs/vfat/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *
- *  Windows95/Windows NT compatible extended MSDOS filesystem
- *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
- *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
- *    what file operation caused you trouble and if you can duplicate
- *    the problem, send a script that demonstrates it.
- *
- *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
- *
- *  Support Multibyte characters and cleanup by
- *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
- */
-
-#include <linux/module.h>
-
-#include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
-#include <linux/ctype.h>
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/namei.h>
-
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
-		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
-	return ret;
-}
-
-/* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
-{
-	unsigned int len = qstr->len;
-
-	while (len && qstr->name[len - 1] == '.')
-		len--;
-	return len;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
-	return 0;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	const unsigned char *name;
-	unsigned int len;
-	unsigned long hash;
-
-	name = qstr->name;
-	len = vfat_striptail_len(qstr);
-
-	hash = init_name_hash();
-	while (len--)
-		hash = partial_name_hash(nls_tolower(t, *name++), hash);
-	qstr->hash = end_name_hash(hash);
-
-	return 0;
-}
-
-/*
- * Case insensitive compare of two vfat names.
- */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-/*
- * Case sensitive compare of two vfat names.
- */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (strncmp(a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
-};
-
-/* Characters that are undesirable in an MS-DOS file name */
-
-static inline wchar_t vfat_bad_char(wchar_t w)
-{
-	return (w < 0x0020)
-	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
-	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
-	    || (w == '\\');
-}
-
-static inline wchar_t vfat_replace_char(wchar_t w)
-{
-	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
-	    || (w == '+') || (w == '=');
-}
-
-static wchar_t vfat_skip_char(wchar_t w)
-{
-	return (w == '.') || (w == ' ');
-}
-
-static inline int vfat_is_used_badchars(const wchar_t *s, int len)
-{
-	int i;
-
-	for (i = 0; i < len; i++)
-		if (vfat_bad_char(s[i]))
-			return -EINVAL;
-
-	if (s[i - 1] == ' ') /* last character cannot be space */
-		return -EINVAL;
-
-	return 0;
-}
-
-static int vfat_find_form(struct inode *dir, unsigned char *name)
-{
-	struct fat_slot_info sinfo;
-	int err = fat_scan(dir, name, &sinfo);
-	if (err)
-		return -ENOENT;
-	brelse(sinfo.bh);
-	return 0;
-}
-
-/*
- * 1) Valid characters for the 8.3 format alias are any combination of
- * letters, uppercase alphabets, digits, any of the
- * following special characters:
- *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
- * In this case Longfilename is not stored in disk.
- *
- * WinNT's Extension:
- * File name and extension name is contain uppercase/lowercase
- * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
- *
- * 2) File name is 8.3 format, but it contain the uppercase and
- * lowercase char, muliti bytes char, etc. In this case numtail is not
- * added, but Longfilename is stored.
- *
- * 3) When the one except for the above, or the following special
- * character are contained:
- *        .   [ ] ; , + =
- * numtail is added, and Longfilename must be stored in disk .
- */
-struct shortname_info {
-	unsigned char lower:1,
-		      upper:1,
-		      valid:1;
-};
-#define INIT_SHORTNAME_INFO(x)	do {		\
-	(x)->lower = 1;				\
-	(x)->upper = 1;				\
-	(x)->valid = 1;				\
-} while (0)
-
-static inline int to_shortname_char(struct nls_table *nls,
-				    unsigned char *buf, int buf_size,
-				    wchar_t *src, struct shortname_info *info)
-{
-	int len;
-
-	if (vfat_skip_char(*src)) {
-		info->valid = 0;
-		return 0;
-	}
-	if (vfat_replace_char(*src)) {
-		info->valid = 0;
-		buf[0] = '_';
-		return 1;
-	}
-
-	len = nls->uni2char(*src, buf, buf_size);
-	if (len <= 0) {
-		info->valid = 0;
-		buf[0] = '_';
-		len = 1;
-	} else if (len == 1) {
-		unsigned char prev = buf[0];
-
-		if (buf[0] >= 0x7F) {
-			info->lower = 0;
-			info->upper = 0;
-		}
-
-		buf[0] = nls_toupper(nls, buf[0]);
-		if (isalpha(buf[0])) {
-			if (buf[0] == prev)
-				info->lower = 0;
-			else
-				info->upper = 0;
-		}
-	} else {
-		info->lower = 0;
-		info->upper = 0;
-	}
-
-	return len;
-}
-
-/*
- * Given a valid longname, create a unique shortname.  Make sure the
- * shortname does not exist
- * Returns negative number on error, 0 for a normal
- * return, and 1 for valid shortname
- */
-static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
-				 wchar_t *uname, int ulen,
-				 unsigned char *name_res, unsigned char *lcase)
-{
-	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
-	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
-	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
-	int chl, chi;
-	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
-	int is_shortname;
-	struct shortname_info base_info, ext_info;
-
-	is_shortname = 1;
-	INIT_SHORTNAME_INFO(&base_info);
-	INIT_SHORTNAME_INFO(&ext_info);
-
-	/* Now, we need to create a shortname from the long name */
-	ext_start = end = &uname[ulen];
-	while (--ext_start >= uname) {
-		if (*ext_start == 0x002E) {	/* is `.' */
-			if (ext_start == end - 1) {
-				sz = ulen;
-				ext_start = NULL;
-			}
-			break;
-		}
-	}
-
-	if (ext_start == uname - 1) {
-		sz = ulen;
-		ext_start = NULL;
-	} else if (ext_start) {
-		/*
-		 * Names which start with a dot could be just
-		 * an extension eg. "...test".  In this case Win95
-		 * uses the extension as the name and sets no extension.
-		 */
-		name_start = &uname[0];
-		while (name_start < ext_start) {
-			if (!vfat_skip_char(*name_start))
-				break;
-			name_start++;
-		}
-		if (name_start != ext_start) {
-			sz = ext_start - uname;
-			ext_start++;
-		} else {
-			sz = ulen;
-			ext_start = NULL;
-		}
-	}
-
-	numtail_baselen = 6;
-	numtail2_baselen = 2;
-	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
-		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-					ip, &base_info);
-		if (chl == 0)
-			continue;
-
-		if (baselen < 2 && (baselen + chl) > 2)
-			numtail2_baselen = baselen;
-		if (baselen < 6 && (baselen + chl) > 6)
-			numtail_baselen = baselen;
-		for (chi = 0; chi < chl; chi++) {
-			*p++ = charbuf[chi];
-			baselen++;
-			if (baselen >= 8)
-				break;
-		}
-		if (baselen >= 8) {
-			if ((chi < chl - 1) || (ip + 1) - uname < sz)
-				is_shortname = 0;
-			break;
-		}
-	}
-	if (baselen == 0) {
-		return -EINVAL;
-	}
-
-	extlen = 0;
-	if (ext_start) {
-		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
-			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-						ip, &ext_info);
-			if (chl == 0)
-				continue;
-
-			if ((extlen + chl) > 3) {
-				is_shortname = 0;
-				break;
-			}
-			for (chi = 0; chi < chl; chi++) {
-				*p++ = charbuf[chi];
-				extlen++;
-			}
-			if (extlen >= 3) {
-				if (ip + 1 != end)
-					is_shortname = 0;
-				break;
-			}
-		}
-	}
-	ext[extlen] = '\0';
-	base[baselen] = '\0';
-
-	/* Yes, it can happen. ".\xe5" would do it. */
-	if (base[0] == DELETED_FLAG)
-		base[0] = 0x05;
-
-	/* OK, at this point we know that base is not longer than 8 symbols,
-	 * ext is not longer than 3, base is nonempty, both don't contain
-	 * any bad symbols (lowercase transformed to uppercase).
-	 */
-
-	memset(name_res, ' ', MSDOS_NAME);
-	memcpy(name_res, base, baselen);
-	memcpy(name_res + 8, ext, extlen);
-	*lcase = 0;
-	if (is_shortname && base_info.valid && ext_info.valid) {
-		if (vfat_find_form(dir, name_res) == 0)
-			return -EEXIST;
-
-		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
-			return (base_info.upper && ext_info.upper);
-		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
-			if ((base_info.upper || base_info.lower) &&
-			    (ext_info.upper || ext_info.lower)) {
-				if (!base_info.upper && base_info.lower)
-					*lcase |= CASE_LOWER_BASE;
-				if (!ext_info.upper && ext_info.lower)
-					*lcase |= CASE_LOWER_EXT;
-				return 1;
-			}
-			return 0;
-		} else {
-			BUG();
-		}
-	}
-
-	if (opts->numtail == 0)
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-
-	/*
-	 * Try to find a unique extension.  This used to
-	 * iterate through all possibilities sequentially,
-	 * but that gave extremely bad performance.  Windows
-	 * only tries a few cases before using random
-	 * values for part of the base.
-	 */
-
-	if (baselen > 6) {
-		baselen = numtail_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen] = '~';
-	for (i = 1; i < 10; i++) {
-		name_res[baselen + 1] = i + '0';
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-	}
-
-	i = jiffies & 0xffff;
-	sz = (jiffies >> 16) & 0x7;
-	if (baselen > 2) {
-		baselen = numtail2_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen + 4] = '~';
-	name_res[baselen + 5] = '1' + sz;
-	while (1) {
-		sprintf(buf, "%04X", i);
-		memcpy(&name_res[baselen], buf, 4);
-		if (vfat_find_form(dir, name_res) < 0)
-			break;
-		i -= 11;
-	}
-	return 0;
-}
-
-/* Translate a string, including coded sequences into Unicode */
-static int
-xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
-	     int *longlen, int *outlen, int escape, int utf8,
-	     struct nls_table *nls)
-{
-	const unsigned char *ip;
-	unsigned char nc;
-	unsigned char *op;
-	unsigned int ec;
-	int i, k, fill;
-	int charlen;
-
-	if (utf8) {
-		int name_len = strlen(name);
-
-		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
-
-		/*
-		 * We stripped '.'s before and set len appropriately,
-		 * but utf8_mbstowcs doesn't care about len
-		 */
-		*outlen -= (name_len - len);
-
-		if (*outlen > 255)
-			return -ENAMETOOLONG;
-
-		op = &outname[*outlen * sizeof(wchar_t)];
-	} else {
-		if (nls) {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     *outlen += 1)
-			{
-				if (escape && (*ip == ':')) {
-					if (i > len - 5)
-						return -EINVAL;
-					ec = 0;
-					for (k = 1; k < 5; k++) {
-						nc = ip[k];
-						ec <<= 4;
-						if (nc >= '0' && nc <= '9') {
-							ec |= nc - '0';
-							continue;
-						}
-						if (nc >= 'a' && nc <= 'f') {
-							ec |= nc - ('a' - 10);
-							continue;
-						}
-						if (nc >= 'A' && nc <= 'F') {
-							ec |= nc - ('A' - 10);
-							continue;
-						}
-						return -EINVAL;
-					}
-					*op++ = ec & 0xFF;
-					*op++ = ec >> 8;
-					ip += 5;
-					i += 5;
-				} else {
-					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
-						return -EINVAL;
-					ip += charlen;
-					i += charlen;
-					op += 2;
-				}
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		} else {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     i++, *outlen += 1)
-			{
-				*op++ = *ip++;
-				*op++ = 0;
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		}
-	}
-
-	*longlen = *outlen;
-	if (*outlen % 13) {
-		*op++ = 0;
-		*op++ = 0;
-		*outlen += 1;
-		if (*outlen % 13) {
-			fill = 13 - (*outlen % 13);
-			for (i = 0; i < fill; i++) {
-				*op++ = 0xff;
-				*op++ = 0xff;
-			}
-			*outlen += fill;
-		}
-	}
-
-	return 0;
-}
-
-static int vfat_build_slots(struct inode *dir, const unsigned char *name,
-			    int len, int is_dir, int cluster,
-			    struct timespec *ts,
-			    struct msdos_dir_slot *slots, int *nr_slots)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct fat_mount_options *opts = &sbi->options;
-	struct msdos_dir_slot *ps;
-	struct msdos_dir_entry *de;
-	unsigned char cksum, lcase;
-	unsigned char msdos_name[MSDOS_NAME];
-	wchar_t *uname;
-	__le16 time, date;
-	int err, ulen, usize, i;
-	loff_t offset;
-
-	*nr_slots = 0;
-
-	uname = __getname();
-	if (!uname)
-		return -ENOMEM;
-
-	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
-			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
-	if (err)
-		goto out_free;
-
-	err = vfat_is_used_badchars(uname, ulen);
-	if (err)
-		goto out_free;
-
-	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
-				    msdos_name, &lcase);
-	if (err < 0)
-		goto out_free;
-	else if (err == 1) {
-		de = (struct msdos_dir_entry *)slots;
-		err = 0;
-		goto shortname;
-	}
-
-	/* build the entry of long file name */
-	cksum = fat_checksum(msdos_name);
-
-	*nr_slots = usize / 13;
-	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
-		ps->id = i;
-		ps->attr = ATTR_EXT;
-		ps->reserved = 0;
-		ps->alias_checksum = cksum;
-		ps->start = 0;
-		offset = (i - 1) * 13;
-		fatwchar_to16(ps->name0_4, uname + offset, 5);
-		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
-		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
-	}
-	slots[0].id |= 0x40;
-	de = (struct msdos_dir_entry *)ps;
-
-shortname:
-	/* build the entry of 8.3 alias name */
-	(*nr_slots)++;
-	memcpy(de->name, msdos_name, MSDOS_NAME);
-	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de->time = de->ctime = time;
-	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
-	de->start = cpu_to_le16(cluster);
-	de->starthi = cpu_to_le16(cluster >> 16);
-	de->size = 0;
-out_free:
-	__putname(uname);
-	return err;
-}
-
-static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
-			  int cluster, struct timespec *ts,
-			  struct fat_slot_info *sinfo)
-{
-	struct msdos_dir_slot *slots;
-	unsigned int len;
-	int err, nr_slots;
-
-	len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-
-	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
-	if (slots == NULL)
-		return -ENOMEM;
-
-	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
-			       slots, &nr_slots);
-	if (err)
-		goto cleanup;
-
-	err = fat_add_entries(dir, slots, nr_slots, sinfo);
-	if (err)
-		goto cleanup;
-
-	/* update timestamp */
-	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-cleanup:
-	kfree(slots);
-	return err;
-}
-
-static int vfat_find(struct inode *dir, struct qstr *qname,
-		     struct fat_slot_info *sinfo)
-{
-	unsigned int len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-	return fat_search_long(dir, qname->name, len, sinfo);
-}
-
-static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
-				  struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	struct dentry *alias;
-	int err, table;
-
-	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err) {
-		table++;
-		goto error;
-	}
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
-	}
-	alias = d_find_alias(inode);
-	if (alias) {
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
-
-	}
-error:
-	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
-		dentry->d_time = dentry->d_parent->d_inode->i_version;
-	}
-	return dentry;
-}
-
-static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-		       struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	dir->i_version++;
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err, cluster;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	dir->i_version++;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, is_dir, update_dotdot, corrupt = 0;
-	struct super_block *sb = old_dir->i_sb;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-	lock_super(sb);
-	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
-	if (err)
-		goto out;
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
-				     &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
- 			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	unlock_super(sb);
-
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-static const struct inode_operations vfat_dir_inode_operations = {
-	.create		= vfat_create,
-	.lookup		= vfat_lookup,
-	.unlink		= vfat_unlink,
-	.mkdir		= vfat_mkdir,
-	.rmdir		= vfat_rmdir,
-	.rename		= vfat_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-	if (res)
-		return res;
-
-	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
-	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
-
-	return 0;
-}
-
-static int vfat_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name,
-		       void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
-			   mnt);
-}
-
-static struct file_system_type vfat_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "vfat",
-	.get_sb		= vfat_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_vfat_fs(void)
-{
-	return register_filesystem(&vfat_fs_type);
-}
-
-static void __exit exit_vfat_fs(void)
-{
-	unregister_filesystem(&vfat_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("VFAT filesystem support");
-MODULE_AUTHOR("Gordon Chaffee");
-
-module_init(init_vfat_fs)
-module_exit(exit_vfat_fs)
-- 
cgit v1.2.2


From 9e975dae2970d22557662761c8505ce9fd165684 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:46 -0800
Subject: fat: split include/msdos_fs.h

This splits __KERNEL__ stuff in include/msdos_fs.h into fs/fat/fat.h.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c       |   2 +-
 fs/fat/dir.c         |   2 +-
 fs/fat/fat.h         | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/fatent.c      |   1 +
 fs/fat/file.c        |   2 +-
 fs/fat/inode.c       |   2 +-
 fs/fat/misc.c        |   2 +-
 fs/fat/namei_msdos.c |   2 +-
 fs/fat/namei_vfat.c  |   3 +-
 9 files changed, 282 insertions(+), 8 deletions(-)
 create mode 100644 fs/fat/fat.h

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3222f51c41cf..589edde9053c 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /* this must be > 0. */
 #define FAT_MAX_CACHE	8
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bae1c3292522..08b23ad25f1c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,11 +16,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include "fat.h"
 
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
new file mode 100644
index 000000000000..51f1c42ca5e3
--- /dev/null
+++ b/fs/fat/fat.h
@@ -0,0 +1,274 @@
+#ifndef _FAT_H
+#define _FAT_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/msdos_fs.h>
+
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
+
+struct fat_mount_options {
+	uid_t fs_uid;
+	gid_t fs_gid;
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short codepage;  /* Codepage for shortname conversions */
+	char *iocharset;          /* Charset used for filename input/display */
+	unsigned short shortname; /* flags for shortname display/create rule */
+	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+	unsigned short allow_utime;/* permission for setting the [am]time */
+	unsigned quiet:1,         /* set = fake successful chmods and chowns */
+		 showexec:1,      /* set = only set x bit for com/exe/bat */
+		 sys_immutable:1, /* set = system files are immutable */
+		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
+		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
+		 utf8:1,	  /* Use of UTF-8 character set (Default) */
+		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
+		 flush:1,	  /* write things quickly */
+		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
+		 usefree:1,	  /* Use free_clusters for FAT32 */
+		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+};
+
+#define FAT_HASH_BITS	8
+#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
+#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
+
+/*
+ * MS-DOS file system in-core superblock data
+ */
+struct msdos_sb_info {
+	unsigned short sec_per_clus; /* sectors/cluster */
+	unsigned short cluster_bits; /* log2(cluster_size) */
+	unsigned int cluster_size;   /* cluster size */
+	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+	unsigned short fat_start;
+	unsigned long fat_length;    /* FAT start & length (sec.) */
+	unsigned long dir_start;
+	unsigned short dir_entries;  /* root dir start & entries */
+	unsigned long data_start;    /* first data sector */
+	unsigned long max_cluster;   /* maximum cluster number */
+	unsigned long root_cluster;  /* first cluster of the root directory */
+	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+	struct mutex fat_lock;
+	unsigned int prev_free;      /* previously allocated cluster number */
+	unsigned int free_clusters;  /* -1 if undefined */
+	unsigned int free_clus_valid; /* is free_clusters valid? */
+	struct fat_mount_options options;
+	struct nls_table *nls_disk;  /* Codepage used on disk */
+	struct nls_table *nls_io;    /* Charset used for input and display */
+	const void *dir_ops;		     /* Opaque; default directory operations */
+	int dir_per_block;	     /* dir entries per block */
+	int dir_per_block_bits;	     /* log2(dir_per_block) */
+
+	int fatent_shift;
+	struct fatent_operations *fatent_ops;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+};
+
+#define FAT_CACHE_VALID	0	/* special case for valid cache */
+
+/*
+ * MS-DOS file system inode data in memory
+ */
+struct msdos_inode_info {
+	spinlock_t cache_lru_lock;
+	struct list_head cache_lru;
+	int nr_caches;
+	/* for avoiding the race between fat_free() and fat_get_cluster() */
+	unsigned int cache_valid_id;
+
+	loff_t mmu_private;
+	int i_start;		/* first cluster or 0 */
+	int i_logstart;		/* logical first cluster */
+	int i_attrs;		/* unused attribute bits */
+	loff_t i_pos;		/* on-disk position of directory entry or 0 */
+	struct hlist_node i_fat_hash;	/* hash by i_location */
+	struct inode vfs_inode;
+};
+
+struct fat_slot_info {
+	loff_t i_pos;		/* on-disk position of directory entry */
+	loff_t slot_off;	/* offset for slot or de start */
+	int nr_slots;		/* number of slots + 1(de) in filename */
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+};
+
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{
+	return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u8 fat_attr(struct inode *inode)
+{
+	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
+		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
+		MSDOS_I(inode)->i_attrs;
+}
+
+static inline unsigned char fat_checksum(const __u8 *name)
+{
+	unsigned char s = name[0];
+	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
+	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
+	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
+	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
+	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
+	return s;
+}
+
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{
+	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
+		+ sbi->data_start;
+}
+
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		*dst++ = src[0] | (src[1] << 8);
+		src += 2;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		dst[0] = *src & 0x00FF;
+		dst[1] = (*src & 0xFF00) >> 8;
+		dst += 2;
+		src++;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+			   int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
+
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+			   int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+		    struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+				struct msdos_dir_entry **de, loff_t *i_pos);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+			   struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+
+/* fat/fatent.c */
+struct fat_entry {
+	int entry;
+	union {
+		u8 *ent12_p[2];
+		__le16 *ent16_p;
+		__le32 *ent32_p;
+	} u;
+	int nr_bhs;
+	struct buffer_head *bhs[2];
+};
+
+static inline void fatent_init(struct fat_entry *fatent)
+{
+	fatent->nr_bhs = 0;
+	fatent->entry = 0;
+	fatent->u.ent32_p = NULL;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{
+	fatent->entry = entry;
+	fatent->u.ent32_p = NULL;
+}
+
+static inline void fatent_brelse(struct fat_entry *fatent)
+{
+	int i;
+	fatent->u.ent32_p = NULL;
+	for (i = 0; i < fatent->nr_bhs; i++)
+		brelse(fatent->bhs[i]);
+	fatent->nr_bhs = 0;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+			int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+			 int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+			      int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+
+/* fat/file.c */
+extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+			     unsigned int cmd, unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern void fat_truncate(struct inode *inode);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+
+/* fat/inode.c */
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+			const struct inode_operations *fs_dir_inode_ops, int isvfat);
+
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+		            struct inode *i2);
+/* fat/misc.c */
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
+extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
+			      int tz_utc);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
+#endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index fb98b3d847ed..5b5f49061b7c 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
 #include <linux/blkdev.h>
+#include "fat.h"
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index ddde37025ca6..b21973f266a1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,13 +10,13 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include "fat.h"
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b2eec1283bf..3921de2013a4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
-#include <linux/msdos_fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
@@ -28,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/log2.h>
 #include <asm/unaligned.h>
+#include "fat.h"
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
 /* if user don't select VFAT, this is undefined. */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 79fb98ad36d4..91ad9be18ff9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -8,8 +8,8 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /*
  * fat_fs_panic reports a severe file system problem and sets the file system
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e844b9809d27..c0a4d5cd99b2 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 155c10b4adbd..facf3bf0211a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,14 +16,13 @@
  */
 
 #include <linux/module.h>
-
 #include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include "fat.h"
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-- 
cgit v1.2.2


From 7decd1cb0305b97243f283fa7f4baf5fe613edeb Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: Fix and cleanup timestamp conversion

This cleans date_dos2unix()/fat_date_unix2dos() up. New code should be
much more readable.

And this fixes those old functions. Those doesn't handle 2100
correctly. 2100 isn't leap year, but old one handles it as leap year.
Also, with this, centi sec is handled and is fixed.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c         |   6 ++-
 fs/fat/fat.h         |   7 +--
 fs/fat/inode.c       |  34 ++++--------
 fs/fat/misc.c        | 148 +++++++++++++++++++++++++++++++++++++--------------
 fs/fat/namei_msdos.c |   2 +-
 fs/fat/namei_vfat.c  |   5 +-
 6 files changed, 130 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 08b23ad25f1c..a601c6d45bc0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1089,6 +1089,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	struct msdos_dir_entry *de;
 	sector_t blknr;
 	__le16 date, time;
+	u8 time_cs;
 	int err, cluster;
 
 	err = fat_alloc_clusters(dir, &cluster, 1);
@@ -1102,7 +1103,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 		goto error_free;
 	}
 
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
 	/* filling the new directory slots ("." and ".." entries) */
@@ -1112,13 +1113,14 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	de[0].lcase = de[1].lcase = 0;
 	de[0].time = de[1].time = time;
 	de[0].date = de[1].date = date;
-	de[0].ctime_cs = de[1].ctime_cs = 0;
 	if (sbi->options.isvfat) {
 		/* extra timestamps */
 		de[0].ctime = de[1].ctime = time;
+		de[0].ctime_cs = de[1].ctime_cs = time_cs;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
 	} else {
 		de[0].ctime = de[1].ctime = 0;
+		de[0].ctime_cs = de[1].ctime_cs = 0;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
 	}
 	de[0].start = cpu_to_le16(cluster);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 51f1c42ca5e3..a2a570f81719 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -263,9 +263,10 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
-extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
-			      int tz_utc);
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 __time, __le16 __date, u8 time_cs);
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 *time, __le16 *date, u8 *time_cs);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
 int fat_cache_init(void);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3921de2013a4..079d9d5e0d36 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -381,22 +381,12 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
-	inode->i_mtime.tv_sec =
-		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
-			      sbi->options.tz_utc);
-	inode->i_mtime.tv_nsec = 0;
+
+	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
 	if (sbi->options.isvfat) {
-		int secs = de->ctime_cs / 100;
-		int csecs = de->ctime_cs % 100;
-		inode->i_ctime.tv_sec  =
-			date_dos2unix(le16_to_cpu(de->ctime),
-				      le16_to_cpu(de->cdate),
-				      sbi->options.tz_utc) + secs;
-		inode->i_ctime.tv_nsec = csecs * 10000000;
-		inode->i_atime.tv_sec =
-			date_dos2unix(0, le16_to_cpu(de->adate),
-				      sbi->options.tz_utc);
-		inode->i_atime.tv_nsec = 0;
+		fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
+				  de->cdate, de->ctime_cs);
+		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
 
@@ -591,16 +581,14 @@ retry:
 	raw_entry->attr = fat_attr(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
-	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
-			  &raw_entry->date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+			  &raw_entry->date, NULL);
 	if (sbi->options.isvfat) {
 		__le16 atime;
-		fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
-				  &raw_entry->cdate, sbi->options.tz_utc);
-		fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
-				  &raw_entry->adate, sbi->options.tz_utc);
-		raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
-			inode->i_ctime.tv_nsec / 10000000;
+		fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
+				  &raw_entry->cdate, &raw_entry->ctime_cs);
+		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
+				  &raw_entry->adate, NULL);
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 91ad9be18ff9..a191e79e66a9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -135,65 +135,131 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 
 extern struct timezone sys_tz;
 
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits :     value
+ * date:  0 -  4: day	(1 -  31)
+ * date:  5 -  8: month	(1 -  12)
+ * date:  9 - 15: year	(0 - 127) from 1980
+ * time:  0 -  4: sec	(0 -  29) 2sec counts
+ * time:  5 - 10: min	(0 -  59)
+ * time: 11 - 15: hour	(0 -  23)
+ */
+#define SECS_PER_MIN	60
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+#define UNIX_SECS_1980	315532800L
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108	4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define YEAR_2100	120
+#define IS_LEAP_YEAR(y)	(!((y) & 3) && (y) != YEAR_2100)
+
 /* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] = {
-   /* Jan  Feb  Mar  Apr   May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
-	0,  31,  59,  90,  120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0
+static time_t days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
 
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 __time, __le16 __date, u8 time_cs)
 {
-	int month, year, secs;
+	u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
+	time_t second, day, leap_day, month, year;
 
-	/*
-	 * first subtract and mask after that... Otherwise, if
-	 * date == 0, bad things happen
-	 */
-	month = ((date >> 5) - 1) & 15;
-	year = date >> 9;
-	secs = (time & 31)*2+60*((time >> 5) & 63)+(time >> 11)*3600+86400*
-	    ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
-	    month < 2 ? 1 : 0)+3653);
-			/* days since 1.1.70 plus 80's leap day */
-	if (!tz_utc)
-		secs += sys_tz.tz_minuteswest*60;
-	return secs;
+	year  = date >> 9;
+	month = max(1, (date >> 5) & 0xf);
+	day   = max(1, date & 0x1f) - 1;
+
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (IS_LEAP_YEAR(year) && month > 2)
+		leap_day++;
+
+	second =  (time & 0x1f) << 1;
+	second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
+	second += (time >> 11) * SECS_PER_HOUR;
+	second += (year * 365 + leap_day
+		   + days_in_year[month] + day
+		   + DAYS_DELTA) * SECS_PER_DAY;
+
+	if (!sbi->options.tz_utc)
+		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	if (time_cs) {
+		ts->tv_sec = second + (time_cs / 100);
+		ts->tv_nsec = (time_cs % 100) * 10000000;
+	} else {
+		ts->tv_sec = second;
+		ts->tv_nsec = 0;
+	}
 }
 
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
+/* Convert linear UNIX date to a FAT time/date pair. */
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 *time, __le16 *date, u8 *time_cs)
 {
-	int day, year, nl_day, month;
+	time_t second = ts->tv_sec;
+	time_t day, leap_day, month, year;
 
-	if (!tz_utc)
-		unix_date -= sys_tz.tz_minuteswest*60;
+	if (!sbi->options.tz_utc)
+		second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
 
 	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-	if (unix_date < 315532800)
-		unix_date = 315532800;
-
-	*time = cpu_to_le16((unix_date % 60)/2+(((unix_date/60) % 60) << 5)+
-	    (((unix_date/3600) % 24) << 11));
-	day = unix_date/86400-3652;
-	year = day/365;
-	if ((year+3)/4+365*year > day)
+	if (second < UNIX_SECS_1980) {
+		*time = 0;
+		*date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
+		if (time_cs)
+			*time_cs = 0;
+		return;
+	}
+#if BITS_PER_LONG == 64
+	if (second >= UNIX_SECS_2108) {
+		*time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
+		*date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
+		if (time_cs)
+			*time_cs = 199;
+		return;
+	}
+#endif
+
+	day = second / SECS_PER_DAY - DAYS_DELTA;
+	year = day / 365;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (year * 365 + leap_day > day)
 		year--;
-	day -= (year+3)/4+365*year;
-	if (day == 59 && !(year & 3)) {
-		nl_day = day;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	day -= year * 365 + leap_day;
+
+	if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
 		month = 2;
 	} else {
-		nl_day = (year & 3) || day <= 59 ? day : day-1;
-		for (month = 0; month < 12; month++) {
-			if (day_n[month] > nl_day)
+		if (IS_LEAP_YEAR(year) && day > days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (days_in_year[month + 1] > day)
 				break;
 		}
 	}
-	*date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9));
-}
+	day -= days_in_year[month];
 
-EXPORT_SYMBOL_GPL(fat_date_unix2dos);
+	*time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
+			    | ((second / SECS_PER_MIN) % 60) << 5
+			    | (second % SECS_PER_MIN) >> 1);
+	*date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
+	if (time_cs)
+		*time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
+}
+EXPORT_SYMBOL_GPL(fat_time_unix2fat);
 
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c0a4d5cd99b2..e92e8158ebaf 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -247,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 	if (is_hid)
 		de.attr |= ATTR_HIDDEN;
 	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, NULL);
 	de.cdate = de.adate = 0;
 	de.ctime = 0;
 	de.ctime_cs = 0;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index facf3bf0211a..1536bc3ca0f0 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -568,6 +568,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
 	unsigned char msdos_name[MSDOS_NAME];
 	wchar_t *uname;
 	__le16 time, date;
+	u8 time_cs;
 	int err, ulen, usize, i;
 	loff_t offset;
 
@@ -620,10 +621,10 @@ shortname:
 	memcpy(de->name, msdos_name, MSDOS_NAME);
 	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
 	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 	de->time = de->ctime = time;
 	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
+	de->ctime_cs = time_cs;
 	de->start = cpu_to_le16(cluster);
 	de->starthi = cpu_to_le16(cluster >> 16);
 	de->size = 0;
-- 
cgit v1.2.2


From 53472bc8f810d2fb507593ea03703670506a668d Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: use generic_file_llseek() for directory

Since fat_dir_ioctl() was already fixed (i.e. called under ->i_mutex),
and __fat_readdir() doesn't take BKL anymore. So, BKL for ->llseek()
is pointless, and we have to use generic_file_llseek().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index a601c6d45bc0..931dd28b5289 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -832,6 +832,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 #endif /* CONFIG_COMPAT */
 
 const struct file_operations fat_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
 	.ioctl		= fat_dir_ioctl,
-- 
cgit v1.2.2


From 52e9d9f4b32a3bec91feb76c84e37b7dcffe5040 Mon Sep 17 00:00:00 2001
From: Darren Jenkins <darrenrjenkins@gmail.com>
Date: Thu, 6 Nov 2008 12:53:48 -0800
Subject: fat: cleanup fat_parse_long() error handling

Coverity CID 2332 & 2333 RESOURCE_LEAK

In fat_search_long() if fat_parse_long() returns a -ve value we return
without first freeing unicode.  This patch free's them on this error path.

The above was false positive on current tree, but this change is more
clean, so apply as cleanup.

[hirofumi@mail.parknet.co.jp: fix coding style]
Signed-off-by: Darren Jenkins <darrenrjenkins@gmail.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 931dd28b5289..140fc39e2307 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -373,9 +373,10 @@ parse_record:
 		if (de->attr == ATTR_EXT) {
 			int status = fat_parse_long(inode, &cpos, &bh, &de,
 						    &unicode, &nr_slots);
-			if (status < 0)
-				return status;
-			else if (status == PARSE_INVALID)
+			if (status < 0) {
+				err = status;
+				goto end_of_dir;
+			} else if (status == PARSE_INVALID)
 				continue;
 			else if (status == PARSE_NOT_LONGNAME)
 				goto parse_record;
-- 
cgit v1.2.2


From d3dfa8228f87ab9960ab8b4718013d68e3c25a43 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: improve fat_hash()

fat_hash() is using the algorithm known as bad. Instead of it, this
uses hash_32(). The following is the summary of test.

old hash:
	hash func (1000 times): 33489 cycles
	total inodes in hash table: 70926
	largest bucket contains: 696
	smallest bucket contains: 54

new hash:
	hash func (1000 times): 33129 cycles
	total inodes in hash table: 70926
	largest bucket contains: 315
	smallest bucket contains: 236

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   |  1 -
 fs/fat/inode.c | 18 +++++++-----------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a2a570f81719..2b8e94c3eef4 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -43,7 +43,6 @@ struct fat_mount_options {
 
 #define FAT_HASH_BITS	8
 #define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
-#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
 
 /*
  * MS-DOS file system in-core superblock data
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 079d9d5e0d36..f58cd48d98b8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
 #include <linux/uio.h>
 #include <linux/writeback.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
@@ -247,25 +248,21 @@ static void fat_hash_init(struct super_block *sb)
 		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
 }
 
-static inline unsigned long fat_hash(struct super_block *sb, loff_t i_pos)
+static inline unsigned long fat_hash(loff_t i_pos)
 {
-	unsigned long tmp = (unsigned long)i_pos | (unsigned long) sb;
-	tmp = tmp + (tmp >> FAT_HASH_BITS) + (tmp >> FAT_HASH_BITS * 2);
-	return tmp & FAT_HASH_MASK;
+	return hash_32(i_pos, FAT_HASH_BITS);
 }
 
 void fat_attach(struct inode *inode, loff_t i_pos)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 
 	spin_lock(&sbi->inode_hash_lock);
 	MSDOS_I(inode)->i_pos = i_pos;
-	hlist_add_head(&MSDOS_I(inode)->i_fat_hash,
-			sbi->inode_hashtable + fat_hash(sb, i_pos));
+	hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_attach);
 
 void fat_detach(struct inode *inode)
@@ -276,13 +273,12 @@ void fat_detach(struct inode *inode)
 	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_detach);
 
 struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct hlist_head *head = sbi->inode_hashtable + fat_hash(sb, i_pos);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 	struct hlist_node *_p;
 	struct msdos_inode_info *i;
 	struct inode *inode = NULL;
-- 
cgit v1.2.2


From 5e35dd4651002207948f10c576fc7d9bad448815 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: Fix fat_ent_update_ptr() for FAT12

This fixes the missing update for bhs/nr_bhs in case the caller
accessed from block boundary to first block of boundary.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fatent.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 5b5f49061b7c..13513992da3c 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -317,10 +317,20 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
 	/* Is this fatent's blocks including this entry? */
 	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
 		return 0;
-	/* Does this entry need the next block? */
-	if (sbi->fat_bits == 12 && (offset + 1) >= sb->s_blocksize) {
-		if (fatent->nr_bhs != 2 || bhs[1]->b_blocknr != (blocknr + 1))
-			return 0;
+	if (sbi->fat_bits == 12) {
+		if ((offset + 1) < sb->s_blocksize) {
+			/* This entry is on bhs[0]. */
+			if (fatent->nr_bhs == 2) {
+				brelse(bhs[1]);
+				fatent->nr_bhs = 1;
+			}
+		} else {
+			/* This entry needs the next block. */
+			if (fatent->nr_bhs != 2)
+				return 0;
+			if (bhs[1]->b_blocknr != (blocknr + 1))
+				return 0;
+		}
 	}
 	ops->ent_set_ptr(fatent, offset);
 	return 1;
-- 
cgit v1.2.2


From a993b542bb4cd3e5a64863b7ef892bbebec2239b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:50 -0800
Subject: fat: use fat_detach() in fat_clear_inode()

Use fat_detach() instead of opencoding it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index f58cd48d98b8..8e1b75c63c7f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -429,13 +429,8 @@ static void fat_delete_inode(struct inode *inode)
 
 static void fat_clear_inode(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-
-	spin_lock(&sbi->inode_hash_lock);
 	fat_cache_inval_inode(inode);
-	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
-	spin_unlock(&sbi->inode_hash_lock);
+	fat_detach(inode);
 }
 
 static void fat_write_super(struct super_block *sb)
-- 
cgit v1.2.2


From 068f5ae05c51d2cee6b31cb3da06775dd83bd348 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: vfat: Fix vfat_find() error path in vfat_lookup()

Current vfat_lookup() creates negetive dentry blindly if vfat_find()
returned a error. It's wrong. If the error isn't -ENOENT, just return
error.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 1536bc3ca0f0..419deabfb9be 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -683,7 +683,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct dentry *alias;
 	int err, table;
 
@@ -693,14 +693,18 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
-		table++;
+		if (err == -ENOENT) {
+			table++;
+			inode = NULL;
+			goto out;
+		}
 		goto error;
 	}
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
+		err = PTR_ERR(inode);
+		goto error;
 	}
 	alias = d_find_alias(inode);
 	if (alias) {
@@ -713,7 +717,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		}
 
 	}
-error:
+out:
 	unlock_super(sb);
 	dentry->d_op = &vfat_dentry_ops[table];
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
@@ -723,6 +727,10 @@ error:
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
+
+error:
+	unlock_super(sb);
+	return ERR_PTR(err);
 }
 
 static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-- 
cgit v1.2.2


From 1b52467243c7167b3a267ddbcbb14d550f28eb4a Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: fat: Fix/Cleanup dcache handling for vfat

- Add comments for handling dcache of vfat.

- Separate case-sensitive case and case-insensitive to
  vfat_revalidate() and vfat_ci_revalidate().

  vfat_revalidate() doesn't need to drop case-insensitive negative
  dentry on creation path.

- Current code is missing to set ->d_revalidate to the negative dentry
  created by unlink/etc..

  This sets ->d_revalidate always, and returns 1 for positive
  dentry. Now, we don't need to change ->d_op dynamically anymore,
  so this just uses sb->s_root->d_op to set ->d_op.

- d_find_alias() may return DCACHE_DISCONNECTED dentry. It's not
  the interesting dentry there. This checks it.

- Add missing LOOKUP_PARENT check. We don't need to drop the valid
  negative dentry for (LOOKUP_CREATE | LOOKUP_PARENT) lookup.

- For consistent filename on creation path, this drops negative dentry
  if we can't see intent.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 124 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 419deabfb9be..d585398f9f6b 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -24,27 +24,67 @@
 #include <linux/namei.h>
 #include "fat.h"
 
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+/*
+ * If new entry was created in the parent, it could create the 8.3
+ * alias (the shortname of logname).  So, the parent may have the
+ * negative-dentry which matches the created 8.3 alias.
+ *
+ * If it happened, the negative dentry isn't actually negative
+ * anymore.  So, drop it.
+ */
+static int vfat_revalidate_shortname(struct dentry *dentry)
 {
 	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_time != dentry->d_parent->d_inode->i_version)
 		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
+	spin_unlock(&dentry->d_lock);
 	return ret;
 }
 
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	/* This is not negative dentry. Always valid. */
+	if (dentry->d_inode)
+		return 1;
+	return vfat_revalidate_shortname(dentry);
+}
+
+static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+{
+	/*
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
+	 */
+	if (dentry->d_inode)
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!nd)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & LOOKUP_CREATE)
+			return 0;
+	}
+
+	return vfat_revalidate_shortname(dentry);
+}
+
 /* returns the length of a struct qstr, ignoring trailing dots */
 static unsigned int vfat_striptail_len(struct qstr *qstr)
 {
@@ -126,25 +166,16 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	return 1;
 }
 
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
+static struct dentry_operations vfat_ci_dentry_ops = {
+	.d_revalidate	= vfat_revalidate_ci,
+	.d_hash		= vfat_hashi,
+	.d_compare	= vfat_cmpi,
+};
+
+static struct dentry_operations vfat_dentry_ops = {
+	.d_revalidate	= vfat_revalidate,
+	.d_hash		= vfat_hash,
+	.d_compare	= vfat_cmp,
 };
 
 /* Characters that are undesirable in an MS-DOS file name */
@@ -685,29 +716,35 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	struct fat_slot_info sinfo;
 	struct inode *inode;
 	struct dentry *alias;
-	int err, table;
+	int err;
 
 	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
 		if (err == -ENOENT) {
-			table++;
 			inode = NULL;
 			goto out;
 		}
 		goto error;
 	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto error;
 	}
+
 	alias = d_find_alias(inode);
-	if (alias) {
+	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+		/*
+		 * This inode has non DCACHE_DISCONNECTED dentry. This
+		 * means, the user did ->lookup() by an another name
+		 * (longname vs 8.3 alias of it) in past.
+		 *
+		 * Switch to new one for reason of locality if possible.
+		 */
 		if (d_invalidate(alias) == 0)
 			dput(alias);
 		else {
@@ -715,15 +752,14 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 			unlock_super(sb);
 			return alias;
 		}
-
 	}
 out:
 	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
+	dentry->d_op = sb->s_root->d_op;
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_op = sb->s_root->d_op;
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
@@ -1022,9 +1058,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 		return res;
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
+		sb->s_root->d_op = &vfat_ci_dentry_ops;
 	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
+		sb->s_root->d_op = &vfat_dentry_ops;
 
 	return 0;
 }
-- 
cgit v1.2.2


From 1c13a243a461dd5b089d29e5d57f260c990e462c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:52 -0800
Subject: fat: Kill d_invalidate() in vfat_lookup()

d_invalidate() for positive dentry doesn't work in some cases
(vfsmount, nfsd, and maybe others). shrink_dcache_parent() by
d_invalidate() is pointless for vfat usage at all.

So, this kills it, and intead of it uses d_move().

To save old behavior, this returns alias simply for directory (don't
change pwd, etc..). the directory lookup shouldn't be important for
performance.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index d585398f9f6b..bf326d4356a3 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -745,13 +745,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
+		BUG_ON(d_unhashed(alias));
+		if (!S_ISDIR(inode->i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		unlock_super(sb);
+		return alias;
 	}
 out:
 	unlock_super(sb);
-- 
cgit v1.2.2


From 45cfbe354785a5bc9a38354754d6f7322f598001 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:53 -0800
Subject: fat: Cleanup msdos_lookup()

Use same style with vfat_lookup().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_msdos.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e92e8158ebaf..7ba03a4acbe0 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -203,33 +203,37 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
+	struct inode *inode;
+	int err;
 
 	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
+
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
+		err = PTR_ERR(inode);
+		goto error;
 	}
-add:
-	res = 0;
+out:
+	unlock_super(sb);
+	dentry->d_op = &msdos_dentry_operations;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry)
 		dentry->d_op = &msdos_dentry_operations;
-out:
+	return dentry;
+
+error:
 	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
+	return ERR_PTR(err);
 }
 
 /***** Creates a directory entry (name is already formatted). */
-- 
cgit v1.2.2


From 9c0aa1b87bf541affef519eb4879ce7c5a5941ae Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Cleanup FAT attribute stuff

This adds three helpers:

fat_make_attrs() - makes FAT attributes from inode.
fat_make_mode()  - makes mode_t from FAT attributes.
fat_save_attrs() - saves FAT attributes to inode.

Then this replaces: MSDOS_MKMODE() by fat_make_mode(), fat_attr() by
fat_make_attrs(), ->i_attrs = attr & ATTR_UNUSED by fat_save_attrs().
And for root inode, those is used with ATTR_DIR instead of bogus
ATTR_NONE.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   | 20 +++++++++++++++++++-
 fs/fat/file.c  | 32 ++++++++++++--------------------
 fs/fat/inode.c | 19 +++++++++----------
 3 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 2b8e94c3eef4..3b4753a024e3 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,14 +117,32 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
+				   u8 attrs, mode_t mode)
+{
+	if (attrs & ATTR_RO)
+		mode &= ~S_IWUGO;
+
+	if (attrs & ATTR_DIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
 /* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
+static inline u8 fat_make_attrs(struct inode *inode)
 {
 	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
 		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
 		MSDOS_I(inode)->i_attrs;
 }
 
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{
+	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+}
+
 static inline unsigned char fat_checksum(const __u8 *name)
 {
 	unsigned char s = name[0];
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b21973f266a1..f5a7e907a8fa 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,13 +27,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr;
-
-		if (inode->i_ino == MSDOS_ROOT_INO)
-			attr = ATTR_DIR;
-		else
-			attr = fat_attr(inode);
-
+		u32 attr = fat_make_attrs(inode);
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
@@ -62,20 +56,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		/* Merge in ATTR_VOLUME and ATTR_DIR */
 		attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
 			(is_dir ? ATTR_DIR : 0);
-		oldattr = fat_attr(inode);
+		oldattr = fat_make_attrs(inode);
 
 		/* Equivalent to a chmod() */
 		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
 		ia.ia_ctime = current_fs_time(inode->i_sb);
-		if (is_dir) {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				S_IRWXUGO & ~sbi->options.fs_dmask)
-				| S_IFDIR;
-		} else {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				(S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO))
-				& ~sbi->options.fs_fmask)
-				| S_IFREG;
+		if (is_dir)
+			ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+		else {
+			ia.ia_mode = fat_make_mode(sbi, attr,
+				S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
 		}
 
 		/* The root directory has no attributes */
@@ -115,7 +105,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 				inode->i_flags &= S_IMMUTABLE;
 		}
 
-		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
+		fat_save_attrs(inode, attr);
 		mark_inode_dirty(inode);
 up:
 		mnt_drop_write(filp->f_path.mnt);
@@ -274,7 +264,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 
 	/*
 	 * Note, the basic check is already done by a caller of
-	 * (attr->ia_mode & ~MSDOS_VALID_MODE)
+	 * (attr->ia_mode & ~FAT_VALID_MODE)
 	 */
 
 	if (S_ISREG(inode->i_mode))
@@ -314,6 +304,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 }
 
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -356,7 +348,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	    ((attr->ia_valid & ATTR_GID) &&
 	     (attr->ia_gid != sbi->options.fs_gid)) ||
 	    ((attr->ia_valid & ATTR_MODE) &&
-	     (attr->ia_mode & ~MSDOS_VALID_MODE)))
+	     (attr->ia_mode & ~FAT_VALID_MODE)))
 		error = -EPERM;
 
 	if (error) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8e1b75c63c7f..7aaa21cf019a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -337,8 +337,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 
 	if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
 		inode->i_generation &= ~1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-			S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+		inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
 		inode->i_op = sbi->dir_ops;
 		inode->i_fop = &fat_dir_operations;
 
@@ -355,10 +354,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		inode->i_nlink = fat_subdirs(inode);
 	} else { /* not a directory */
 		inode->i_generation |= 1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-		    ((sbi->options.showexec && !is_exec(de->name + 8))
-			? S_IRUGO|S_IWUGO : S_IRWXUGO)
-		    & ~sbi->options.fs_fmask) | S_IFREG;
+		inode->i_mode = fat_make_mode(sbi, de->attr,
+			((sbi->options.showexec && !is_exec(de->name + 8))
+			 ? S_IRUGO|S_IWUGO : S_IRWXUGO));
 		MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
 		if (sbi->fat_bits == 32)
 			MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16);
@@ -374,7 +372,8 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		if (sbi->options.sys_immutable)
 			inode->i_flags |= S_IMMUTABLE;
 	}
-	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
+	fat_save_attrs(inode, de->attr);
+
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
@@ -569,7 +568,7 @@ retry:
 		raw_entry->size = 0;
 	else
 		raw_entry->size = cpu_to_le32(inode->i_size);
-	raw_entry->attr = fat_attr(inode);
+	raw_entry->attr = fat_make_attrs(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
 	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
@@ -1105,7 +1104,7 @@ static int fat_read_root(struct inode *inode)
 	inode->i_gid = sbi->options.fs_gid;
 	inode->i_version++;
 	inode->i_generation = 0;
-	inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
 	inode->i_op = sbi->dir_ops;
 	inode->i_fop = &fat_dir_operations;
 	if (sbi->fat_bits == 32) {
@@ -1122,7 +1121,7 @@ static int fat_read_root(struct inode *inode)
 	MSDOS_I(inode)->i_logstart = 0;
 	MSDOS_I(inode)->mmu_private = inode->i_size;
 
-	MSDOS_I(inode)->i_attrs = ATTR_NONE;
+	fat_save_attrs(inode, ATTR_DIR);
 	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
 	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
 	inode->i_nlink = fat_subdirs(inode)+2;
-- 
cgit v1.2.2


From 9183482f5d4a2de00f66641b974e7f351d41b675 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Fix ATTR_RO in the case of (~umask & S_WUGO) == 0

If inode->i_mode doesn't have S_WUGO, current code assumes it means
ATTR_RO.  However, if (~[ufd]mask & S_WUGO) == 0, inode->i_mode can't
hold S_WUGO. Therefore the updated directory entry will always have
ATTR_RO.

This adds fat_mode_can_hold_ro() to check it. And if inode->i_mode
can't hold, uses -i_attrs to hold ATTR_RO instead.

With this, we don't set ATTR_RO unless users change it via ioctl() if
(~[ufd]mask & S_WUGO) == 0.

And on FAT_IOCTL_GET_ATTRIBUTES path, this adds ->i_mutex to it for
not returning the partially updated attributes by FAT_IOCTL_SET_ATTRIBUTES
to userland.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h  | 33 +++++++++++++++++++++++++++++----
 fs/fat/file.c |  7 ++++++-
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 3b4753a024e3..313b645b8126 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,6 +117,25 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ */
+static inline int fat_mode_can_hold_ro(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	mode_t mask;
+
+	if (S_ISDIR(inode->i_mode))
+		mask = ~sbi->options.fs_dmask;
+	else
+		mask = ~sbi->options.fs_fmask;
+
+	if (!(mask & S_IWUGO))
+		return 0;
+	return 1;
+}
+
 /* Convert attribute bits and a mask to the UNIX mode. */
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
@@ -133,14 +152,20 @@ static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 /* Return the FAT attribute byte for this inode */
 static inline u8 fat_make_attrs(struct inode *inode)
 {
-	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
-		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
-		MSDOS_I(inode)->i_attrs;
+	u8 attrs = MSDOS_I(inode)->i_attrs;
+	if (S_ISDIR(inode->i_mode))
+		attrs |= ATTR_DIR;
+	if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		attrs |= ATTR_RO;
+	return attrs;
 }
 
 static inline void fat_save_attrs(struct inode *inode, u8 attrs)
 {
-	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	if (fat_mode_can_hold_ro(inode))
+		MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	else
+		MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO);
 }
 
 static inline unsigned char fat_checksum(const __u8 *name)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f5a7e907a8fa..81b15c623803 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,7 +27,12 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr = fat_make_attrs(inode);
+		u32 attr;
+
+		mutex_lock(&inode->i_mutex);
+		attr = fat_make_attrs(inode);
+		mutex_unlock(&inode->i_mutex);
+
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
-- 
cgit v1.2.2


From dfc209c0064efef5590f608056a48b61a5cac09c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:55 -0800
Subject: fat: Fix ATTR_RO for directory

FAT has the ATTR_RO (read-only) attribute. But on Windows, the ATTR_RO
of the directory will be just ignored actually, and is used by only
applications as flag. E.g. it's setted for the customized folder by
Explorer.

http://msdn2.microsoft.com/en-us/library/aa969337.aspx

This adds "rodir" option. If user specified it, ATTR_RO is used as
read-only flag even if it's the directory. Otherwise, inode->i_mode
is not used to hold ATTR_RO (i.e. fat_mode_can_save_ro() returns 0).

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   | 14 ++++++++++----
 fs/fat/file.c  | 16 ++++++++++++----
 fs/fat/inode.c | 17 +++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 313b645b8126..e9dce5d8e7a7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -38,7 +38,8 @@ struct fat_mount_options {
 		 flush:1,	  /* write things quickly */
 		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
 		 usefree:1,	  /* Use free_clusters for FAT32 */
-		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+		 tz_utc:1,	  /* Filesystem timestamps are in UTC */
+		 rodir:1;	  /* allow ATTR_RO for directory */
 };
 
 #define FAT_HASH_BITS	8
@@ -120,15 +121,20 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 /*
  * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
  * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
  */
 static inline int fat_mode_can_hold_ro(struct inode *inode)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	mode_t mask;
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		if (!sbi->options.rodir)
+			return 0;
 		mask = ~sbi->options.fs_dmask;
-	else
+	} else
 		mask = ~sbi->options.fs_fmask;
 
 	if (!(mask & S_IWUGO))
@@ -140,7 +146,7 @@ static inline int fat_mode_can_hold_ro(struct inode *inode)
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
 {
-	if (attrs & ATTR_RO)
+	if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
 		mode &= ~S_IWUGO;
 
 	if (attrs & ATTR_DIR)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 81b15c623803..f06a4e525ece 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -282,11 +282,18 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 	/*
 	 * Of the r and x bits, all (subject to umask) must be present. Of the
 	 * w bits, either all (subject to umask) or none must be present.
+	 *
+	 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
 	 */
 	if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
 		return -EPERM;
-	if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
-		return -EPERM;
+	if (fat_mode_can_hold_ro(inode)) {
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
 
 	*mode_ptr &= S_IFMT | perm;
 
@@ -316,8 +323,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
-	int error = 0;
 	unsigned int ia_valid;
+	int error;
 
 	/*
 	 * Expand the file. Since inode_setattr() updates ->i_size
@@ -371,7 +378,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~ATTR_MODE;
 	}
 
-	error = inode_setattr(inode, attr);
+	if (attr->ia_valid)
+		error = inode_setattr(inode, attr);
 out:
 	return error;
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7aaa21cf019a..0da04e6d1e34 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -797,8 +797,10 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 			seq_puts(m, ",uni_xlate");
 		if (!opts->numtail)
 			seq_puts(m, ",nonumtail");
+		if (opts->rodir)
+			seq_puts(m, ",rodir");
 	}
-	if (sbi->options.flush)
+	if (opts->flush)
 		seq_puts(m, ",flush");
 	if (opts->tz_utc)
 		seq_puts(m, ",tz=UTC");
@@ -814,7 +816,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -886,6 +888,7 @@ static const match_table_t vfat_tokens = {
 	{Opt_nonumtail_yes, "nonumtail=yes"},
 	{Opt_nonumtail_yes, "nonumtail=true"},
 	{Opt_nonumtail_yes, "nonumtail"},
+	{Opt_rodir, "rodir"},
 	{Opt_err, NULL}
 };
 
@@ -905,10 +908,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
-	if (is_vfat)
+	if (is_vfat) {
 		opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
-	else
+		opts->rodir = 0;
+	} else {
 		opts->shortname = 0;
+		opts->rodir = 1;
+	}
 	opts->name_check = 'n';
 	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
 	opts->utf8 = opts->unicode_xlate = 0;
@@ -1059,6 +1065,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_nonumtail_yes:		/* empty or 1 or yes or true */
 			opts->numtail = 0;	/* negated option */
 			break;
+		case Opt_rodir:
+			opts->rodir = 1;
+			break;
 
 		/* obsolete mount options */
 		case Opt_obsolate:
-- 
cgit v1.2.2


From fa93ca18a8b0da4e26bd9491ad144cd14d22f8ec Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Fix _fat_bmap() race

fat_get_cluster() assumes the requested blocknr isn't truncated during
read. _fat_bmap() doesn't follow this rule.

This protects it by ->i_mutex.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0da04e6d1e34..be88208b83a6 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -199,7 +199,14 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
-	return generic_block_bmap(mapping, block, fat_get_block);
+	sector_t blocknr;
+
+	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
+	mutex_lock(&mapping->host->i_mutex);
+	blocknr = generic_block_bmap(mapping, block, fat_get_block);
+	mutex_unlock(&mapping->host->i_mutex);
+
+	return blocknr;
 }
 
 static const struct address_space_operations fat_aops = {
-- 
cgit v1.2.2


From 0e75f5da06c05425f4b375eb981c4489fb2d9787 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Add printf attribute to fat_fs_panic()

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e9dce5d8e7a7..a69f7f9757c0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -308,7 +308,8 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
-- 
cgit v1.2.2


From 2bdf67eb1631f30e2f3f5d49e4007c76e88877a8 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: mmu_private race fix

mmu_private is 64bits value, hence it's not atomic to update.

So, the access rule for mmu_private is we must hold ->i_mutex.  But,
fat_get_block() path doesn't follow the rule on non-allocation path.

This fixes by using i_size instead if non-allocation path.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c | 23 ++++++++++++++++++-----
 fs/fat/dir.c   |  2 +-
 fs/fat/fat.h   |  6 ++++--
 fs/fat/inode.c |  4 ++--
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 589edde9053c..b42602298087 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -293,10 +293,12 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 }
 
 int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-	     unsigned long *mapped_blocks)
+	     unsigned long *mapped_blocks, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
 	sector_t last_block;
 	int cluster, offset;
 
@@ -309,10 +311,21 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
 		}
 		return 0;
 	}
-	last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
-		>> sb->s_blocksize_bits;
-	if (sector >= last_block)
-		return 0;
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (!create)
+			return 0;
+
+		/*
+		 * ->mmu_private can access on only allocation path.
+		 * (caller must hold ->i_mutex)
+		 */
+		last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+			>> blocksize_bits;
+		if (sector >= last_block)
+			return 0;
+	}
 
 	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
 	offset  = sector & (sbi->sec_per_clus - 1);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 140fc39e2307..2ecaa17acdb5 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -77,7 +77,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a69f7f9757c0..4efc5038ed29 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -91,7 +91,9 @@ struct msdos_inode_info {
 	/* for avoiding the race between fat_free() and fat_get_cluster() */
 	unsigned int cache_valid_id;
 
-	loff_t mmu_private;
+	/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
+	loff_t mmu_private;	/* physically allocated size */
+
 	int i_start;		/* first cluster or 0 */
 	int i_logstart;		/* logical first cluster */
 	int i_attrs;		/* unused attribute bits */
@@ -222,7 +224,7 @@ extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
 extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks);
+		    unsigned long *mapped_blocks, int create);
 
 /* fat/dir.c */
 extern const struct file_operations fat_dir_operations;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index be88208b83a6..9e37ad93c730 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -64,7 +64,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	sector_t phys;
 	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 	if (phys) {
@@ -94,7 +94,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	*max_blocks = min(mapped_blocks, *max_blocks);
 	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 
-- 
cgit v1.2.2


From 9ca59f4c3d28df14a1545a1e2832f34a0a50e3ed Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: ->i_pos race fix

i_pos is 64bits value, hence it's not atomic to update.

Important place is fat_write_inode() only, other places without lock
are just for printk().

This adds lock for "BITS_PER_LONG == 32" kernel.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9e37ad93c730..bdd8fb7be2ca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -542,6 +542,20 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+				    struct inode *inode)
+{
+	loff_t i_pos;
+#if BITS_PER_LONG == 32
+	spin_lock(&sbi->inode_hash_lock);
+#endif
+	i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+	spin_unlock(&sbi->inode_hash_lock);
+#endif
+	return i_pos;
+}
+
 static int fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
@@ -551,9 +565,12 @@ static int fat_write_inode(struct inode *inode, int wait)
 	loff_t i_pos;
 	int err;
 
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
 retry:
-	i_pos = MSDOS_I(inode)->i_pos;
-	if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
+	i_pos = fat_i_pos_read(sbi, inode);
+	if (!i_pos)
 		return 0;
 
 	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
-- 
cgit v1.2.2


From c3302931db090d87e9015c3a7ce5c97a7dd90f78 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:58 -0800
Subject: fat: i_blocks warning fix

blkcnt_t type depends on CONFIG_LSF. Use unsigned long long always for
printk().  But lazy to type it, so add "llu" and use it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c    | 2 +-
 fs/fat/fat.h    | 3 +++
 fs/fat/fatent.c | 5 ++---
 fs/fat/misc.c   | 5 +++--
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 2ecaa17acdb5..67e058357098 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -86,7 +86,7 @@ next:
 	*bh = sb_bread(sb, phys);
 	if (*bh == NULL) {
 		printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
-		       (unsigned long long)phys);
+		       (llu)phys);
 		/* skip this block */
 		*pos = (iblock + 1) << sb->s_blocksize_bits;
 		goto next;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4efc5038ed29..ea440d65819c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,4 +323,7 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 int fat_cache_init(void);
 void fat_cache_destroy(void);
 
+/* helper for printk */
+typedef unsigned long long	llu;
+
 #endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 13513992da3c..da6eea47872f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -93,8 +93,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
 	brelse(bhs[0]);
 err:
-	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-	       (unsigned long long)blocknr);
+	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
 	return -EIO;
 }
 
@@ -107,7 +106,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
 		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-		       (unsigned long long)blocknr);
+		       (llu)blocknr);
 		return -EIO;
 	}
 	fatent->nr_bhs = 1;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a191e79e66a9..ac39ebcc1496 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -124,8 +124,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 			mark_inode_dirty(inode);
 	}
 	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-		fat_fs_panic(sb, "clusters badly computed (%d != %lu)",
-			new_fclus, inode->i_blocks >> (sbi->cluster_bits - 9));
+		fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+			     new_fclus,
+			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
 		fat_cache_inval_inode(inode);
 	}
 	inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
-- 
cgit v1.2.2


From ed9b3e3379731e9f9d2f73f3d7fd9e7d2ce3df4a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 7 Nov 2008 09:06:45 -0500
Subject: ext4: Mark the buffer_heads as dirty and uptodate after prepare_write

We need to make sure we mark the buffer_heads as dirty and uptodate
so that block_write_full_page write them correctly.

This fixes mmap corruptions that can occur in low memory situations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a130b56f1cf..be21a5ae33cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
+		/* now mark the buffer_heads as dirty and uptodate */
+		block_commit_write(page, 0, PAGE_CACHE_SIZE);
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-- 
cgit v1.2.2


From 23712a9c28b9f80a8cf70c8490358d5f562d2465 Mon Sep 17 00:00:00 2001
From: Frederic Bohe <frederic.bohe@bull.net>
Date: Fri, 7 Nov 2008 09:21:01 -0500
Subject: ext4: add checksum calculation when clearing UNINIT flag in
 ext4_new_inode

When initializing an uninitialized block group in ext4_new_inode(),
its block group checksum must be re-calculated.  This fixes a race
when several threads try to allocate a new inode in an UNINIT'd group.

There is some question whether we need to be initializing the block
bitmap in ext4_new_inode() at all, but for now, if we are going to
init the block group, let's eliminate the race.

Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fe34d74cfb19..2a117e286e54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -718,6 +718,8 @@ got:
 			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
 			gdp->bg_free_blocks_count = cpu_to_le16(free);
+			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+								gdp);
 		}
 		spin_unlock(sb_bgl_lock(sbi, group));
 
-- 
cgit v1.2.2


From b726e923ea4d216027e466aa602d914e4b4a63af Mon Sep 17 00:00:00 2001
From: Doug Nazar <nazard@dragoninc.ca>
Date: Wed, 5 Nov 2008 06:16:28 -0500
Subject: Fix nfsd truncation of readdir results

Commit 8d7c4203 "nfsd: fix failure to set eof in readdir in some
situations" introduced a bug: on a directory in an exported ext3
filesystem with dir_index unset, a READDIR will only return about 250
entries, even if the directory was larger.

Bisected it back to this commit; reverting it fixes the problem.

It turns out that in this case ext3 reads a block at a time, then
returns from readdir, which means we can end up with buf.full==0 but
with more entries in the directory still to be read.  Before 8d7c4203
(but after c002a6c797 "Optimise NFS readdir hack slightly"), this would
cause us to return the READDIR result immediately, but with the eof bit
unset.  That could cause a performance regression (because the client
would need more roundtrips to the server to read the whole directory),
but no loss in correctness, since the cleared eof bit caused the client
to send another readdir.  After 8d7c4203, the setting of the eof bit
made this a correctness problem.

So, move nfserr_eof into the loop and remove the buf.full check so that
we loop until buf.used==0.  The following seems to do the right thing
and reduces the network traffic since we don't return a READDIR result
until the buffer is full.

Tested on an empty directory & large directory; eof is properly sent and
there are no more short buffers.

Signed-off-by: Doug Nazar <nazard@dragoninc.ca>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 848a03e83a42..4433c8f00163 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		return -ENOMEM;
 
 	offset = *offsetp;
-	cdp->err = nfserr_eof; /* will be cleared on successful read */
 
 	while (1) {
 		unsigned int reclen;
 
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
 		buf.used = 0;
 		buf.full = 0;
 
@@ -1912,9 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
-		cdp->err = nfserr_eof;
-		if (!buf.full)
-			break;
 	}
 
  done:
-- 
cgit v1.2.2


From 9ccbece546cf836f67f6d9bb4bf2f70f7476cb2c Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 30 Oct 2008 16:53:25 +1100
Subject: [XFS] Fix use-after-free with log and quotas

Destroying the quota stuff on unmount can access the log - ie
XFS_QM_DONE() ends up in xfs_dqunlock() which calls
xfs_trans_unlocked_item() and then xfs_log_move_tail(). By this time the
log has already been destroyed. Just move the cleanup of the quota code
earlier in xfs_unmountfs() before the call to xfs_log_unmount(). Moving
XFS_QM_DONE() up near XFS_QM_DQPURGEALL() seems like a good spot.

SGI-PV: 987086

SGI-Modid: xfs-linux-melb:xfs-kern:32148a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Peter Leckie <pleckie@sgi.com>
---
 fs/xfs/xfs_mount.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a4503f5e9497..15f5dd22fbb2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1245,6 +1245,9 @@ xfs_unmountfs(
 
 	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 
+	if (mp->m_quotainfo)
+		XFS_QM_DONE(mp);
+
 	/*
 	 * Flush out the log synchronously so that we know for sure
 	 * that nothing is pinned.  This is important because bflush()
@@ -1297,8 +1300,6 @@ xfs_unmountfs(
 	xfs_errortag_clearall(mp, 0);
 #endif
 	xfs_free_perag(mp);
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
 }
 
 STATIC void
-- 
cgit v1.2.2


From 2cf7f0da3ae225848a2ee10d4e216448a770fd00 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 30 Oct 2008 16:59:06 +1100
Subject: [XFS] Wait for all I/O on truncate to zero file size

It's possible to have outstanding xfs_ioend_t's queued when the file size
is zero. This can happen in the direct I/O path when a direct I/O write
fails due to ENOSPC. In this case the xfs_ioend_t will still be queued (ie
xfs_end_io_direct() does not know that the I/O failed so can't force the
xfs_ioend_t to be flushed synchronously).

When we truncate a file on unlink we don't know to wait for these
xfs_ioend_ts and we can have a use-after-free situation if the inode is
reclaimed before the xfs_ioend_t is finally processed.

As was suggested by Dave Chinner lets wait for all I/Os to complete when
truncating the file size to zero.

SGI-PV: 981668

SGI-Modid: xfs-linux-melb:xfs-kern:32216a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dbd9cef852ec..a391b955df01 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1414,7 +1414,7 @@ xfs_itruncate_start(
 	mp = ip->i_mount;
 
 	/* wait for the completion of any pending DIOs */
-	if (new_size < ip->i_size)
+	if (new_size == 0 || new_size < ip->i_size)
 		vn_iowait(ip);
 
 	/*
-- 
cgit v1.2.2


From 6f9f51adb6ac0a49fce49e01c47dcfc2810c6e9d Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:38:12 +1100
Subject: [XFS] Account for allocated blocks when expanding directories

When we create a directory, we reserve a number of blocks for the maximum
possible expansion of of the directory due to various btree splits,
freespace allocation, etc. Unfortunately, each allocation is not reflected
in the total number of blocks still available to the transaction, so the
maximal reservation is used over and over again.

This leads to problems where an allocation group has only enough blocks
for *some* of the allocations required for the directory modification.
After the first N allocations, the remaining blocks in the allocation
group drops below the total reservation, and subsequent allocations fail
because the allocator will not allow the allocation to proceed if the AG
does not have the enough blocks available for the entire allocation total.

This results in an ENOSPC occurring after an allocation has already
occurred. This results in aborting the directory operation (leaving the
directory in an inconsistent state) and cancelling a dirty transaction,
which results in a filesystem shutdown.

Avoid the problem by reflecting the number of blocks allocated in any
directory expansion in the total number of blocks available to the
modification in progress. This prevents a directory modification from
being aborted part way through with an ENOSPC.

SGI-PV: 988144

SGI-Modid: xfs-linux-melb:xfs-kern:32340a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_da_btree.c | 5 +++++
 fs/xfs/xfs_dir2.c     | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9e561a9cefca..a11a8390bf6c 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	int nmap, error, w, count, c, got, i, mapi;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
+	xfs_drfsbno_t	nblks;
 
 	dp = args->dp;
 	mp = dp->i_mount;
 	w = args->whichfork;
 	tp = args->trans;
+	nblks = dp->i_d.di_nblocks;
+
 	/*
 	 * For new directories adjust the file offset and block count.
 	 */
@@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	}
 	if (mapp != &map)
 		kmem_free(mapp);
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*new_blkno = (xfs_dablk_t)bno;
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 80e0dc51361c..1afb12278b8d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -525,11 +525,13 @@ xfs_dir2_grow_inode(
 	xfs_mount_t	*mp;
 	int		nmap;		/* number of bmap entries */
 	xfs_trans_t	*tp;
+	xfs_drfsbno_t	nblks;
 
 	xfs_dir2_trace_args_s("grow_inode", args, space);
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
+	nblks = dp->i_d.di_nblocks;
 	/*
 	 * Set lowest possible block in the space requested.
 	 */
@@ -622,7 +624,11 @@ xfs_dir2_grow_inode(
 	 */
 	if (mapp != &map)
 		kmem_free(mapp);
+
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+
 	/*
 	 * Update file's size if this is the data space and it grew.
 	 */
-- 
cgit v1.2.2


From 8f330f5149ef41ff943b04d914406cc417f62784 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 10 Nov 2008 16:50:24 +1100
Subject: [XFS] handle memory allocation failures during log initialisation

When there is no memory left in the system, xfs_buf_get_noaddr()
can fail. If this happens at mount time during xlog_alloc_log()
we fail to catch the error and oops.

Catch the error from xfs_buf_get_noaddr(), and allow other memory
allocations to fail and catch those errors too. Report the error
to the console and fail the mount with ENOMEM.

Tested by manually injecting errors into xfs_buf_get_noaddr() and
xlog_alloc_log().

Version 2:
o remove unnecessary casts of the returned pointer from kmem_zalloc()

SGI-PV: 987246

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0b02c6443551..3608a0f0a5f6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -563,6 +563,11 @@ xfs_log_mount(
 	}
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+	if (!mp->m_log) {
+		cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
+		error = ENOMEM;
+		goto out;
+	}
 
 	/*
 	 * Initialize the AIL now we have a log.
@@ -601,6 +606,7 @@ xfs_log_mount(
 	return 0;
 error:
 	xfs_log_unmount_dealloc(mp);
+out:
 	return error;
 }	/* xfs_log_mount */
 
@@ -1217,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	int			i;
 	int			iclogsize;
 
-	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+	if (!log)
+		return NULL;
 
 	log->l_mp	   = mp;
 	log->l_targ	   = log_target;
@@ -1249,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xlog_get_iclog_buffer_size(mp, log);
 
 	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+	if (!bp)
+		goto out_free_log;
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
 	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
@@ -1275,13 +1285,17 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
-		*iclogp = (xlog_in_core_t *)
-			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
+		if (!*iclogp)
+			goto out_free_iclog;
+
 		iclog = *iclogp;
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
 		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		if (!bp)
+			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
 			ASSERT(0);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
@@ -1323,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
 	return log;
+
+out_free_iclog:
+	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+		prev_iclog = iclog->ic_next;
+		if (iclog->ic_bp) {
+			sv_destroy(&iclog->ic_force_wait);
+			sv_destroy(&iclog->ic_write_wait);
+			xfs_buf_free(iclog->ic_bp);
+			xlog_trace_iclog_dealloc(iclog);
+		}
+		kmem_free(iclog);
+	}
+	spinlock_destroy(&log->l_icloglock);
+	spinlock_destroy(&log->l_grant_lock);
+	xlog_trace_loggrant_dealloc(log);
+	xfs_buf_free(log->l_xbuf);
+out_free_log:
+	kmem_free(log);
+	return NULL;
 }	/* xlog_alloc_log */
 
 
-- 
cgit v1.2.2


From 220ca310a53200b4bfbc7c4c6e365eea284ec44f Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:40:09 +1100
Subject: [XFS] XFS: Check for valid transaction headers in recovery

When we are about to add a new item to a transaction in recovery, we need
to check that it is valid first. Currently we just assert that header
magic number matches, but in production systems that is not present and we
add a corrupted transaction to the list to be processed. This results in a
kernel oops later when processing the corrupted transaction.

Instead, if we detect a corrupted transaction, abort recovery and leave
the user to clean up the mess that has occurred.

SGI-PV: 988145

SGI-Modid: xfs-linux-melb:xfs-kern:32356a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82d46ce69d5f..70e3ba32e6be 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1419,7 +1419,13 @@ xlog_recover_add_to_trans(
 		return 0;
 	item = trans->r_itemq;
 	if (item == NULL) {
-		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+		/* we need to catch log corruptions here */
+		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+			xlog_warn("XFS: xlog_recover_add_to_trans: "
+				  "bad header magic number");
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
 		if (len == sizeof(xfs_trans_header_t))
 			xlog_recover_add_item(&trans->r_itemq);
 		memcpy(&trans->r_theader, dp, len); /* d, s, l */
-- 
cgit v1.2.2


From c3cb6827353102fee62f3b9401a03ee29b297e5b Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:33:03 +0800
Subject: ocfs2: fix license in xattr

This patch fixes the license in xattr.c and xattr.h.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 13 ++++---------
 fs/ocfs2/xattr.h | 12 ++----------
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 802c41492214..2f8952e4e4c1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3,25 +3,20 @@
  *
  * xattr.c
  *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * CREDITS:
- * Lots of code in this file is taken from ext3.
+ * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/capability.h>
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index c25c7c62a059..e4e45c81a261 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -3,24 +3,16 @@
  *
  * xattr.h
  *
- * Function prototypes
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_XATTR_H
-- 
cgit v1.2.2


From 0030e001505d2d1503c083c917a747c033eaf8cd Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:33:33 +0800
Subject: ocfs2: fix function declaration and definition in xattr

Because we merged the xattr sources into one file, some functions
no longer belong in the header file.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 28 +++++++++++++++++++++++-----
 fs/ocfs2/xattr.h | 26 ++++----------------------
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2f8952e4e4c1..420d8e30b184 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -132,6 +132,24 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
 
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -832,11 +850,11 @@ cleanup:
  * Copy an extended attribute into the buffer provided.
  * Buffer is NULL to compute the size of buffer required.
  */
-int ocfs2_xattr_get(struct inode *inode,
-		    int name_index,
-		    const char *name,
-		    void *buffer,
-		    size_t buffer_size)
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
 {
 	int ret;
 	struct ocfs2_dinode *di = NULL;
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e4e45c81a261..1d8314c7656d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,29 +32,11 @@ enum ocfs2_xattr_type {
 
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
-
-extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
-			   size_t, int);
-extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
-static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
-{
-	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
-}
-
-static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
-{
-	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
-}
-
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-	u16 len = sb->s_blocksize -
-		 offsetof(struct ocfs2_xattr_header, xh_entries);
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+		    size_t, int);
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 
-	return len / sizeof(struct ocfs2_xattr_entry);
-}
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.2


From ceb1eba3dc2ad94b25764785ff7d2082c6094115 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:34:13 +0800
Subject: ocfs2: remove duplicate definition in xattr

Include/linux/xattr.h already has the definition about xattr prefix,
so remove the duplicate definitions in xattr.c.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 420d8e30b184..a9da45bbb9ed 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4740,14 +4740,11 @@ out:
 /*
  * 'trusted' attributes support
  */
-
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
 				       size_t list_size, const char *name,
 				       size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (list && total_len <= list_size) {
@@ -4784,18 +4781,14 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.set	= ocfs2_xattr_trusted_set,
 };
 
-
 /*
  * 'user' attributes support
  */
-
-#define XATTR_USER_PREFIX "user."
-
 static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
 				    size_t list_size, const char *name,
 				    size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-- 
cgit v1.2.2


From c988fd045f1195e62c0970384903ab9da26a9359 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:34:44 +0800
Subject: ocfs2: add handler_map array bounds checking

Make the handler_map array as large as the possible value range to avoid
a fencepost error.

[ Utilize alternate method -- Joel ]

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a9da45bbb9ed..e19980a71a3c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -78,7 +78,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
 };
-- 
cgit v1.2.2


From f6087fb799e097e7c9d912daa75701de9d62dc53 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:20:43 -0700
Subject: ocfs2: Check xattr block signatures properly.

The xattr.c code is currently memcmp()ing naking buffer pointers.
Create the OCFS2_IS_VALID_XATTR_BLOCK() macro to match its peers and use
that.

In addition, failed signature checks were returning -EFAULT, which is
completely wrong.  Return -EIO.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h |  3 +++
 fs/ocfs2/xattr.c | 38 ++++++++++++++++----------------------
 2 files changed, 19 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a21a465490c4..fef7ece32376 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -473,6 +473,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 		(____gd)->bg_signature);				\
 } while (0)
 
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
+	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e19980a71a3c..151ba6257fbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -555,14 +555,12 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
-		goto cleanup;
-	}
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
+	}
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
@@ -779,15 +777,14 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
 		goto cleanup;
 	}
 
 	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
@@ -1527,10 +1524,9 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
 		goto out;
 	}
 
@@ -1540,7 +1536,6 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
 	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1784,15 +1779,14 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-			ret = -EFAULT;
-			goto cleanup;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
 	}
 
 	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
-- 
cgit v1.2.2


From b37c4d84e9d16fd5b6f31197f02ea0a112fc9e99 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:24:03 -0700
Subject: ocfs2: Don't return -EFAULT from a corrupt xattr entry.

If the xattr disk structures are corrupt, return -EIO, not -EFAULT.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 151ba6257fbb..41a6ca004ae3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1239,7 +1239,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
 	if (free < 0)
-		return -EFAULT;
+		return -EIO;
 
 	if (!xs->not_found) {
 		size_t size = 0;
-- 
cgit v1.2.2


From bd60bd37ade4321ecce4ed4442f68c88febd76d5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:25:56 -0700
Subject: ocfs2: Check errors from ocfs2_xattr_update_xattr_search()

The ocfs2_xattr_update_xattr_search() function can return an error when
trying to read blocks off of disk.  The caller needs to check this error
before using those (possibly invalid) blocks.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 41a6ca004ae3..92df88a41e5d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2825,7 +2825,11 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	if (data_bh)
 		ocfs2_journal_dirty(handle, data_bh);
 
-	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
 
 	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
 	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-- 
cgit v1.2.2


From eb6ff2397d1fdfc6a7629c99896338e5b5c508e5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:32:48 -0700
Subject: ocfs2: Specify appropriate journal access for new xattr buckets.

There are a couple places that get an xattr bucket that may be reading
an existing one or may be allocating a new one.  They should specify the
correct journal access mode depending.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 92df88a41e5d..fb450200bc88 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3231,7 +3231,9 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+					   new_bucket_head ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3393,6 +3395,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   t_is_new ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret)
 			goto out;
-- 
cgit v1.2.2


From 54f443f4e7265a1333886dbace31cb6eb1991c72 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:43:07 -0700
Subject: ocfs2: Don't repeat ocfs2_xattr_block_find()

ocfs2_xattr_block_get() looks up the xattr in a startlingly familiar
way; it's identical to the function ocfs2_xattr_block_find().  Let's just
use the later in the former.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 39 +++++++++------------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fb450200bc88..74d1faba23bb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -111,6 +111,10 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
 					     int *block_off,
 					     int *new_offset);
 
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs);
 static int ocfs2_xattr_index_block_find(struct inode *inode,
 					struct buffer_head *root_bh,
 					int name_index,
@@ -760,46 +764,20 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 				 size_t buffer_size,
 				 struct ocfs2_xattr_search *xs)
 {
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	struct buffer_head *blk_bh = NULL;
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
-	if (!di->i_xattr_loc)
-		return ret;
-
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
-	if (ret < 0) {
+	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
+	if (ret) {
 		mlog_errno(ret);
-		return ret;
-	}
-
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
 		goto cleanup;
 	}
 
-	xs->xattr_bh = blk_bh;
-
-	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
-		xs->header = &xb->xb_attrs.xb_header;
-		xs->base = (void *)xs->header;
-		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-		xs->here = xs->header->xh_entries;
-
-		ret = ocfs2_xattr_find_entry(name_index, name, xs);
-	} else
-		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
-						   name_index,
-						   name, xs);
-
-	if (ret)
-		goto cleanup;
+	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 	size = le64_to_cpu(xs->here->xe_value_size);
 	if (buffer) {
 		ret = -ERANGE;
@@ -838,7 +816,8 @@ cleanup:
 		brelse(xs->bucket.bhs[i]);
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	brelse(blk_bh);
+	brelse(xs->xattr_bh);
+	xs->xattr_bh = NULL;
 	return ret;
 }
 
-- 
cgit v1.2.2


From 63fd77573723841d5d44a79471258f1b261f4482 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 17 Oct 2008 12:44:36 +0800
Subject: ocfs2: Remove unused ocfs2_restore_xattr_block().

Since now ocfs2 supports empty xattr buckets, we will never remove
the xattr index tree even if all the xattrs are removed, so this
function will never be called. So remove it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 48 ------------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d1faba23bb..789fb70462c9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1791,52 +1791,6 @@ cleanup:
 	return ret;
 }
 
-/*
- * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
- * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
- * re-initialized.
- */
-static int ocfs2_restore_xattr_block(struct inode *inode,
-				     struct ocfs2_xattr_search *xs)
-{
-	int ret;
-	handle_t *handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
-	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-	u16 xb_flags = le16_to_cpu(xb->xb_flags);
-
-	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
-		le16_to_cpu(el->l_next_free_rec) != 0);
-
-	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-	       offsetof(struct ocfs2_xattr_block, xb_attrs));
-
-	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
-
-	ocfs2_journal_dirty(handle, xs->xattr_bh);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	return ret;
-}
-
 /*
  * ocfs2_xattr_block_set()
  *
@@ -1947,8 +1901,6 @@ out:
 	}
 
 	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
-	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
-		ret = ocfs2_restore_xattr_block(inode, xs);
 
 end:
 
-- 
cgit v1.2.2


From 8573f79d30077875e2b6e83849b5245bfbb08685 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 24 Oct 2008 22:24:17 +0800
Subject: ocfs2: Fix some typos in xattr annotations.

Fix some typos in the xattr annotations.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Reported-by: Coly Li <coyli@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f24ce3d3f956..5f180cf7abbd 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -742,12 +742,12 @@ struct ocfs2_group_desc
  */
 struct ocfs2_xattr_entry {
 	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
-	__le16	xe_name_offset;  /* byte offset from the 1st etnry in the local
+	__le16	xe_name_offset;  /* byte offset from the 1st entry in the
 				    local xattr storage(inode, xattr block or
 				    xattr bucket). */
 	__u8	xe_name_len;	 /* xattr name len, does't include prefix. */
-	__u8	xe_type;         /* the low 7 bits indicates the name prefix's
-				  * type and the highest 1 bits indicate whether
+	__u8	xe_type;         /* the low 7 bits indicate the name prefix
+				  * type and the highest bit indicates whether
 				  * the EA is stored in the local storage. */
 	__le64	xe_value_size;	 /* real xattr value length. */
 };
@@ -766,9 +766,10 @@ struct ocfs2_xattr_header {
 						   xattr. */
 	__le16	xh_name_value_len;              /* total length of name/value
 						   length in this bucket. */
-	__le16	xh_num_buckets;                 /* bucket nums in one extent
-						   record, only valid in the
-						   first bucket. */
+	__le16	xh_num_buckets;                 /* Number of xattr buckets
+						   in this extent record,
+						   only valid in the first
+						   bucket. */
 	__le64  xh_csum;
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -776,8 +777,8 @@ struct ocfs2_xattr_header {
 /*
  * On disk structure for xattr value root.
  *
- * It is used when one extended attribute's size is larger, and we will save it
- * in an outside cluster. It will stored in a b-tree like file content.
+ * When an xattr's value is large enough, it is stored in an external
+ * b-tree like file data.  The xattr value root points to this structure.
  */
 struct ocfs2_xattr_value_root {
 /*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
-- 
cgit v1.2.2


From fa38e92cb34e27e60d0faf1035934eb9b44aa1d4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:51 +0200
Subject: ocfs2: Fix check of return value of ocfs2_start_trans()

On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM).
Thus checks for !handle are wrong. Fix them to use IS_ERR().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7efe937a415f..3138a385fdbb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -247,8 +247,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	mlog_entry_void();
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -312,8 +312,8 @@ static int ocfs2_simple_size_update(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1055,8 +1055,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1259,8 +1259,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1352,8 +1352,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		goto out;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
-- 
cgit v1.2.2


From 87cfa004321c62aec681713ea48e0b846336d9f4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:53 +0200
Subject: ocfs2: Fix checking of return value of new_inode()

new_inode() does not return ERR_PTR() but NULL in case of failure. Correct
checking of the return value.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 485a6aa0ad39..f594f300d4cd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -378,8 +378,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 
 	inode = new_inode(dir->i_sb);
-	if (IS_ERR(inode)) {
-		status = PTR_ERR(inode);
+	if (!inode) {
+		status = -ENOMEM;
 		mlog(ML_ERROR, "new_inode failed!\n");
 		goto leave;
 	}
-- 
cgit v1.2.2


From b99835c1684918b9975851d71455c5c007d1715b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:54 +0200
Subject: ocfs2: Let inode be really deleted when ocfs2_mknod_locked() fails

We forgot to set i_nlink to 0 when returning due to error from ocfs2_mknod_locked()
and thus inode was not properly released via ocfs2_delete_inode() (e.g. claimed
space was not released). Fix it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f594f300d4cd..f4967e634ffd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -491,8 +491,10 @@ leave:
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
-		if (inode)
+		if (inode) {
+			clear_nlink(inode);
 			iput(inode);
+		}
 	}
 
 	mlog_exit(status);
-- 
cgit v1.2.2


From d32647993c211901fc4819ef3327f62d1859241b Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 24 Oct 2008 07:57:28 +0800
Subject: ocfs2: Fix check of return value of ocfs2_start_trans() in xattr.c.

On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM),
so we should check whether handle is NULL. Fix them to use IS_ERR().
Jan has made the patch for other part in ocfs2(thank Jan for it), so
this is just the fix for fs/ocfs2/xattr.c.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 789fb70462c9..a371c01942b1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4092,7 +4092,7 @@ static int ocfs2_xattr_value_update_size(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, 1);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -4259,7 +4259,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.2


From ae0dff683076b2798763288c7ac2f09a18c4a998 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 22 Oct 2008 13:24:29 -0700
Subject: ocfs2: Set journal descriptor to NULL after journal shutdown

Patch sets journal descriptor to NULL after the journal is shutdown.
This ensures that jbd2_journal_release_jbd_inode(), which removes the
jbd2 inode from txn lists, can be called safely from ocfs2_clear_inode()
even after the journal has been shutdown.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/inode.c   | 6 ++++++
 fs/ocfs2/journal.c | 1 +
 2 files changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4903688f72a9..7aa00d511874 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1106,6 +1106,12 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+
+	/*
+	 * ip_jinode is used to track txns against this inode. We ensure that
+	 * the journal is flushed before journal shutdown. Thus it is safe to
+	 * have inodes get cleaned up after journal shutdown.
+	 */
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 81e40677eecb..99fe9d584f3c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -690,6 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	/* Shutdown the kernel journal system */
 	jbd2_journal_destroy(journal->j_journal);
+	journal->j_journal = NULL;
 
 	OCFS2_I(inode)->ip_open_count--;
 
-- 
cgit v1.2.2


From 4c1bbf1ba631d7db61ce3462349a3f5d14ae3009 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 6 Oct 2008 16:59:55 +0800
Subject: ocfs2: return 0 in page_mkwrite to let VFS retry.

In ocfs2_page_mkwrite, we return -EINVAL when we found the page mapping
isn't updated, and it will cause the user space program get SIGBUS and
exit. The reason is that during race writeable mmap, we will do
unmap_mapping_range in ocfs2_data_downconvert_worker. The good thing is
that if we reuturn 0 in page_mkwrite, VFS will retry fault and then
call page_mkwrite again, so it is safe to return 0 here.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3dc18d67557c..eea1d24713ea 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -113,7 +113,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	 * ocfs2_write_begin_nolock().
 	 */
 	if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
-		ret = -EINVAL;
+		/*
+		 * the page has been umapped in ocfs2_data_downconvert_worker.
+		 * So return 0 here and let VFS retry.
+		 */
+		ret = 0;
 		goto out;
 	}
 
-- 
cgit v1.2.2


From 80bcaf3469b8aefd316d4ceb27d9af7cfbb0b913 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 27 Oct 2008 06:06:24 +0800
Subject: ocfs2/xattr: Proper hash collision handle in bucket division

In ocfs2/xattr, we must make sure the xattrs which have the same hash value
exist in the same bucket so that the search schema can work. But in the old
implementation, when we want to extend a bucket, we just move half number of
xattrs to the new bucket. This works in most cases, but if we are lucky
enough we will move 2 xattrs into 2 different buckets. This means that an
xattr from the previous bucket cannot be found anymore. This patch fix this
problem by finding the right position during extending the bucket and extend
an empty bucket if needed.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Cc: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 144 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a371c01942b1..f3ea7efb48c6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3110,25 +3110,73 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 }
 
 /*
- * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * Find the suitable pos when we divide a bucket into 2.
+ * We have to make sure the xattrs with the same hash value exist
+ * in the same bucket.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes.  Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+	struct ocfs2_xattr_entry *entries = xh->xh_entries;
+	int count = le16_to_cpu(xh->xh_count);
+	int delta, middle = count / 2;
+
+	/*
+	 * We start at the middle.  Each step gets farther away in both
+	 * directions.  We therefore hit the change in hash value
+	 * nearest to the middle.  Note that this loop does not execute for
+	 * count < 2.
+	 */
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (cmp_xe(&entries[middle - delta - 1],
+			   &entries[middle - delta]))
+			return middle - delta;
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == count)
+			continue;
+
+		/* Now try delta past middle */
+		if (cmp_xe(&entries[middle + delta],
+			   &entries[middle + delta + 1]))
+			return middle + delta + 1;
+	}
+
+	/* Every entry had the same hash */
+	return count;
+}
+
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
  * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved.  But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
  */
-static int ocfs2_half_xattr_bucket(struct inode *inode,
-				   handle_t *handle,
-				   u64 blk,
-				   u64 new_blk,
-				   u32 *first_hash,
-				   int new_bucket_head)
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
+				    handle_t *handle,
+				    u64 blk,
+				    u64 new_blk,
+				    u32 *first_hash,
+				    int new_bucket_head)
 {
 	int ret, i;
-	u16 count, start, len, name_value_len, xe_len, name_offset;
+	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct buffer_head **s_bhs, **t_bhs = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
 
-	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
 	     blk, new_blk);
 
 	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
@@ -3171,14 +3219,35 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 		}
 	}
 
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = ocfs2_xattr_find_divide_pos(xh);
+
+	if (start == count) {
+		xe = &xh->xh_entries[start-1];
+
+		/*
+		 * initialized a new empty bucket here.
+		 * The hash value is set as one larger than
+		 * that of the last entry in the previous bucket.
+		 */
+		for (i = 0; i < blk_per_bucket; i++)
+			memset(t_bhs[i]->b_data, 0, blocksize);
+
+		xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+		xh->xh_free_start = cpu_to_le16(blocksize);
+		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+
+		goto set_num_buckets;
+	}
+
 	/* copy the whole bucket to the new first. */
 	for (i = 0; i < blk_per_bucket; i++)
 		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
 
 	/* update the new bucket. */
 	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
-	count = le16_to_cpu(xh->xh_count);
-	start = count / 2;
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3235,6 +3304,7 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 			xh->xh_free_start = xe->xe_name_offset;
 	}
 
+set_num_buckets:
 	/* set xh->xh_num_buckets for the new xh. */
 	if (new_bucket_head)
 		xh->xh_num_buckets = cpu_to_le16(1);
@@ -3252,9 +3322,13 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
 
 	/*
-	 * Now only update the 1st block of the old bucket.
-	 * Please note that the entry has been sorted already above.
+	 * Now only update the 1st block of the old bucket.  If we
+	 * just added a new empty bucket, there is no need to modify
+	 * it.
 	 */
+	if (start == count)
+		goto out;
+
 	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
@@ -3439,15 +3513,15 @@ out:
 }
 
 /*
- * Move half of the xattrs in this cluster to the new cluster.
+ * Move some xattrs in this cluster to the new cluster.
  * This function should only be called when bucket size == cluster size.
  * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
  */
-static int ocfs2_half_xattr_cluster(struct inode *inode,
-				    handle_t *handle,
-				    u64 prev_blk,
-				    u64 new_blk,
-				    u32 *first_hash)
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
+				      handle_t *handle,
+				      u64 prev_blk,
+				      u64 new_blk,
+				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int ret, credits = 2 * blk_per_bucket;
@@ -3461,8 +3535,8 @@ static int ocfs2_half_xattr_cluster(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
-					new_blk, first_hash, 1);
+	return  ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
+					  new_blk, first_hash, 1);
 }
 
 /*
@@ -3524,9 +3598,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 						     last_blk, new_blk,
 						     v_start);
 		else {
-			ret = ocfs2_half_xattr_cluster(inode, handle,
-						       last_blk, new_blk,
-						       v_start);
+			ret = ocfs2_divide_xattr_cluster(inode, handle,
+							 last_blk, new_blk,
+							 v_start);
 
 			if ((*header_bh)->b_blocknr == last_blk && extend)
 				*extend = 0;
@@ -3743,8 +3817,8 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
-				      start_blk + blk_per_bucket, NULL, 0);
+	ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
+					start_blk + blk_per_bucket, NULL, 0);
 
 	le16_add_cpu(&first_xh->xh_num_buckets, 1);
 	ocfs2_journal_dirty(handle, first_bh);
@@ -4435,11 +4509,21 @@ out:
 	return ret;
 }
 
-/* check whether the xattr bucket is filled up with the same hash value. */
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
 static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
-					      struct ocfs2_xattr_bucket *bucket)
+					      struct ocfs2_xattr_bucket *bucket,
+					      const char *name)
 {
 	struct ocfs2_xattr_header *xh = bucket->xh;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+		return 0;
 
 	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
 	    xh->xh_entries[0].xe_name_hash) {
@@ -4562,7 +4646,9 @@ try_again:
 		 * one bucket's worth, so check it here whether we need to
 		 * add a new bucket for the insert.
 		 */
-		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		ret = ocfs2_check_xattr_bucket_collision(inode,
+							 &xs->bucket,
+							 xi->name);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
cgit v1.2.2


From c435400140d24fbcb3da6b1e006be831f9056cb6 Mon Sep 17 00:00:00 2001
From: Dmitri Monakhov <dmonakhov@openvz.org>
Date: Mon, 27 Oct 2008 13:01:49 -0700
Subject: ocfs2: truncate outstanding block after direct io failure

Signed-off-by: Dmitri Monakhov <dmonakhov@openvz.org>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3138a385fdbb..e2570a3bc2b2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1866,6 +1866,13 @@ relock:
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
+			/*
+			 * direct write may have instantiated a few
+			 * blocks outside i_size. Trim these off again.
+			 * Don't need i_size_read because we hold i_mutex.
+			 */
+			if (*ppos + count > inode->i_size)
+				vmtruncate(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
-- 
cgit v1.2.2


From de29c08528bae45e3fa1171d190f1340e37e0f70 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 29 Oct 2008 14:45:30 -0700
Subject: ocfs2: fix printk related build warnings in xattr.c

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f3ea7efb48c6..70baffeb1812 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2336,7 +2336,8 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
 
 	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+	     "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
+	     first_hash);
 
 	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
 				      p_blkno, first_hash, num_clusters, xs);
@@ -2360,7 +2361,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 	memset(&bucket, 0, sizeof(bucket));
 
 	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-	     clusters, blkno);
+	     clusters, (unsigned long long)blkno);
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
 		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
@@ -2378,7 +2379,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
+		     (unsigned long long)blkno,
 		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
@@ -2714,7 +2716,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
 
-	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
+	     (unsigned long long)blkno);
 
 	xh_bh = sb_getblk(inode->i_sb, blkno);
 	if (!xh_bh) {
@@ -2883,8 +2886,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 
 	mlog(0, "adjust xattr bucket in %llu, count = %u, "
 	     "xh_free_start = %u, xh_name_value_len = %u.\n",
-	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
-	     le16_to_cpu(xh->xh_name_value_len));
+	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
+	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
 
 	/*
 	 * sort all the entries by their offset.
@@ -3000,7 +3003,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     prev_blkno, new_blkno);
+	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
 
 	/*
 	 * We need to update the 1st half of the new cluster and
@@ -3177,7 +3180,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	int blocksize = inode->i_sb->s_blocksize;
 
 	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
-	     blk, new_blk);
+	     (unsigned long long)blk, (unsigned long long)new_blk);
 
 	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!s_bhs)
@@ -3376,7 +3379,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	BUG_ON(s_blkno == t_blkno);
 
 	mlog(0, "cp bucket %llu to %llu, target is %d\n",
-	     s_blkno, t_blkno, t_is_new);
+	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
+	     t_is_new);
 
 	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
 			GFP_NOFS);
@@ -3448,7 +3452,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	struct ocfs2_xattr_header *xh;
 	u64 to_blk_start = to_blk;
 
-	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+	mlog(0, "cp xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)src_blk, (unsigned long long)to_blk);
 
 	/*
 	 * We need to update the new cluster and 1 more for the update of
@@ -3579,7 +3584,8 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     prev_blk, prev_clusters, new_blk);
+	     (unsigned long long)prev_blk, prev_clusters,
+	     (unsigned long long)new_blk);
 
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -3649,7 +3655,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, prev_blkno);
+	     prev_cpos, (unsigned long long)prev_blkno);
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
@@ -3736,7 +3742,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-	     num_bits, block, v_start);
+	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
 				  num_bits, 0, meta_ac);
 	if (ret < 0) {
@@ -3781,7 +3787,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
 
 	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", start_blk,
+	     "from %llu, len = %u\n", (unsigned long long)start_blk,
 	     (unsigned long long)first_bh->b_blocknr, num_clusters);
 
 	BUG_ON(bucket >= num_buckets);
-- 
cgit v1.2.2


From 6c1e183e12dbd78a897a859f13220406296fee31 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Sun, 2 Nov 2008 19:04:21 +0800
Subject: ocfs2: Check search result in ocfs2_xattr_block_get()

ocfs2_xattr_block_get() calls ocfs2_xattr_search() to find an external
xattr, but doesn't check the search result that is passed back via struct
ocfs2_xattr_search. Add a check for search result, and pass back -ENODATA if
the xattr search failed. This avoids a later NULL pointer error.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 70baffeb1812..054e2efb0b7e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -777,6 +777,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		goto cleanup;
 	}
 
+	if (xs->not_found) {
+		ret = -ENODATA;
+		goto cleanup;
+	}
+
 	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 	size = le64_to_cpu(xs->here->xe_value_size);
 	if (buffer) {
@@ -860,7 +865,7 @@ static int ocfs2_xattr_get(struct inode *inode,
 	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
-	if (ret == -ENODATA)
+	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
 	up_read(&oi->ip_xattr_sem);
-- 
cgit v1.2.2


From afef80b3d87cae574b8c6b763505f25b74d254ef Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Nov 2008 13:26:54 -0800
Subject: vfs: fix shrink_submounts

In the last refactoring of shrink_submounts a variable was not completely
renamed.  So finish the renaming of mnt to m now.

Without this if you attempt to mount an nfs mount that has both automatic
nfs sub mounts on it, and has normal mounts on it.  The unmount will
succeed when it should not.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index cce46702d33c..65b3dc844c87 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1815,8 +1815,8 @@ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 		while (!list_empty(&graveyard)) {
 			m = list_first_entry(&graveyard, struct vfsmount,
 						mnt_expire);
-			touch_mnt_namespace(mnt->mnt_ns);
-			umount_tree(mnt, 1, umounts);
+			touch_mnt_namespace(m->mnt_ns);
+			umount_tree(m, 1, umounts);
 		}
 	}
 }
-- 
cgit v1.2.2


From 6cdfcc275e40b89fb020da1088ead86a61d33115 Mon Sep 17 00:00:00 2001
From: Theodore Tso <tytso@mit.edu>
Date: Wed, 12 Nov 2008 13:27:01 -0800
Subject: ext3: Clean up outdated and incorrect comment for ext3_write_super()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dec6d1356c4..f6c94f232ec1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2375,12 +2375,9 @@ int ext3_force_commit(struct super_block *sb)
 /*
  * Ext3 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
  */
-
 static void ext3_write_super (struct super_block * sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
-- 
cgit v1.2.2


From 278afcbf4fe964230eba67f8fb8235e8b7e63ffb Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 13 Nov 2008 13:22:34 -0600
Subject: dlm: fix shutdown cleanup

Fixes a regression from commit 0f8e0d9a317406612700426fad3efab0b7bbc467,
"dlm: allow multiple lockspace creates".

An extraneous 'else' slipped into a code fragment being moved from
release_lockspace() to dlm_release_lockspace().  The result of the
unwanted 'else' is that dlm threads and structures are not stopped
and cleaned up when the final dlm lockspace is removed.  Trying to
create a new lockspace again afterward will fail with
"kmem_cache_create: duplicate cache dlm_conn" because the cache
was not previously destroyed.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d910501de6d2..8d86b7960f0d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -812,7 +812,7 @@ int dlm_release_lockspace(void *lockspace, int force)
 	error = release_lockspace(ls, force);
 	if (!error)
 		ls_count--;
-	else if (!ls_count)
+	if (!ls_count)
 		threads_stop();
 	mutex_unlock(&ls_lock);
 
-- 
cgit v1.2.2


From 3b7952109361c684caf0c50474da8662ecc81019 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 13 Nov 2008 19:45:32 +0000
Subject: [CIFS] Fix cifs reconnection flags

In preparation for Jeff's big umount/mount fixes to remove the possibility of
various races in cifs mount and linked list handling of sessions, sockets and
tree connections, this patch cleans up some repetitive code in cifs_mount,
and addresses a problem with ses->status and tcon->tidStatus in which we
were overloading the "need_reconnect" state with other status in that
field.  So the "need_reconnect" flag has been broken out from those
two state fields (need reconnect was not mutually exclusive from some of the
other possible tid and ses states).  In addition, a few exit cases in
cifs_mount were cleaned up, and a problem with a tcon flag (for lease support)
was not being set consistently for the 2nd mount of the same share

CC: Jeff Layton <jlayton@redhat.com>
CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |   2 +-
 fs/cifs/cifsglob.h |   5 +
 fs/cifs/cifssmb.c  |  40 ++++----
 fs/cifs/connect.c  | 262 ++++++++++++++++++++++++++---------------------------
 fs/cifs/file.c     |   2 +-
 5 files changed, 160 insertions(+), 151 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ac5915d61dca..903bbd6449d1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1013,7 +1013,7 @@ static int cifs_oplock_thread(void *dummyarg)
 				not bother sending an oplock release if session
 				to server still is disconnected since oplock
 				already released by the server in that case */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
 				rc = CIFSSMBLock(0, pTcon, netfid,
 						0 /* len */ , 0 /* offset */, 0,
 						0, LOCKING_ANDX_OPLOCK_RELEASE,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1cb1189f24e0..dc0aa140f1bf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -122,6 +122,8 @@ struct cifs_cred {
  */
 
 struct TCP_Server_Info {
+	struct list_head tcp_ses_list;
+	struct list_head smb_ses_list;
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -195,6 +197,7 @@ struct cifsUidInfo {
  */
 struct cifsSesInfo {
 	struct list_head cifsSessionList;
+	struct list_head tcon_list;
 	struct semaphore sesSem;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
@@ -216,6 +219,7 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
@@ -288,6 +292,7 @@ struct cifsTconInfo {
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool need_reconnect:1; /* connection reset, tid now invalid */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d5eac48fc415..7f0651b69573 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -190,10 +190,10 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -295,7 +295,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	   check for tcp and smb session status done differently
 	   for those three - in the calling routine */
 	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
+		if (tcon->need_reconnect) {
 			/* only tree disconnect, open, and write,
 			  (and ulogoff which does not have tcon)
 			  are allowed as we start force umount */
@@ -337,10 +337,10 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -759,7 +759,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	/* No need to return error on this operation if tid invalidated and
 	closed on server already e.g. due to tcp session crashing */
-	if (tcon->tidStatus == CifsNeedReconnect) {
+	if (tcon->need_reconnect) {
 		up(&tcon->tconSem);
 		return 0;
 	}
@@ -806,32 +806,36 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 		up(&ses->sesSem);
 		return -EBUSY;
 	}
+
+	if (ses->server == NULL)
+		return -EIO;
+
+	if (ses->need_reconnect)
+		goto session_already_dead; /* no need to send SMBlogoff if uid
+					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
 		up(&ses->sesSem);
 		return rc;
 	}
 
-	if (ses->server) {
-		pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = GetNextMid(ses->server);
 
-		if (ses->server->secMode &
+	if (ses->server->secMode &
 		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
 
 	pSMB->hdr.Uid = ses->Suid;
 
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
-	if (ses->server) {
-		atomic_dec(&ses->server->socketUseCount);
-		if (atomic_read(&ses->server->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			ses->server->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			rc = -ESHUTDOWN;
-		}
+session_already_dead:
+	atomic_dec(&ses->server->socketUseCount);
+	if (atomic_read(&ses->server->socketUseCount) == 0) {
+		spin_lock(&GlobalMid_Lock);
+		ses->server->tcpStatus = CifsExiting;
+		spin_unlock(&GlobalMid_Lock);
+		rc = -ESHUTDOWN;
 	}
 	up(&ses->sesSem);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c682be8f2984..c1cd1217c990 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -149,7 +149,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
 		if (ses->server) {
 			if (ses->server == server) {
-				ses->status = CifsNeedReconnect;
+				ses->need_reconnect = true;
 				ses->ipc_tid = 0;
 			}
 		}
@@ -158,7 +158,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		if ((tcon->ses) && (tcon->ses->server == server))
-			tcon->tidStatus = CifsNeedReconnect;
+			tcon->need_reconnect = true;
 	}
 	read_unlock(&GlobalSMBSeslock);
 	/* do not want to be sending data on a socket we are freeing */
@@ -1891,6 +1891,92 @@ kill_cifsd(struct TCP_Server_Info *server)
 		force_sig(SIGKILL, task);
 }
 
+static void setup_cifs_sb(struct smb_vol *pvolume_info,
+			  struct cifs_sb_info *cifs_sb)
+{
+	if (pvolume_info->rsize > CIFSMaxBufSize) {
+		cERROR(1, ("rsize %d too large, using MaxBufSize",
+			pvolume_info->rsize));
+		cifs_sb->rsize = CIFSMaxBufSize;
+	} else if ((pvolume_info->rsize) &&
+			(pvolume_info->rsize <= CIFSMaxBufSize))
+		cifs_sb->rsize = pvolume_info->rsize;
+	else /* default */
+		cifs_sb->rsize = CIFSMaxBufSize;
+
+	if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+		cERROR(1, ("wsize %d too large, using 4096 instead",
+			  pvolume_info->wsize));
+		cifs_sb->wsize = 4096;
+	} else if (pvolume_info->wsize)
+		cifs_sb->wsize = pvolume_info->wsize;
+	else
+		cifs_sb->wsize = min_t(const int,
+					PAGEVEC_SIZE * PAGE_CACHE_SIZE,
+					127*1024);
+		/* old default of CIFSMaxBufSize was too small now
+		   that SMB Write2 can send multiple pages in kvec.
+		   RFC1001 does not describe what happens when frame
+		   bigger than 128K is sent so use that as max in
+		   conjunction with 52K kvec constraint on arch with 4K
+		   page size  */
+
+	if (cifs_sb->rsize < 2048) {
+		cifs_sb->rsize = 2048;
+		/* Windows ME may prefer this */
+		cFYI(1, ("readsize set to minimum: 2048"));
+	}
+	/* calculate prepath */
+	cifs_sb->prepath = pvolume_info->prepath;
+	if (cifs_sb->prepath) {
+		cifs_sb->prepathlen = strlen(cifs_sb->prepath);
+		/* we can not convert the / to \ in the path
+		separators in the prefixpath yet because we do not
+		know (until reset_cifs_unix_caps is called later)
+		whether POSIX PATH CAP is available. We normalize
+		the / to \ after reset_cifs_unix_caps is called */
+		pvolume_info->prepath = NULL;
+	} else
+		cifs_sb->prepathlen = 0;
+	cifs_sb->mnt_uid = pvolume_info->linux_uid;
+	cifs_sb->mnt_gid = pvolume_info->linux_gid;
+	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
+	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
+	cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
+		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+
+	if (pvolume_info->noperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
+	if (pvolume_info->setuids)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+	if (pvolume_info->server_ino)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
+	if (pvolume_info->remap)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
+	if (pvolume_info->no_xattr)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
+	if (pvolume_info->sfu_emul)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
+	if (pvolume_info->nobrl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->cifs_acl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+	if (pvolume_info->override_uid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+	if (pvolume_info->override_gid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+	if (pvolume_info->dynperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+	if (pvolume_info->direct_io) {
+		cFYI(1, ("mounting share using direct i/o"));
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
+	}
+
+	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
+		cERROR(1, ("mount option dynperm ignored if cifsacl "
+			   "mount option supported"));
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -1996,9 +2082,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	if (srvTcp) {
-		cFYI(1, ("Existing tcp session with server found"));
-	} else {	/* create socket */
+	if (!srvTcp) {	/* create socket */
 		if (volume_info.port)
 			sin_server.sin_port = htons(volume_info.port);
 		else
@@ -2074,7 +2158,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
 		down(&pSesInfo->sesSem);
-		if (pSesInfo->status == CifsNeedReconnect) {
+		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
@@ -2124,146 +2208,59 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	/* search for existing tcon to this server share */
 	if (!rc) {
-		if (volume_info.rsize > CIFSMaxBufSize) {
-			cERROR(1, ("rsize %d too large, using MaxBufSize",
-				volume_info.rsize));
-			cifs_sb->rsize = CIFSMaxBufSize;
-		} else if ((volume_info.rsize) &&
-				(volume_info.rsize <= CIFSMaxBufSize))
-			cifs_sb->rsize = volume_info.rsize;
-		else /* default */
-			cifs_sb->rsize = CIFSMaxBufSize;
-
-		if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-			cERROR(1, ("wsize %d too large, using 4096 instead",
-				  volume_info.wsize));
-			cifs_sb->wsize = 4096;
-		} else if (volume_info.wsize)
-			cifs_sb->wsize = volume_info.wsize;
-		else
-			cifs_sb->wsize =
-				min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
-					127*1024);
-			/* old default of CIFSMaxBufSize was too small now
-			   that SMB Write2 can send multiple pages in kvec.
-			   RFC1001 does not describe what happens when frame
-			   bigger than 128K is sent so use that as max in
-			   conjunction with 52K kvec constraint on arch with 4K
-			   page size  */
-
-		if (cifs_sb->rsize < 2048) {
-			cifs_sb->rsize = 2048;
-			/* Windows ME may prefer this */
-			cFYI(1, ("readsize set to minimum: 2048"));
-		}
-		/* calculate prepath */
-		cifs_sb->prepath = volume_info.prepath;
-		if (cifs_sb->prepath) {
-			cifs_sb->prepathlen = strlen(cifs_sb->prepath);
-			/* we can not convert the / to \ in the path
-			separators in the prefixpath yet because we do not
-			know (until reset_cifs_unix_caps is called later)
-			whether POSIX PATH CAP is available. We normalize
-			the / to \ after reset_cifs_unix_caps is called */
-			volume_info.prepath = NULL;
-		} else
-			cifs_sb->prepathlen = 0;
-		cifs_sb->mnt_uid = volume_info.linux_uid;
-		cifs_sb->mnt_gid = volume_info.linux_gid;
-		cifs_sb->mnt_file_mode = volume_info.file_mode;
-		cifs_sb->mnt_dir_mode = volume_info.dir_mode;
-		cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-			cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
-
-		if (volume_info.noperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-		if (volume_info.setuids)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-		if (volume_info.server_ino)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-		if (volume_info.remap)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-		if (volume_info.no_xattr)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-		if (volume_info.sfu_emul)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-		if (volume_info.nobrl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-		if (volume_info.cifs_acl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-		if (volume_info.override_uid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
-		if (volume_info.override_gid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
-		if (volume_info.dynperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
-		if (volume_info.direct_io) {
-			cFYI(1, ("mounting share using direct i/o"));
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-		}
-
-		if ((volume_info.cifs_acl) && (volume_info.dynperm))
-			cERROR(1, ("mount option dynperm ignored if cifsacl "
-				   "mount option supported"));
+		setup_cifs_sb(&volume_info, cifs_sb);
 
 		tcon =
 		    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
 			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-			/* we can have only one retry value for a connection
-			   to a share so for resources mounted more than once
-			   to the same server share the last value passed in
-			   for the retry flag is used */
-			tcon->retry = volume_info.retry;
-			tcon->nocase = volume_info.nocase;
-			tcon->local_lease = volume_info.local_lease;
 			if (tcon->seal != volume_info.seal)
 				cERROR(1, ("transport encryption setting "
 					   "conflicts with existing tid"));
 		} else {
 			tcon = tconInfoAlloc();
-			if (tcon == NULL)
+			if (tcon == NULL) {
 				rc = -ENOMEM;
-			else {
-				/* check for null share name ie connecting to
-				 * dfs root */
-
-				/* BB check if this works for exactly length
-				 * three strings */
-				if ((strchr(volume_info.UNC + 3, '\\') == NULL)
-				    && (strchr(volume_info.UNC + 3, '/') ==
-					NULL)) {
-/*					rc = connect_to_dfs_path(xid, pSesInfo,
-						"", cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						  CIFS_MOUNT_MAP_SPECIAL_CHR);*/
-					cFYI(1, ("DFS root not supported"));
-					rc = -ENODEV;
-					goto out;
-				} else {
-					/* BB Do we need to wrap sesSem around
-					 * this TCon call and Unix SetFS as
-					 * we do on SessSetup and reconnect? */
-					rc = CIFSTCon(xid, pSesInfo,
-						volume_info.UNC,
-						tcon, cifs_sb->local_nls);
-					cFYI(1, ("CIFS Tcon rc = %d", rc));
-					if (volume_info.nodfs) {
-						tcon->Flags &=
-							~SMB_SHARE_IS_IN_DFS;
-						cFYI(1, ("DFS disabled (%d)",
-							tcon->Flags));
-					}
-				}
-				if (!rc) {
-					atomic_inc(&pSesInfo->inUse);
-					tcon->retry = volume_info.retry;
-					tcon->nocase = volume_info.nocase;
-					tcon->seal = volume_info.seal;
+				goto mount_fail_check;
+			}
+
+			/* check for null share name ie connect to dfs root */
+
+			/* BB check if works for exactly length 3 strings */
+			if ((strchr(volume_info.UNC + 3, '\\') == NULL)
+			    && (strchr(volume_info.UNC + 3, '/') == NULL)) {
+				/* rc = connect_to_dfs_path(...) */
+				cFYI(1, ("DFS root not supported"));
+				rc = -ENODEV;
+				goto mount_fail_check;
+			} else {
+				/* BB Do we need to wrap sesSem around
+				 * this TCon call and Unix SetFS as
+				 * we do on SessSetup and reconnect? */
+				rc = CIFSTCon(xid, pSesInfo, volume_info.UNC,
+					      tcon, cifs_sb->local_nls);
+				cFYI(1, ("CIFS Tcon rc = %d", rc));
+				if (volume_info.nodfs) {
+					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+					cFYI(1, ("DFS disabled (%d)",
+						tcon->Flags));
 				}
 			}
+			if (!rc) {
+				atomic_inc(&pSesInfo->inUse);
+				tcon->seal = volume_info.seal;
+			} else
+				goto mount_fail_check;
 		}
+
+		/* we can have only one retry value for a connection
+		   to a share so for resources mounted more than once
+		   to the same server share the last value passed in
+		   for the retry flag is used */
+		tcon->retry = volume_info.retry;
+		tcon->nocase = volume_info.nocase;
+		tcon->local_lease = volume_info.local_lease;
 	}
 	if (pSesInfo) {
 		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
@@ -2276,6 +2273,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	sb->s_time_gran = 100;
 
 /* on error free sesinfo and tcon struct if needed */
+mount_fail_check:
 	if (rc) {
 		/* if session setup failed, use count is zero but
 		we still need to free cifsd thread */
@@ -3518,6 +3516,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
 		tcon->tidStatus = CifsGood;
+		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
 		length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
@@ -3746,6 +3745,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		cFYI(1, ("CIFS Session Established successfully"));
 			spin_lock(&GlobalMid_Lock);
 			pSesInfo->status = CifsGood;
+			pSesInfo->need_reconnect = false;
 			spin_unlock(&GlobalMid_Lock);
 	}
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ead1a3bb0256..1540adaa593d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -493,7 +493,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->wrtPending) != 0)
 					&& (timeout <= 2048)) {
-- 
cgit v1.2.2


From fb396016647ae9de5b3bd8c4ee4f7b9cc7148bd5 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 13 Nov 2008 20:04:07 +0000
Subject: [CIFS] remove unused list, add new cifs sock list to prepare for
 mount/umount fix

Also adds two lines missing from the previous patch (for the need reconnect flag in the
/proc/fs/cifs/DebugData handling)

The new global_cifs_sock_list is added, and initialized in init_cifs but not used yet.
Jeff Layton will be adding code in to use that and to remove the GlobalTcon and GlobalSMBSession
lists.

CC: Jeff Layton <jlayton@redhat.com>
CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |  4 ++--
 fs/cifs/cifsfs.c     |  6 +++---
 fs/cifs/cifsglob.h   | 23 ++++++++---------------
 3 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 69a12aae91d3..ba8723d95996 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -204,7 +204,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		else
 			seq_printf(m, " type: %d ", dev_type);
 
-		if (tcon->tidStatus == CifsNeedReconnect)
+		if (tcon->need_reconnect)
 			seq_puts(m, "\tDISCONNECTED ");
 	}
 	read_unlock(&GlobalSMBSeslock);
@@ -311,7 +311,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		i++;
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		seq_printf(m, "\n%d) %s", i, tcon->treeName);
-		if (tcon->tidStatus == CifsNeedReconnect)
+		if (tcon->need_reconnect)
 			seq_puts(m, "\tDISCONNECTED ");
 		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
 			atomic_read(&tcon->num_smbs_sent),
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 903bbd6449d1..af16a2406b1c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1059,9 +1059,9 @@ init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
-/*	INIT_LIST_HEAD(&GlobalServerList);*/	/* BB not implemented yet */
-	INIT_LIST_HEAD(&GlobalSMBSessionList);
-	INIT_LIST_HEAD(&GlobalTreeConnectionList);
+	INIT_LIST_HEAD(&global_cifs_sock_list);
+	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
+	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index dc0aa140f1bf..d6357dc1be72 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -592,22 +592,15 @@ require use of the stronger protocol */
 #define GLOBAL_EXTERN extern
 #endif
 
-/*
- * The list of servers that did not respond with NT LM 0.12.
- * This list helps improve performance and eliminate the messages indicating
- * that we had a communications error talking to the server in this list.
- */
-/* Feature not supported */
-/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
-
-/*
- * The following is a hash table of all the users we know about.
- */
-GLOBAL_EXTERN struct smbUidInfo *GlobalUidList[UID_HASH];
 
-/* GLOBAL_EXTERN struct list_head GlobalServerList; BB not implemented yet */
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList;
+/* the list of TCP_Server_Info structures, ie each of the sockets
+ * connecting our client to a distinct server (ip address), is
+ * chained together by global_cifs_sock_list. The list of all our SMB
+ * sessions (and from that the tree connections) can be found
+ * by iterating over global_cifs_sock_list */
+GLOBAL_EXTERN struct list_head global_cifs_sock_list;
+GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
+GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
-- 
cgit v1.2.2


From 3ec332ef7a38c2327e18d087d4120a8e3bd3dc6e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 14 Nov 2008 03:35:10 +0000
Subject: [CIFS] clean up server protocol handling

We're currently declaring both a sockaddr_in and sockaddr6_in on the
stack, but we really only need storage for one of them. Declare a
sockaddr struct and cast it to the proper type. Also, eliminate the
protocolType field in the TCP_Server_Info struct. It's redundant since
we have a sa_family field in the sockaddr anyway.

We may need to revisit this if SCTP is ever implemented, but for now
this will simplify the code.

CIFS over IPv6 also has a number of problems currently. This fixes all
of them that I found. Eventually, it would be nice to move more of the
code to be protocol independent, but this is a start.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c |  4 ++--
 fs/cifs/cifsglob.h    |  3 +--
 fs/cifs/connect.c     | 57 +++++++++++++++++++++++++++------------------------
 3 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index fcee9298b620..0ab2fb5afef1 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -73,8 +73,8 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN	13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
-#define MAX_IPV6_ADDR_LEN	42
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
+#define MAX_IPV6_ADDR_LEN	43
 
 /* strlen of "host=" */
 #define HOST_KEY_LEN		5
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d6357dc1be72..13dc48414a78 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -85,8 +85,7 @@ enum securityEnum {
 };
 
 enum protocolEnum {
-	IPV4 = 0,
-	IPV6,
+	TCP = 0,
 	SCTP
 	/* Netbios frames protocol not supported at this time */
 };
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c1cd1217c990..30ab8dc68e17 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -193,7 +193,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	while ((server->tcpStatus != CifsExiting) &&
 	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
-		if (server->protocolType == IPV6) {
+		if (server->addr.sockAddr6.sin6_family == AF_INET6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
 					  &server->ssocket, server->noautotune);
 		} else {
@@ -1983,10 +1983,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 {
 	int rc = 0;
 	int xid;
-	int address_type = AF_INET;
 	struct socket *csocket = NULL;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
+	struct sockaddr addr;
+	struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
+	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	struct smb_vol volume_info;
 	struct cifsSesInfo *pSesInfo = NULL;
 	struct cifsSesInfo *existingCifsSes = NULL;
@@ -1997,6 +1997,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 /* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */
 
+	memset(&addr, 0, sizeof(struct sockaddr));
 	memset(&volume_info, 0, sizeof(struct smb_vol));
 	if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
 		rc = -EINVAL;
@@ -2019,16 +2020,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	if (volume_info.UNCip && volume_info.UNC) {
 		rc = cifs_inet_pton(AF_INET, volume_info.UNCip,
-				    &sin_server.sin_addr.s_addr);
+				    &sin_server->sin_addr.s_addr);
 
 		if (rc <= 0) {
 			/* not ipv4 address, try ipv6 */
 			rc = cifs_inet_pton(AF_INET6, volume_info.UNCip,
-					    &sin_server6.sin6_addr.in6_u);
+					    &sin_server6->sin6_addr.in6_u);
 			if (rc > 0)
-				address_type = AF_INET6;
+				addr.sa_family = AF_INET6;
 		} else {
-			address_type = AF_INET;
+			addr.sa_family = AF_INET;
 		}
 
 		if (rc <= 0) {
@@ -2068,39 +2069,38 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (address_type == AF_INET)
-		existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
+	if (addr.sa_family == AF_INET)
+		existingCifsSes = cifs_find_tcp_session(&sin_server->sin_addr,
 			NULL /* no ipv6 addr */,
 			volume_info.username, &srvTcp);
-	else if (address_type == AF_INET6) {
+	else if (addr.sa_family == AF_INET6) {
 		cFYI(1, ("looking for ipv6 address"));
 		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-			&sin_server6.sin6_addr,
+			&sin_server6->sin6_addr,
 			volume_info.username, &srvTcp);
 	} else {
 		rc = -EINVAL;
 		goto out;
 	}
 
-	if (!srvTcp) {	/* create socket */
-		if (volume_info.port)
-			sin_server.sin_port = htons(volume_info.port);
-		else
-			sin_server.sin_port = 0;
-		if (address_type == AF_INET6) {
+	if (!srvTcp) {
+		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket,
+			sin_server6->sin6_port = htons(volume_info.port);
+			rc = ipv6_connect(sin_server6, &csocket,
 					volume_info.noblocksnd);
-		} else
-			rc = ipv4_connect(&sin_server, &csocket,
+		} else {
+			sin_server->sin_port = htons(volume_info.port);
+			rc = ipv4_connect(sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
 				  volume_info.target_rfc1001_name,
 				  volume_info.noblocksnd,
 				  volume_info.noautotune);
+		}
 		if (rc < 0) {
-			cERROR(1, ("Error connecting to IPv4 socket. "
+			cERROR(1, ("Error connecting to socket. "
 				   "Aborting operation"));
 			if (csocket != NULL)
 				sock_release(csocket);
@@ -2115,12 +2115,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		} else {
 			srvTcp->noblocksnd = volume_info.noblocksnd;
 			srvTcp->noautotune = volume_info.noautotune;
-			memcpy(&srvTcp->addr.sockAddr, &sin_server,
-				sizeof(struct sockaddr_in));
+			if (addr.sa_family == AF_INET6)
+				memcpy(&srvTcp->addr.sockAddr6, sin_server6,
+					sizeof(struct sockaddr_in6));
+			else
+				memcpy(&srvTcp->addr.sockAddr, sin_server,
+					sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
 			/* BB Add code for ipv6 case too */
 			srvTcp->ssocket = csocket;
-			srvTcp->protocolType = IPV4;
 			srvTcp->hostname = extract_hostname(volume_info.UNC);
 			if (IS_ERR(srvTcp->hostname)) {
 				rc = PTR_ERR(srvTcp->hostname);
@@ -2172,7 +2175,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		else {
 			pSesInfo->server = srvTcp;
 			sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-				NIPQUAD(sin_server.sin_addr.s_addr));
+				NIPQUAD(sin_server->sin_addr.s_addr));
 		}
 
 		if (!rc) {
@@ -2211,7 +2214,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		setup_cifs_sb(&volume_info, cifs_sb);
 
 		tcon =
-		    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
+		    find_unc(sin_server->sin_addr.s_addr, volume_info.UNC,
 			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-- 
cgit v1.2.2


From e7ddee9037e7dd43de1ad08b51727e552aedd836 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Nov 2008 13:44:38 -0500
Subject: cifs: disable sharing session and tcon and add new TCP sharing code

The code that allows these structs to be shared is extremely racy.
Disable the sharing of SMB and tcon structs for now until we can
come up with a way to do this that's race free.

We want to continue to share TCP sessions, however since they are
required for multiuser mounts. For that, implement a new (hopefully
race-free) scheme. Add a new global list of TCP sessions, and take
care to get a reference to it whenever we're dealing with one.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |   2 +-
 fs/cifs/cifsfs.c     |   3 +-
 fs/cifs/cifsglob.h   |  17 +++--
 fs/cifs/cifsproto.h  |   1 +
 fs/cifs/cifssmb.c    |  18 ++---
 fs/cifs/connect.c    | 205 ++++++++++++++++++---------------------------------
 6 files changed, 96 insertions(+), 150 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ba8723d95996..40b5108fb4f9 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -144,7 +144,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
 				    "Server: %d SecMode: 0x%x Req On Wire: %d",
 				ses->server->tcpStatus,
-				atomic_read(&ses->server->socketUseCount),
+				ses->server->srv_count,
 				ses->server->secMode,
 				atomic_read(&ses->server->inFlight));
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index af16a2406b1c..2946dab0718f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1059,7 +1059,7 @@ init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
-	INIT_LIST_HEAD(&global_cifs_sock_list);
+	INIT_LIST_HEAD(&cifs_tcp_ses_list);
 	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -1089,6 +1089,7 @@ init_cifs(void)
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
+	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 13dc48414a78..313f7bfedec7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -123,6 +123,7 @@ struct cifs_cred {
 struct TCP_Server_Info {
 	struct list_head tcp_ses_list;
 	struct list_head smb_ses_list;
+	int srv_count; /* reference counter */
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -144,7 +145,6 @@ struct TCP_Server_Info {
 	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
-	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t inSend; /* requests trying to send */
@@ -591,13 +591,18 @@ require use of the stronger protocol */
 #define GLOBAL_EXTERN extern
 #endif
 
-
-/* the list of TCP_Server_Info structures, ie each of the sockets
+/*
+ * the list of TCP_Server_Info structures, ie each of the sockets
  * connecting our client to a distinct server (ip address), is
- * chained together by global_cifs_sock_list. The list of all our SMB
+ * chained together by cifs_tcp_ses_list. The list of all our SMB
  * sessions (and from that the tree connections) can be found
- * by iterating over global_cifs_sock_list */
-GLOBAL_EXTERN struct list_head global_cifs_sock_list;
+ * by iterating over cifs_tcp_ses_list
+ */
+GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
+
+/* protects cifs_tcp_ses_list and srv_count for each tcp session */
+GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
+
 GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
 GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f21ecb85ce5..0250a994c6e6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,6 +102,7 @@ extern void acl_to_uid_mode(struct inode *inode, const char *path,
 			    const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
+extern void cifs_put_tcp_session(struct TCP_Server_Info *server);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7f0651b69573..cd9e9a145e4d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -664,8 +664,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			rc = -EIO;
 			goto neg_err_exit;
 		}
-
-		if (server->socketUseCount.counter > 1) {
+		read_lock(&cifs_tcp_ses_lock);
+		if (server->srv_count > 1) {
+			read_unlock(&cifs_tcp_ses_lock);
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
@@ -674,9 +675,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 					pSMBr->u.extended_response.GUID,
 					16);
 			}
-		} else
+		} else {
+			read_unlock(&cifs_tcp_ses_lock);
 			memcpy(server->server_GUID,
 			       pSMBr->u.extended_response.GUID, 16);
+		}
 
 		if (count == 16) {
 			server->secType = RawNTLMSSP;
@@ -830,12 +833,9 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	atomic_dec(&ses->server->socketUseCount);
-	if (atomic_read(&ses->server->socketUseCount) == 0) {
-		spin_lock(&GlobalMid_Lock);
-		ses->server->tcpStatus = CifsExiting;
-		spin_unlock(&GlobalMid_Lock);
-		rc = -ESHUTDOWN;
+	if (ses->server) {
+		cifs_put_tcp_session(ses->server);
+		rc = 0;
 	}
 	up(&ses->sesSem);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 30ab8dc68e17..a0314259f94d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -659,6 +659,11 @@ multi_t2_fnd:
 		}
 	} /* end while !EXITING */
 
+	/* take it off the list, if it's not already */
+	write_lock(&cifs_tcp_ses_lock);
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
@@ -1357,92 +1362,66 @@ cifs_parse_mount_options(char *options, const char *devname,
 	return 0;
 }
 
-static struct cifsSesInfo *
-cifs_find_tcp_session(struct in_addr *target_ip_addr,
-		      struct in6_addr *target_ip6_addr,
-		      char *userName, struct TCP_Server_Info **psrvTcp)
+static struct TCP_Server_Info *
+cifs_find_tcp_session(struct sockaddr *addr)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
-
-	*psrvTcp = NULL;
-
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (!ses->server)
+	struct TCP_Server_Info *server;
+	struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info,
+				    tcp_ses_list);
+
+		/*
+		 * the demux thread can exit on its own while still in CifsNew
+		 * so don't accept any sockets in that state. Since the
+		 * tcpStatus never changes back to CifsNew it's safe to check
+		 * for this without a lock.
+		 */
+		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (target_ip_addr &&
-		    ses->server->addr.sockAddr.sin_addr.s_addr != target_ip_addr->s_addr)
-				continue;
-		else if (target_ip6_addr &&
-			 memcmp(&ses->server->addr.sockAddr6.sin6_addr,
-				target_ip6_addr, sizeof(*target_ip6_addr)))
-				continue;
-		/* BB lock server and tcp session; increment use count here?? */
-
-		/* found a match on the TCP session */
-		*psrvTcp = ses->server;
+		if (addr->sa_family == AF_INET &&
+		    (addr4->sin_addr.s_addr !=
+		     server->addr.sockAddr.sin_addr.s_addr))
+			continue;
+		else if (addr->sa_family == AF_INET6 &&
+			 memcmp(&server->addr.sockAddr6.sin6_addr,
+				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+			continue;
 
-		/* BB check if reconnection needed */
-		if (strncmp(ses->userName, userName, MAX_USERNAME_SIZE) == 0) {
-			read_unlock(&GlobalSMBSeslock);
-			/* Found exact match on both TCP and
-			   SMB sessions */
-			return ses;
-		}
-		/* else tcp and smb sessions need reconnection */
+		++server->srv_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return server;
 	}
-	read_unlock(&GlobalSMBSeslock);
-
+	write_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
-static struct cifsTconInfo *
-find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
+void
+cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
-	struct list_head *tmp;
-	struct cifsTconInfo *tcon;
-	__be32 old_ip;
-
-	read_lock(&GlobalSMBSeslock);
-
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		cFYI(1, ("Next tcon"));
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if (!tcon->ses || !tcon->ses->server)
-			continue;
-
-		old_ip = tcon->ses->server->addr.sockAddr.sin_addr.s_addr;
-		cFYI(1, ("old ip addr: %x == new ip %x ?",
-			old_ip, new_target_ip_addr));
-
-		if (old_ip != new_target_ip_addr)
-			continue;
-
-		/* BB lock tcon, server, tcp session and increment use count? */
-		/* found a match on the TCP session */
-		/* BB check if reconnection needed */
-		cFYI(1, ("IP match, old UNC: %s new: %s",
-			tcon->treeName, uncName));
+	struct task_struct *task;
 
-		if (strncmp(tcon->treeName, uncName, MAX_TREE_SIZE))
-			continue;
+	write_lock(&cifs_tcp_ses_lock);
+	if (--server->srv_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
 
-		cFYI(1, ("and old usr: %s new: %s",
-			tcon->treeName, uncName));
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
 
-		if (strncmp(tcon->ses->userName, userName, MAX_USERNAME_SIZE))
-			continue;
-
-		/* matched smb session (user name) */
-		read_unlock(&GlobalSMBSeslock);
-		return tcon;
-	}
+	spin_lock(&GlobalMid_Lock);
+	server->tcpStatus = CifsExiting;
+	spin_unlock(&GlobalMid_Lock);
 
-	read_unlock(&GlobalSMBSeslock);
-	return NULL;
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
 }
 
 int
@@ -1881,16 +1860,6 @@ convert_delimiter(char *path, char delim)
 	}
 }
 
-static void
-kill_cifsd(struct TCP_Server_Info *server)
-{
-	struct task_struct *task;
-
-	task = xchg(&server->tsk, NULL);
-	if (task)
-		force_sig(SIGKILL, task);
-}
-
 static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
@@ -2069,21 +2038,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (addr.sa_family == AF_INET)
-		existingCifsSes = cifs_find_tcp_session(&sin_server->sin_addr,
-			NULL /* no ipv6 addr */,
-			volume_info.username, &srvTcp);
-	else if (addr.sa_family == AF_INET6) {
-		cFYI(1, ("looking for ipv6 address"));
-		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-			&sin_server6->sin6_addr,
-			volume_info.username, &srvTcp);
-	} else {
-		rc = -EINVAL;
-		goto out;
-	}
-
-	if (!srvTcp) {
+	srvTcp = cifs_find_tcp_session(&addr);
+	if (srvTcp) {
+		cFYI(1, ("Existing tcp session with server found"));
+	} else {	/* create socket */
 		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
@@ -2153,6 +2111,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			memcpy(srvTcp->server_RFC1001_name,
 				volume_info.target_rfc1001_name, 16);
 			srvTcp->sequence_number = 0;
+			INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+			++srvTcp->srv_count;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&srvTcp->tcp_ses_list,
+				 &cifs_tcp_ses_list);
+			write_unlock(&cifs_tcp_ses_lock);
 		}
 	}
 
@@ -2204,8 +2168,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 			up(&pSesInfo->sesSem);
-			if (!rc)
-				atomic_inc(&srvTcp->socketUseCount);
 		}
 	}
 
@@ -2213,9 +2175,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (!rc) {
 		setup_cifs_sb(&volume_info, cifs_sb);
 
-		tcon =
-		    find_unc(sin_server->sin_addr.s_addr, volume_info.UNC,
-			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
 			if (tcon->seal != volume_info.seal)
@@ -2278,35 +2237,21 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 /* on error free sesinfo and tcon struct if needed */
 mount_fail_check:
 	if (rc) {
-		/* if session setup failed, use count is zero but
-		we still need to free cifsd thread */
-		if (atomic_read(&srvTcp->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			srvTcp->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			kill_cifsd(srvTcp);
-		}
-		 /* If find_unc succeeded then rc == 0 so we can not end */
-		if (tcon)  /* up accidently freeing someone elses tcon struct */
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
 			tconInfoFree(tcon);
+
 		if (existingCifsSes == NULL) {
 			if (pSesInfo) {
 				if ((pSesInfo->server) &&
-				    (pSesInfo->status == CifsGood)) {
-					int temp_rc;
-					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
-					/* if the socketUseCount is now zero */
-					if ((temp_rc == -ESHUTDOWN) &&
-					    (pSesInfo->server))
-						kill_cifsd(pSesInfo->server);
-				} else {
+				    (pSesInfo->status == CifsGood))
+					CIFSSMBLogoff(xid, pSesInfo);
+				else {
 					cFYI(1, ("No session or bad tcon"));
-					if (pSesInfo->server) {
-						spin_lock(&GlobalMid_Lock);
-						srvTcp->tcpStatus = CifsExiting;
-						spin_unlock(&GlobalMid_Lock);
-						kill_cifsd(pSesInfo->server);
-					}
+					if (pSesInfo->server)
+						cifs_put_tcp_session(
+							pSesInfo->server);
 				}
 				sesInfoFree(pSesInfo);
 				/* pSesInfo = NULL; */
@@ -3613,13 +3558,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 			if (rc == -EBUSY) {
 				FreeXid(xid);
 				return 0;
-			} else if (rc == -ESHUTDOWN) {
-				cFYI(1, ("Waking up socket by sending signal"));
-				if (ses->server)
-					kill_cifsd(ses->server);
-				rc = 0;
-			} /* else - we have an smb session
-				left on this socket do not kill cifsd */
+			}
 		} else
 			cFYI(1, ("No session or bad tcon"));
 	}
-- 
cgit v1.2.2


From 14fbf50d695207754daeb96270b3027a3821121f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Nov 2008 13:53:46 -0500
Subject: cifs: reinstate sharing of SMB sessions sans races

We do this by abandoning the global list of SMB sessions and instead
moving to a per-server list. This entails adding a new list head to the
TCP_Server_Info struct. The refcounting for the cifsSesInfo is moved to
a non-atomic variable. We have to protect it by a lock anyway, so there's
no benefit to making it an atomic. The list and refcount are protected
by the global cifs_tcp_ses_lock.

The patch also adds a new routines to find and put SMB sessions and
that properly take and put references under the lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |  53 ++++++------
 fs/cifs/cifsfs.c     |  17 ++--
 fs/cifs/cifsglob.h   |   6 +-
 fs/cifs/cifsproto.h  |   1 -
 fs/cifs/cifssmb.c    |  22 ++---
 fs/cifs/connect.c    | 226 ++++++++++++++++++++++++++++-----------------------
 fs/cifs/misc.c       |  16 ++--
 7 files changed, 175 insertions(+), 166 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 40b5108fb4f9..59841a68b0b6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,9 +107,9 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
+	struct TCP_Server_Info *server;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	int i;
@@ -122,43 +122,45 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Servers:");
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info,
+				    tcp_ses_list);
 		i++;
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
-		   (ses->serverNOS == NULL)) {
-			seq_printf(m, "\nentry for %s not fully "
-					"displayed\n\t", ses->serverName);
-		} else {
-			seq_printf(m,
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			if ((ses->serverDomain == NULL) ||
+				(ses->serverOS == NULL) ||
+				(ses->serverNOS == NULL)) {
+				seq_printf(m, "\nentry for %s not fully "
+					   "displayed\n\t", ses->serverName);
+			} else {
+				seq_printf(m,
 				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
 				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
 				i, ses->serverName, ses->serverDomain,
-				atomic_read(&ses->inUse),
-				ses->serverOS, ses->serverNOS,
+				ses->ses_count, ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->status);
-		}
-		if (ses->server) {
+			}
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
-				    "Server: %d SecMode: 0x%x Req On Wire: %d",
-				ses->server->tcpStatus,
-				ses->server->srv_count,
-				ses->server->secMode,
-				atomic_read(&ses->server->inFlight));
+				   "Server: %d SecMode: 0x%x Req On Wire: %d",
+				   server->tcpStatus, server->srv_count,
+				   server->secMode,
+				   atomic_read(&server->inFlight));
 
 #ifdef CONFIG_CIFS_STATS2
 			seq_printf(m, " In Send: %d In MaxReq Wait: %d",
-				atomic_read(&ses->server->inSend),
-				atomic_read(&ses->server->num_waiters));
+				atomic_read(&server->inSend),
+				atomic_read(&server->num_waiters));
 #endif
 
 			seq_puts(m, "\nMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
-			list_for_each(tmp1, &ses->server->pending_mid_q) {
-				mid_entry = list_entry(tmp1, struct
+			list_for_each(tmp3, &server->pending_mid_q) {
+				mid_entry = list_entry(tmp3, struct
 					mid_q_entry,
 					qhead);
 				seq_printf(m, "State: %d com: %d pid:"
@@ -171,9 +173,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
-
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
 	seq_puts(m, "Shares:");
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2946dab0718f..a1e96620b097 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1031,24 +1031,24 @@ static int cifs_oplock_thread(void *dummyarg)
 static int cifs_dnotify_thread(void *dummyarg)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
 
 	do {
 		if (try_to_freeze())
 			continue;
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(15*HZ);
-		read_lock(&GlobalSMBSeslock);
 		/* check if any stuck requests that need
 		   to be woken up and wakeq so the
 		   thread can wake up and error out */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-			if (ses->server && atomic_read(&ses->server->inFlight))
-				wake_up_all(&ses->server->response_q);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp, &cifs_tcp_ses_list) {
+			server = list_entry(tmp, struct TCP_Server_Info,
+					 tcp_ses_list);
+			if (atomic_read(&server->inFlight))
+				wake_up_all(&server->response_q);
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} while (!kthread_should_stop());
 
 	return 0;
@@ -1060,7 +1060,6 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 313f7bfedec7..631a99f72f22 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -195,14 +195,14 @@ struct cifsUidInfo {
  * Session structure.  One of these for each uid session with a particular host
  */
 struct cifsSesInfo {
-	struct list_head cifsSessionList;
+	struct list_head smb_ses_list;
 	struct list_head tcon_list;
 	struct semaphore sesSem;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
-	atomic_t inUse; /* # of mounts (tree connections) on this ses */
+	int ses_count;		/* reference counter */
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
@@ -602,8 +602,6 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 
 /* protects cifs_tcp_ses_list and srv_count for each tcp session */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
 GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0250a994c6e6..6f21ecb85ce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,7 +102,6 @@ extern void acl_to_uid_mode(struct inode *inode, const char *path,
 			    const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
-extern void cifs_put_tcp_session(struct TCP_Server_Info *server);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cd9e9a145e4d..9c95617baa4d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -799,20 +799,16 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	int rc = 0;
 
 	cFYI(1, ("In SMBLogoff for session disconnect"));
-	if (ses)
-		down(&ses->sesSem);
-	else
-		return -EIO;
-
-	atomic_dec(&ses->inUse);
-	if (atomic_read(&ses->inUse) > 0) {
-		up(&ses->sesSem);
-		return -EBUSY;
-	}
 
-	if (ses->server == NULL)
+	/*
+	 * BB: do we need to check validity of ses and server? They should
+	 * always be valid since we have an active reference. If not, that
+	 * should probably be a BUG()
+	 */
+	if (!ses || !ses->server)
 		return -EIO;
 
+	down(&ses->sesSem);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
@@ -833,10 +829,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	if (ses->server) {
-		cifs_put_tcp_session(ses->server);
-		rc = 0;
-	}
 	up(&ses->sesSem);
 
 	/* if session dead then we do not need to do ulogoff,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a0314259f94d..44130e052e0b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -144,23 +144,18 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (ses->server) {
-			if (ses->server == server) {
-				ses->need_reconnect = true;
-				ses->ipc_tid = 0;
-			}
-		}
-		/* else tcp and smb sessions need reconnection */
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->need_reconnect = true;
+		ses->ipc_tid = 0;
 	}
+	read_unlock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		if ((tcon->ses) && (tcon->ses->server == server))
 			tcon->need_reconnect = true;
 	}
-	read_unlock(&GlobalSMBSeslock);
 	/* do not want to be sending data on a socket we are freeing */
 	down(&server->tcpSem);
 	if (server->ssocket) {
@@ -696,29 +691,29 @@ multi_t2_fnd:
 	if (smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(smallbuf);
 
-	read_lock(&GlobalSMBSeslock);
+	/*
+	 * BB: we shouldn't have to do any of this. It shouldn't be
+	 * possible to exit from the thread with active SMB sessions
+	 */
+	read_lock(&cifs_tcp_ses_lock);
 	if (list_empty(&server->pending_mid_q)) {
 		/* loop through server session structures attached to this and
 		    mark them dead */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses =
-			    list_entry(tmp, struct cifsSesInfo,
-				       cifsSessionList);
-			if (ses->server == server) {
-				ses->status = CifsExiting;
-				ses->server = NULL;
-			}
+		list_for_each(tmp, &server->smb_ses_list) {
+			ses = list_entry(tmp, struct cifsSesInfo,
+					 smb_ses_list);
+			ses->status = CifsExiting;
+			ses->server = NULL;
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} else {
 		/* although we can not zero the server struct pointer yet,
 		since there are active requests which may depnd on them,
 		mark the corresponding SMB sessions as exiting too */
-		list_for_each(tmp, &GlobalSMBSessionList) {
+		list_for_each(tmp, &server->smb_ses_list) {
 			ses = list_entry(tmp, struct cifsSesInfo,
-					 cifsSessionList);
-			if (ses->server == server)
-				ses->status = CifsExiting;
+					 smb_ses_list);
+			ses->status = CifsExiting;
 		}
 
 		spin_lock(&GlobalMid_Lock);
@@ -733,7 +728,7 @@ multi_t2_fnd:
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -755,14 +750,13 @@ multi_t2_fnd:
 	if there are any pointing to this (e.g
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
-	write_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-		if (ses->server == server)
-			ses->server = NULL;
+	/* BB: This shouldn't be necessary, see above */
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->server = NULL;
 	}
-	write_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
@@ -1401,7 +1395,7 @@ cifs_find_tcp_session(struct sockaddr *addr)
 	return NULL;
 }
 
-void
+static void
 cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
 	struct task_struct *task;
@@ -1424,6 +1418,50 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 		force_sig(SIGKILL, task);
 }
 
+static struct cifsSesInfo *
+cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+{
+	struct list_head *tmp;
+	struct cifsSesInfo *ses;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
+			continue;
+
+		++ses->ses_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_smb_ses(struct cifsSesInfo *ses)
+{
+	int xid;
+	struct TCP_Server_Info *server = ses->server;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if (--ses->ses_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&ses->smb_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	if (ses->status == CifsGood) {
+		xid = GetXid();
+		CIFSSMBLogoff(xid, ses);
+		_FreeXid(xid);
+	}
+	sesInfoFree(ses);
+	cifs_put_tcp_session(server);
+}
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1958,7 +1996,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	struct smb_vol volume_info;
 	struct cifsSesInfo *pSesInfo = NULL;
-	struct cifsSesInfo *existingCifsSes = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
 
@@ -2112,6 +2149,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				volume_info.target_rfc1001_name, 16);
 			srvTcp->sequence_number = 0;
 			INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+			INIT_LIST_HEAD(&srvTcp->smb_ses_list);
 			++srvTcp->srv_count;
 			write_lock(&cifs_tcp_ses_lock);
 			list_add(&srvTcp->tcp_ses_list,
@@ -2120,10 +2158,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (existingCifsSes) {
-		pSesInfo = existingCifsSes;
+	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info.username);
+	if (pSesInfo) {
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
+		/*
+		 * The existing SMB session already has a reference to srvTcp,
+		 * so we can put back the extra one we got before
+		 */
+		cifs_put_tcp_session(srvTcp);
+
 		down(&pSesInfo->sesSem);
 		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
@@ -2134,41 +2178,44 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
-		if (pSesInfo == NULL)
+		if (pSesInfo == NULL) {
 			rc = -ENOMEM;
-		else {
-			pSesInfo->server = srvTcp;
-			sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-				NIPQUAD(sin_server->sin_addr.s_addr));
+			goto mount_fail_check;
 		}
 
-		if (!rc) {
-			/* volume_info.password freed at unmount */
-			if (volume_info.password) {
-				pSesInfo->password = volume_info.password;
-				/* set to NULL to prevent freeing on exit */
-				volume_info.password = NULL;
-			}
-			if (volume_info.username)
-				strncpy(pSesInfo->userName,
-					volume_info.username,
-					MAX_USERNAME_SIZE);
-			if (volume_info.domainname) {
-				int len = strlen(volume_info.domainname);
-				pSesInfo->domainName =
-					kmalloc(len + 1, GFP_KERNEL);
-				if (pSesInfo->domainName)
-					strcpy(pSesInfo->domainName,
-						volume_info.domainname);
-			}
-			pSesInfo->linux_uid = volume_info.linux_uid;
-			pSesInfo->overrideSecFlg = volume_info.secFlg;
-			down(&pSesInfo->sesSem);
-			/* BB FIXME need to pass vol->secFlgs BB */
-			rc = cifs_setup_session(xid, pSesInfo,
-						cifs_sb->local_nls);
-			up(&pSesInfo->sesSem);
-		}
+		/* new SMB session uses our srvTcp ref */
+		pSesInfo->server = srvTcp;
+		sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
+			NIPQUAD(sin_server->sin_addr.s_addr));
+
+		write_lock(&cifs_tcp_ses_lock);
+		list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
+		write_unlock(&cifs_tcp_ses_lock);
+
+		/* volume_info.password freed at unmount */
+		if (volume_info.password) {
+			pSesInfo->password = volume_info.password;
+			/* set to NULL to prevent freeing on exit */
+			volume_info.password = NULL;
+		}
+		if (volume_info.username)
+			strncpy(pSesInfo->userName, volume_info.username,
+				MAX_USERNAME_SIZE);
+		if (volume_info.domainname) {
+			int len = strlen(volume_info.domainname);
+			pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
+			if (pSesInfo->domainName)
+				strcpy(pSesInfo->domainName,
+					volume_info.domainname);
+		}
+		pSesInfo->linux_uid = volume_info.linux_uid;
+		pSesInfo->overrideSecFlg = volume_info.secFlg;
+		down(&pSesInfo->sesSem);
+
+		/* BB FIXME need to pass vol->secFlgs BB */
+		rc = cifs_setup_session(xid, pSesInfo,
+					cifs_sb->local_nls);
+		up(&pSesInfo->sesSem);
 	}
 
 	/* search for existing tcon to this server share */
@@ -2209,11 +2256,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						tcon->Flags));
 				}
 			}
-			if (!rc) {
-				atomic_inc(&pSesInfo->inUse);
-				tcon->seal = volume_info.seal;
-			} else
+			if (rc)
 				goto mount_fail_check;
+			tcon->seal = volume_info.seal;
 		}
 
 		/* we can have only one retry value for a connection
@@ -2234,29 +2279,19 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-/* on error free sesinfo and tcon struct if needed */
 mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
 		/* If find_unc succeeded then rc == 0 so we can not end */
 		/* up accidently freeing someone elses tcon struct */
 		if (tcon)
 			tconInfoFree(tcon);
 
-		if (existingCifsSes == NULL) {
-			if (pSesInfo) {
-				if ((pSesInfo->server) &&
-				    (pSesInfo->status == CifsGood))
-					CIFSSMBLogoff(xid, pSesInfo);
-				else {
-					cFYI(1, ("No session or bad tcon"));
-					if (pSesInfo->server)
-						cifs_put_tcp_session(
-							pSesInfo->server);
-				}
-				sesInfoFree(pSesInfo);
-				/* pSesInfo = NULL; */
-			}
-		}
+		/* should also end up putting our tcp session ref if needed */
+		if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+		else
+			cifs_put_tcp_session(srvTcp);
 	} else {
 		atomic_inc(&tcon->useCount);
 		cifs_sb->tcon = tcon;
@@ -3551,16 +3586,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 		}
 		DeleteTconOplockQEntries(cifs_sb->tcon);
 		tconInfoFree(cifs_sb->tcon);
-		if ((ses) && (ses->server)) {
-			/* save off task so we do not refer to ses later */
-			cFYI(1, ("About to do SMBLogoff "));
-			rc = CIFSSMBLogoff(xid, ses);
-			if (rc == -EBUSY) {
-				FreeXid(xid);
-				return 0;
-			}
-		} else
-			cFYI(1, ("No session or bad tcon"));
+		cifs_put_smb_ses(ses);
 	}
 
 	cifs_sb->tcon = NULL;
@@ -3568,8 +3594,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
-	if (ses)
-		sesInfoFree(ses);
 
 	FreeXid(xid);
 	return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 88786ba02d27..46c8c7baccba 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,12 +75,11 @@ sesInfoAlloc(void)
 
 	ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&sesInfoAllocCount);
 		ret_buf->status = CifsNew;
-		list_add(&ret_buf->cifsSessionList, &GlobalSMBSessionList);
+		++ret_buf->ses_count;
+		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
 		init_MUTEX(&ret_buf->sesSem);
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -93,10 +92,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
 		return;
 	}
 
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&sesInfoAllocCount);
-	list_del(&buf_to_free->cifsSessionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->serverOS);
 	kfree(buf_to_free->serverDomain);
 	kfree(buf_to_free->serverNOS);
@@ -350,9 +346,9 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 				if (current->fsuid != treeCon->ses->linux_uid) {
 					cFYI(1, ("Multiuser mode and UID "
 						 "did not match tcon uid"));
-					read_lock(&GlobalSMBSeslock);
-					list_for_each(temp_item, &GlobalSMBSessionList) {
-						ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList);
+					read_lock(&cifs_tcp_ses_lock);
+					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
+						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current->fsuid) {
 							if (ses->server == treeCon->ses->server) {
 								cFYI(1, ("found matching uid substitute right smb_uid"));
@@ -364,7 +360,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 							}
 						}
 					}
-					read_unlock(&GlobalSMBSeslock);
+					read_unlock(&cifs_tcp_ses_lock);
 				}
 			}
 		}
-- 
cgit v1.2.2


From d82c2df54e2f7e447476350848d8eccc8d2fe46a Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 15 Nov 2008 00:07:26 +0000
Subject: [CIFS] minor cleanup to cifs_mount

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 74 +++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 44130e052e0b..a3dc0d7cafc3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1368,7 +1368,6 @@ cifs_find_tcp_session(struct sockaddr *addr)
 	list_for_each(tmp, &cifs_tcp_ses_list) {
 		server = list_entry(tmp, struct TCP_Server_Info,
 				    tcp_ses_list);
-
 		/*
 		 * the demux thread can exit on its own while still in CifsNew
 		 * so don't accept any sockets in that state. Since the
@@ -1389,6 +1388,7 @@ cifs_find_tcp_session(struct sockaddr *addr)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
+		cFYI(1, ("Existing tcp session with server found"));
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -2076,9 +2076,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	}
 
 	srvTcp = cifs_find_tcp_session(&addr);
-	if (srvTcp) {
-		cFYI(1, ("Existing tcp session with server found"));
-	} else {	/* create socket */
+	if (!srvTcp) { /* create socket */
 		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
@@ -2292,45 +2290,41 @@ mount_fail_check:
 			cifs_put_smb_ses(pSesInfo);
 		else
 			cifs_put_tcp_session(srvTcp);
-	} else {
-		atomic_inc(&tcon->useCount);
-		cifs_sb->tcon = tcon;
-		tcon->ses = pSesInfo;
-
-		/* do not care if following two calls succeed - informational */
-		if (!tcon->ipc) {
-			CIFSSMBQFSDeviceInfo(xid, tcon);
-			CIFSSMBQFSAttributeInfo(xid, tcon);
-		}
-
-		/* tell server which Unix caps we support */
-		if (tcon->ses->capabilities & CAP_UNIX)
-			/* reset of caps checks mount to see if unix extensions
-			   disabled for just this mount */
-			reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
-		else
-			tcon->unix_ext = 0; /* server does not support them */
-
-		/* convert forward to back slashes in prepath here if needed */
-		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
-			convert_delimiter(cifs_sb->prepath,
-					  CIFS_DIR_SEP(cifs_sb));
+		goto out;
+	}
+	atomic_inc(&tcon->useCount);
+	cifs_sb->tcon = tcon;
+	tcon->ses = pSesInfo;
 
-		if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
-			cifs_sb->rsize = 1024 * 127;
-			cFYI(DBG2,
-				("no very large read support, rsize now 127K"));
-		}
-		if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
-			cifs_sb->wsize = min(cifs_sb->wsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
-		if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
-			cifs_sb->rsize = min(cifs_sb->rsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
+	/* do not care if following two calls succeed - informational */
+	if (!tcon->ipc) {
+		CIFSSMBQFSDeviceInfo(xid, tcon);
+		CIFSSMBQFSAttributeInfo(xid, tcon);
 	}
 
+	/* tell server which Unix caps we support */
+	if (tcon->ses->capabilities & CAP_UNIX)
+		/* reset of caps checks mount to see if unix extensions
+		   disabled for just this mount */
+		reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
+	else
+		tcon->unix_ext = 0; /* server does not support them */
+
+	/* convert forward to back slashes in prepath here if needed */
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+		convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+
+	if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
+		cifs_sb->rsize = 1024 * 127;
+		cFYI(DBG2, ("no very large read support, rsize now 127K"));
+	}
+	if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
+		cifs_sb->wsize = min(cifs_sb->wsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+	if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
+		cifs_sb->rsize = min(cifs_sb->rsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+
 	/* volume_info.password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
-- 
cgit v1.2.2


From 8f7b0ba1c853919b85b54774775f567f30006107 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sat, 15 Nov 2008 01:15:43 +0000
Subject: Fix inotify watch removal/umount races

Inotify watch removals suck violently.

To kick the watch out we need (in this order) inode->inotify_mutex and
ih->mutex.  That's fine if we have a hold on inode; however, for all
other cases we need to make damn sure we don't race with umount.  We can
*NOT* just grab a reference to a watch - inotify_unmount_inodes() will
happily sail past it and we'll end with reference to inode potentially
outliving its superblock.

Ideally we just want to grab an active reference to superblock if we
can; that will make sure we won't go into inotify_umount_inodes() until
we are done.  Cleanup is just deactivate_super().

However, that leaves a messy case - what if we *are* racing with
umount() and active references to superblock can't be acquired anymore?
We can bump ->s_count, grab ->s_umount, which will almost certainly wait
until the superblock is shut down and the watch in question is pining
for fjords.  That's fine, but there is a problem - we might have hit the
window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e.
the moment when superblock is past the point of no return and is heading
for shutdown) and the moment when deactivate_super() acquires
->s_umount.

We could just do drop_super() yield() and retry, but that's rather
antisocial and this stuff is luser-triggerable.  OTOH, having grabbed
->s_umount and having found that we'd got there first (i.e.  that
->s_root is non-NULL) we know that we won't race with
inotify_umount_inodes().

So we could grab a reference to watch and do the rest as above, just
with drop_super() instead of deactivate_super(), right? Wrong.  We had
to drop ih->mutex before we could grab ->s_umount.  So the watch
could've been gone already.

That still can be dealt with - we need to save watch->wd, do idr_find()
and compare its result with our pointer.  If they match, we either have
the damn thing still alive or we'd lost not one but two races at once,
the watch had been killed and a new one got created with the same ->wd
at the same address.  That couldn't have happened in inotify_destroy(),
but inotify_rm_wd() could run into that.  Still, "new one got created"
is not a problem - we have every right to kill it or leave it alone,
whatever's more convenient.

So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
"grab it and kill it" check.  If it's been our original watch, we are
fine, if it's a newcomer - nevermind, just pretend that we'd won the
race and kill the fscker anyway; we are safe since we know that its
superblock won't be going away.

And yes, this is far beyond mere "not very pretty"; so's the entire
concept of inotify to start with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inotify.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 690e72595e6e..7bbed1b89825 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(get_inotify_watch);
 
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);
+		return 1;
+	}
+	spin_unlock(&sb_lock);
+	return 0;
+}
+
 /**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
  * watch references if the count reaches zero.  inotify_watch is freed by
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(put_inotify_watch);
 
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	deactivate_super(sb);
+}
+
 /*
  * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(inotify_init_watch);
 
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_umount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_umount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	switch (how) {
+	case 1:
+		deactivate_super(sb);
+		break;
+	case 2:
+		drop_super(sb);
+	}
+}
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih)
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
 	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
+		struct super_block *sb;
 		struct inode *inode;
+		int how;
 
 		mutex_lock(&ih->mutex);
 		watches = &ih->watches;
@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih)
 			break;
 		}
 		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
+		sb = watch->inode->i_sb;
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih)
 
 		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(watch);
+		unpin_and_kill(watch, how);
 	}
 
 	/* free this handle: the put matching the get in inotify_init() */
@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
 int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
 	struct inotify_watch *watch;
+	struct super_block *sb;
 	struct inode *inode;
+	int how;
 
 	mutex_lock(&ih->mutex);
 	watch = idr_find(&ih->idr, wd);
@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 		mutex_unlock(&ih->mutex);
 		return -EINVAL;
 	}
-	get_inotify_watch(watch);
+	sb = watch->inode->i_sb;
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
 	inode = watch->inode;
-	mutex_unlock(&ih->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
 	mutex_lock(&ih->mutex);
@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
+	unpin_and_kill(watch, how);
 
 	return 0;
 }
-- 
cgit v1.2.2


From 5c06fe772da43db63b053addcd2c267f76d0be91 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 16 Nov 2008 22:19:10 +0000
Subject: Fix broken ownership of /proc/sys/ files

D'oh...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-and-tested-by: Peter Palfrader <peter@palfrader.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863a..06ed10b7da9e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,6 +31,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
+	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
-- 
cgit v1.2.2


From f1987b44f642e96176adc88b7ce23a1d74806f89 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 15 Nov 2008 11:12:47 -0500
Subject: cifs: reinstate sharing of tree connections

Use a similar approach to the SMB session sharing. Add a list of tcons
attached to each SMB session. Move the refcount to non-atomic. Protect
all of the above with the cifs_tcp_ses_lock. Add functions to
properly find and put references to the tcons.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 236 ++++++++++++++++++++++++++++-----------------------
 fs/cifs/cifsfs.c     |   8 +-
 fs/cifs/cifsglob.h   |  13 +--
 fs/cifs/cifssmb.c    |  43 +++-------
 fs/cifs/connect.c    |  93 +++++++++++++-------
 fs/cifs/misc.c       |  74 ++++++++--------
 6 files changed, 249 insertions(+), 218 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 59841a68b0b6..1d6dfa8923ca 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,12 +107,13 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-	struct list_head *tmp, *tmp2, *tmp3;
+	struct list_head *tmp1, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
 	struct TCP_Server_Info *server;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
-	int i;
+	int i, j;
+	__u32 dev_type;
 
 	seq_puts(m,
 		    "Display Internal CIFS Data Structures for Debugging\n"
@@ -123,8 +124,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 
 	i = 0;
 	read_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &cifs_tcp_ses_list) {
-		server = list_entry(tmp, struct TCP_Server_Info,
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
 		i++;
 		list_for_each(tmp2, &server->smb_ses_list) {
@@ -133,12 +134,12 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			if ((ses->serverDomain == NULL) ||
 				(ses->serverOS == NULL) ||
 				(ses->serverNOS == NULL)) {
-				seq_printf(m, "\nentry for %s not fully "
-					   "displayed\n\t", ses->serverName);
+				seq_printf(m, "\n%d) entry for %s not fully "
+					   "displayed\n\t", i, ses->serverName);
 			} else {
 				seq_printf(m,
-				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
-				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
+				    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
+				    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
 				i, ses->serverName, ses->serverDomain,
 				ses->ses_count, ses->serverOS, ses->serverNOS,
@@ -156,14 +157,44 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 				atomic_read(&server->num_waiters));
 #endif
 
-			seq_puts(m, "\nMIDs:\n");
+			seq_puts(m, "\n\tShares:");
+			j = 0;
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3, struct cifsTconInfo,
+						  tcon_list);
+				++j;
+				dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+				seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
+					   tcon->treeName, tcon->tc_count);
+				if (tcon->nativeFileSystem) {
+					seq_printf(m, "Type: %s ",
+						   tcon->nativeFileSystem);
+				}
+				seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
+					"\nPathComponentMax: %d Status: 0x%d",
+					le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+					le32_to_cpu(tcon->fsAttrInfo.Attributes),
+					le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+					tcon->tidStatus);
+				if (dev_type == FILE_DEVICE_DISK)
+					seq_puts(m, " type: DISK ");
+				else if (dev_type == FILE_DEVICE_CD_ROM)
+					seq_puts(m, " type: CDROM ");
+				else
+					seq_printf(m, " type: %d ", dev_type);
+
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_putc(m, '\n');
+			}
+
+			seq_puts(m, "\n\tMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
 			list_for_each(tmp3, &server->pending_mid_q) {
-				mid_entry = list_entry(tmp3, struct
-					mid_q_entry,
+				mid_entry = list_entry(tmp3, struct mid_q_entry,
 					qhead);
-				seq_printf(m, "State: %d com: %d pid:"
+				seq_printf(m, "\tState: %d com: %d pid:"
 						" %d tsk: %p mid %d\n",
 						mid_entry->midState,
 						(int)mid_entry->command,
@@ -177,41 +208,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	read_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
-	seq_puts(m, "Shares:");
-
-	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		__u32 dev_type;
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
-		seq_printf(m, "\n%d) %s Uses: %d ", i,
-				 tcon->treeName, atomic_read(&tcon->useCount));
-		if (tcon->nativeFileSystem) {
-			seq_printf(m, "Type: %s ",
-					 tcon->nativeFileSystem);
-		}
-		seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
-				 "\nPathComponentMax: %d Status: %d",
-			    le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
-			    le32_to_cpu(tcon->fsAttrInfo.Attributes),
-			    le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
-			    tcon->tidStatus);
-		if (dev_type == FILE_DEVICE_DISK)
-			seq_puts(m, " type: DISK ");
-		else if (dev_type == FILE_DEVICE_CD_ROM)
-			seq_puts(m, " type: CDROM ");
-		else
-			seq_printf(m, " type: %d ", dev_type);
-
-		if (tcon->need_reconnect)
-			seq_puts(m, "\tDISCONNECTED ");
-	}
-	read_unlock(&GlobalSMBSeslock);
-
-	seq_putc(m, '\n');
-
 	/* BB add code to dump additional info such as TCP session info now */
 	return 0;
 }
@@ -235,7 +231,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 {
 	char c;
 	int rc;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	rc = get_user(c, buffer);
@@ -243,33 +241,42 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		return rc;
 
 	if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
-		read_lock(&GlobalSMBSeslock);
 #ifdef CONFIG_CIFS_STATS2
 		atomic_set(&totBufAllocCount, 0);
 		atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-		list_for_each(tmp, &GlobalTreeConnectionList) {
-			tcon = list_entry(tmp, struct cifsTconInfo,
-					cifsConnectionList);
-			atomic_set(&tcon->num_smbs_sent, 0);
-			atomic_set(&tcon->num_writes, 0);
-			atomic_set(&tcon->num_reads, 0);
-			atomic_set(&tcon->num_oplock_brks, 0);
-			atomic_set(&tcon->num_opens, 0);
-			atomic_set(&tcon->num_closes, 0);
-			atomic_set(&tcon->num_deletes, 0);
-			atomic_set(&tcon->num_mkdirs, 0);
-			atomic_set(&tcon->num_rmdirs, 0);
-			atomic_set(&tcon->num_renames, 0);
-			atomic_set(&tcon->num_t2renames, 0);
-			atomic_set(&tcon->num_ffirst, 0);
-			atomic_set(&tcon->num_fnext, 0);
-			atomic_set(&tcon->num_fclose, 0);
-			atomic_set(&tcon->num_hardlinks, 0);
-			atomic_set(&tcon->num_symlinks, 0);
-			atomic_set(&tcon->num_locks, 0);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp1, &cifs_tcp_ses_list) {
+			server = list_entry(tmp1, struct TCP_Server_Info,
+					    tcp_ses_list);
+			list_for_each(tmp2, &server->smb_session_list) {
+				ses = list_entry(tmp2, struct cifsSesInfo,
+						 smb_session_list);
+				list_for_each(tmp3, &ses->tcon_list) {
+					tcon = list_entry(tmp3,
+							  struct cifsTconInfo,
+							  tcon_list);
+					atomic_set(&tcon->num_smbs_sent, 0);
+					atomic_set(&tcon->num_writes, 0);
+					atomic_set(&tcon->num_reads, 0);
+					atomic_set(&tcon->num_oplock_brks, 0);
+					atomic_set(&tcon->num_opens, 0);
+					atomic_set(&tcon->num_closes, 0);
+					atomic_set(&tcon->num_deletes, 0);
+					atomic_set(&tcon->num_mkdirs, 0);
+					atomic_set(&tcon->num_rmdirs, 0);
+					atomic_set(&tcon->num_renames, 0);
+					atomic_set(&tcon->num_t2renames, 0);
+					atomic_set(&tcon->num_ffirst, 0);
+					atomic_set(&tcon->num_fnext, 0);
+					atomic_set(&tcon->num_fclose, 0);
+					atomic_set(&tcon->num_hardlinks, 0);
+					atomic_set(&tcon->num_symlinks, 0);
+					atomic_set(&tcon->num_locks, 0);
+				}
+			}
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	}
 
 	return count;
@@ -278,7 +285,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 static int cifs_stats_proc_show(struct seq_file *m, void *v)
 {
 	int i;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	seq_printf(m,
@@ -307,44 +316,55 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		GlobalCurrentXid, GlobalMaxActiveXid);
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		seq_printf(m, "\n%d) %s", i, tcon->treeName);
-		if (tcon->need_reconnect)
-			seq_puts(m, "\tDISCONNECTED ");
-		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
-			atomic_read(&tcon->num_smbs_sent),
-			atomic_read(&tcon->num_oplock_brks));
-		seq_printf(m, "\nReads:  %d Bytes: %lld",
-			atomic_read(&tcon->num_reads),
-			(long long)(tcon->bytes_read));
-		seq_printf(m, "\nWrites: %d Bytes: %lld",
-			atomic_read(&tcon->num_writes),
-			(long long)(tcon->bytes_written));
-		seq_printf(m,
-			"\nLocks: %d HardLinks: %d Symlinks: %d",
-			atomic_read(&tcon->num_locks),
-			atomic_read(&tcon->num_hardlinks),
-			atomic_read(&tcon->num_symlinks));
-
-		seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
-			atomic_read(&tcon->num_opens),
-			atomic_read(&tcon->num_closes),
-			atomic_read(&tcon->num_deletes));
-		seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
-			atomic_read(&tcon->num_mkdirs),
-			atomic_read(&tcon->num_rmdirs));
-		seq_printf(m, "\nRenames: %d T2 Renames %d",
-			atomic_read(&tcon->num_renames),
-			atomic_read(&tcon->num_t2renames));
-		seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
-			atomic_read(&tcon->num_ffirst),
-			atomic_read(&tcon->num_fnext),
-			atomic_read(&tcon->num_fclose));
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3,
+						  struct cifsTconInfo,
+						  tcon_list);
+				i++;
+				seq_printf(m, "\n%d) %s", i, tcon->treeName);
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
+					atomic_read(&tcon->num_smbs_sent),
+					atomic_read(&tcon->num_oplock_brks));
+				seq_printf(m, "\nReads:  %d Bytes: %lld",
+					atomic_read(&tcon->num_reads),
+					(long long)(tcon->bytes_read));
+				seq_printf(m, "\nWrites: %d Bytes: %lld",
+					atomic_read(&tcon->num_writes),
+					(long long)(tcon->bytes_written));
+				seq_printf(m, "\nLocks: %d HardLinks: %d "
+					      "Symlinks: %d",
+					atomic_read(&tcon->num_locks),
+					atomic_read(&tcon->num_hardlinks),
+					atomic_read(&tcon->num_symlinks));
+				seq_printf(m, "\nOpens: %d Closes: %d"
+					      "Deletes: %d",
+					atomic_read(&tcon->num_opens),
+					atomic_read(&tcon->num_closes),
+					atomic_read(&tcon->num_deletes));
+				seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+					atomic_read(&tcon->num_mkdirs),
+					atomic_read(&tcon->num_rmdirs));
+				seq_printf(m, "\nRenames: %d T2 Renames %d",
+					atomic_read(&tcon->num_renames),
+					atomic_read(&tcon->num_t2renames));
+				seq_printf(m, "\nFindFirst: %d FNext %d "
+					      "FClose %d",
+					atomic_read(&tcon->num_ffirst),
+					atomic_read(&tcon->num_fnext),
+					atomic_read(&tcon->num_fclose));
+			}
+		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a1e96620b097..d9cf467309e8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -514,10 +514,11 @@ static void cifs_umount_begin(struct super_block *sb)
 	tcon = cifs_sb->tcon;
 	if (tcon == NULL)
 		return;
-	down(&tcon->tconSem);
-	if (atomic_read(&tcon->useCount) == 1)
+
+	read_lock(&cifs_tcp_ses_lock);
+	if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	up(&tcon->tconSem);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -1060,7 +1061,6 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 631a99f72f22..f1ae1f57c30d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -233,16 +233,15 @@ struct cifsSesInfo {
  * session
  */
 struct cifsTconInfo {
-	struct list_head cifsConnectionList;
+	struct list_head tcon_list;
+	int tc_count;
 	struct list_head openFileList;
-	struct semaphore tconSem;
 	struct cifsSesInfo *ses;	/* pointer to session associated with */
 	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
 	char *nativeFileSystem;
 	__u16 tid;		/* The 2 byte tree id */
 	__u16 Flags;		/* optional support bits */
 	enum statusEnum tidStatus;
-	atomic_t useCount;	/* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
@@ -600,9 +599,13 @@ require use of the stronger protocol */
  */
 GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 
-/* protects cifs_tcp_ses_list and srv_count for each tcp session */
+/*
+ * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
+ * tcp session, and the list of tcon's per smb session. It also protects
+ * the reference counters for the server, smb session, and tcon. Finally,
+ * changes to the tcon->tidStatus should be done while holding this lock.
+ */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9c95617baa4d..e6bb2d9d5b09 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -742,50 +742,31 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	int rc = 0;
 
 	cFYI(1, ("In tree disconnect"));
-	/*
-	 *  If last user of the connection and
-	 *  connection alive - disconnect it
-	 *  If this is the last connection on the server session disconnect it
-	 *  (and inside session disconnect we should check if tcp socket needs
-	 *  to be freed and kernel thread woken up).
-	 */
-	if (tcon)
-		down(&tcon->tconSem);
-	else
-		return -EIO;
 
-	atomic_dec(&tcon->useCount);
-	if (atomic_read(&tcon->useCount) > 0) {
-		up(&tcon->tconSem);
-		return -EBUSY;
-	}
+	/* BB: do we need to check this? These should never be NULL. */
+	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+		return -EIO;
 
-	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
-	if (tcon->need_reconnect) {
-		up(&tcon->tconSem);
+	/*
+	 * No need to return error on this operation if tid invalidated and
+	 * closed on server already e.g. due to tcp session crashing. Also,
+	 * the tcon is no longer on the list, so no need to take lock before
+	 * checking this.
+	 */
+	if (tcon->need_reconnect)
 		return 0;
-	}
 
-	if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) {
-		up(&tcon->tconSem);
-		return -EIO;
-	}
 	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
 			    (void **)&smb_buffer);
-	if (rc) {
-		up(&tcon->tconSem);
+	if (rc)
 		return rc;
-	}
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
 		cFYI(1, ("Tree disconnect failed %d", rc));
 
-	up(&tcon->tconSem);
-
 	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
+	   closed on server already e.g. due to tcp session crashing */
 	if (rc == -EAGAIN)
 		rc = 0;
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a3dc0d7cafc3..2f2be8faabb3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -124,7 +124,7 @@ static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
-	struct list_head *tmp;
+	struct list_head *tmp, *tmp2;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	struct mid_q_entry *mid_entry;
@@ -149,13 +149,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->need_reconnect = true;
 		ses->ipc_tid = 0;
-	}
-	read_unlock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->ses) && (tcon->ses->server == server))
+		list_for_each(tmp2, &ses->tcon_list) {
+			tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
 			tcon->need_reconnect = true;
+		}
 	}
+	read_unlock(&cifs_tcp_ses_lock);
 	/* do not want to be sending data on a socket we are freeing */
 	down(&server->tcpSem);
 	if (server->ssocket) {
@@ -1462,6 +1461,52 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	cifs_put_tcp_session(server);
 }
 
+static struct cifsTconInfo *
+cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+{
+	struct list_head *tmp;
+	struct cifsTconInfo *tcon;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &ses->tcon_list) {
+		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
+		if (tcon->tidStatus == CifsExiting)
+			continue;
+		if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+			continue;
+
+		++tcon->tc_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return tcon;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_tcon(struct cifsTconInfo *tcon)
+{
+	int xid;
+	struct cifsSesInfo *ses = tcon->ses;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if (--tcon->tc_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&tcon->tcon_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	xid = GetXid();
+	CIFSSMBTDis(xid, tcon);
+	_FreeXid(xid);
+
+	DeleteTconOplockQEntries(tcon);
+	tconInfoFree(tcon);
+	cifs_put_smb_ses(ses);
+}
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -2220,11 +2265,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (!rc) {
 		setup_cifs_sb(&volume_info, cifs_sb);
 
+		tcon = cifs_find_tcon(pSesInfo, volume_info.UNC);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-			if (tcon->seal != volume_info.seal)
-				cERROR(1, ("transport encryption setting "
-					   "conflicts with existing tid"));
+			/* existing tcon already has a reference */
+			cifs_put_smb_ses(pSesInfo);
 		} else {
 			tcon = tconInfoAlloc();
 			if (tcon == NULL) {
@@ -2257,6 +2302,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			if (rc)
 				goto mount_fail_check;
 			tcon->seal = volume_info.seal;
+			tcon->ses = pSesInfo;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
+			write_unlock(&cifs_tcp_ses_lock);
 		}
 
 		/* we can have only one retry value for a connection
@@ -2283,18 +2332,14 @@ mount_fail_check:
 		/* If find_unc succeeded then rc == 0 so we can not end */
 		/* up accidently freeing someone elses tcon struct */
 		if (tcon)
-			tconInfoFree(tcon);
-
-		/* should also end up putting our tcp session ref if needed */
-		if (pSesInfo)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
 			cifs_put_smb_ses(pSesInfo);
 		else
 			cifs_put_tcp_session(srvTcp);
 		goto out;
 	}
-	atomic_inc(&tcon->useCount);
 	cifs_sb->tcon = tcon;
-	tcon->ses = pSesInfo;
 
 	/* do not care if following two calls succeed - informational */
 	if (!tcon->ipc) {
@@ -3565,23 +3610,10 @@ int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
-	int xid;
-	struct cifsSesInfo *ses = NULL;
 	char *tmp;
 
-	xid = GetXid();
-
-	if (cifs_sb->tcon) {
-		ses = cifs_sb->tcon->ses; /* save ptr to ses before delete tcon!*/
-		rc = CIFSSMBTDis(xid, cifs_sb->tcon);
-		if (rc == -EBUSY) {
-			FreeXid(xid);
-			return 0;
-		}
-		DeleteTconOplockQEntries(cifs_sb->tcon);
-		tconInfoFree(cifs_sb->tcon);
-		cifs_put_smb_ses(ses);
-	}
+	if (cifs_sb->tcon)
+		cifs_put_tcon(cifs_sb->tcon);
 
 	cifs_sb->tcon = NULL;
 	tmp = cifs_sb->prepath;
@@ -3589,7 +3621,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
 
-	FreeXid(xid);
 	return rc;
 }
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 46c8c7baccba..addd1dcc2d79 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,6 +79,7 @@ sesInfoAlloc(void)
 		ret_buf->status = CifsNew;
 		++ret_buf->ses_count;
 		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 		init_MUTEX(&ret_buf->sesSem);
 	}
 	return ret_buf;
@@ -107,17 +108,14 @@ tconInfoAlloc(void)
 	struct cifsTconInfo *ret_buf;
 	ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&tconInfoAllocCount);
-		list_add(&ret_buf->cifsConnectionList,
-			 &GlobalTreeConnectionList);
 		ret_buf->tidStatus = CifsNew;
+		++ret_buf->tc_count;
 		INIT_LIST_HEAD(&ret_buf->openFileList);
-		init_MUTEX(&ret_buf->tconSem);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 #ifdef CONFIG_CIFS_STATS
 		spin_lock_init(&ret_buf->stat_lock);
 #endif
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -129,10 +127,7 @@ tconInfoFree(struct cifsTconInfo *buf_to_free)
 		cFYI(1, ("Null buffer passed to tconInfoFree"));
 		return;
 	}
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&tconInfoAllocCount);
-	list_del(&buf_to_free->cifsConnectionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->nativeFileSystem);
 	kfree(buf_to_free);
 }
@@ -493,9 +488,10 @@ bool
 is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {
 	struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
+	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
 
 	cFYI(1, ("Checking for oplock break or dnotify response"));
@@ -550,42 +546,42 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		return false;
 
 	/* look up tcon based on tid & uid */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &srv->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		list_for_each(tmp1, &ses->tcon_list) {
+			tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+			if (tcon->tid != buf->Tid)
+				continue;
+
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			list_for_each(tmp1, &tcon->openFileList) {
-				netfile = list_entry(tmp1, struct cifsFileInfo,
+			list_for_each(tmp2, &tcon->openFileList) {
+				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
-				if (pSMB->Fid == netfile->netfid) {
-					struct cifsInodeInfo *pCifsInode;
-					read_unlock(&GlobalSMBSeslock);
-					cFYI(1,
-					    ("file id match, oplock break"));
-					pCifsInode =
-						CIFS_I(netfile->pInode);
-					pCifsInode->clientCanCacheAll = false;
-					if (pSMB->OplockLevel == 0)
-						pCifsInode->clientCanCacheRead
-							= false;
-					pCifsInode->oplockPending = true;
-					AllocOplockQEntry(netfile->pInode,
-							  netfile->netfid,
-							  tcon);
-					cFYI(1,
-					    ("about to wake up oplock thread"));
-					if (oplockThread)
-					    wake_up_process(oplockThread);
-					return true;
-				}
+				if (pSMB->Fid != netfile->netfid)
+					continue;
+
+				read_unlock(&cifs_tcp_ses_lock);
+				cFYI(1, ("file id match, oplock break"));
+				pCifsInode = CIFS_I(netfile->pInode);
+				pCifsInode->clientCanCacheAll = false;
+				if (pSMB->OplockLevel == 0)
+					pCifsInode->clientCanCacheRead = false;
+				pCifsInode->oplockPending = true;
+				AllocOplockQEntry(netfile->pInode,
+						  netfile->netfid, tcon);
+				cFYI(1, ("about to wake up oplock thread"));
+				if (oplockThread)
+					wake_up_process(oplockThread);
+
+				return true;
 			}
-			read_unlock(&GlobalSMBSeslock);
+			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	cFYI(1, ("Can not process oplock break for non-existent connection"));
 	return true;
 }
-- 
cgit v1.2.2


From c2b3382cd4d6c6adef1347e81f20e16c93a39feb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 17 Nov 2008 03:57:13 +0000
Subject: [CIFS] Fix build break

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 1d6dfa8923ca..490e34bbf27a 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -249,9 +249,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		list_for_each(tmp1, &cifs_tcp_ses_list) {
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
-			list_for_each(tmp2, &server->smb_session_list) {
+			list_for_each(tmp2, &server->smb_ses_list) {
 				ses = list_entry(tmp2, struct cifsSesInfo,
-						 smb_session_list);
+						 smb_ses_list);
 				list_for_each(tmp3, &ses->tcon_list) {
 					tcon = list_entry(tmp3,
 							  struct cifsTconInfo,
-- 
cgit v1.2.2


From ab3f992983062440b4f37c666dac66d987902d91 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 17 Nov 2008 16:03:00 +0000
Subject: [CIFS] Fix check for tcon seal setting and fix oops on failed mount
 from earlier patch

set tcon->ses earlier

If the inital tree connect fails, we'll end up calling cifs_put_smb_ses
with a NULL pointer. Fix it by setting the tcon->ses earlier.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2f2be8faabb3..c7d341714586 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2270,16 +2270,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cFYI(1, ("Found match on UNC path"));
 			/* existing tcon already has a reference */
 			cifs_put_smb_ses(pSesInfo);
+			if (tcon->seal != volume_info.seal)
+				cERROR(1, ("transport encryption setting "
+					   "conflicts with existing tid"));
 		} else {
 			tcon = tconInfoAlloc();
 			if (tcon == NULL) {
 				rc = -ENOMEM;
 				goto mount_fail_check;
 			}
+			tcon->ses = pSesInfo;
 
 			/* check for null share name ie connect to dfs root */
-
-			/* BB check if works for exactly length 3 strings */
 			if ((strchr(volume_info.UNC + 3, '\\') == NULL)
 			    && (strchr(volume_info.UNC + 3, '/') == NULL)) {
 				/* rc = connect_to_dfs_path(...) */
@@ -2302,7 +2304,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			if (rc)
 				goto mount_fail_check;
 			tcon->seal = volume_info.seal;
-			tcon->ses = pSesInfo;
 			write_lock(&cifs_tcp_ses_lock);
 			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
 			write_unlock(&cifs_tcp_ses_lock);
-- 
cgit v1.2.2


From 2c55608f28444c3f33b10312881384c470ceed56 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallain@gmail.com>
Date: Thu, 23 Oct 2008 13:58:42 +0400
Subject: Fixed parsing of mount options when doing DFS submount

Since these hit the same routines, and are relatively small, it is easier to review
them as one patch.

Fixed incorrect handling of the last option in some cases
Fixed prefixpath handling convert path_consumed into host depended string length (in bytes)
Use non default separator if it is provided in the original mount options

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 71 +++++++++++++++++++++++++++++++++-----------------
 fs/cifs/cifssmb.c      | 39 ++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d2c8eef84f3c..e1c18362ba46 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -106,7 +106,8 @@ static char *cifs_get_share_name(const char *node_name)
 /**
  * compose_mount_options	-	creates mount options for refferral
  * @sb_mountdata:	parent/root DFS mount options (template)
- * @ref_unc:		refferral server UNC
+ * @dentry:		point where we are going to mount
+ * @ref:		server's referral
  * @devname:		pointer for saving device name
  *
  * creates mount options for submount based on template options sb_mountdata
@@ -116,7 +117,8 @@ static char *cifs_get_share_name(const char *node_name)
  * Caller is responcible for freeing retunrned value if it is not error.
  */
 static char *compose_mount_options(const char *sb_mountdata,
-				   const char *ref_unc,
+				   struct dentry *dentry,
+				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
 	int rc;
@@ -126,11 +128,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
+	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
 
-	*devname = cifs_get_share_name(ref_unc);
+	*devname = cifs_get_share_name(ref->node_name);
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
 		cERROR(1, ("%s: Failed to resolve server part of %s to IP",
@@ -138,7 +141,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 		mountdata = ERR_PTR(rc);
 		goto compose_mount_options_out;
 	}
-	md_len = strlen(sb_mountdata) + strlen(srvIP) + strlen(ref_unc) + 3;
+	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
+	 * assuming that we have 'unc=' and 'ip=' in
+	 * the original sb_mountdata
+	 */
+	md_len = strlen(sb_mountdata) + strlen(srvIP) +
+		strlen(ref->node_name) + 12;
 	mountdata = kzalloc(md_len+1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		mountdata = ERR_PTR(-ENOMEM);
@@ -152,41 +160,56 @@ static char *compose_mount_options(const char *sb_mountdata,
 			strncpy(mountdata, sb_mountdata, 5);
 			off += 5;
 	}
-	while ((tkn_e = strchr(sb_mountdata+off, sep))) {
-		noff = (tkn_e - (sb_mountdata+off)) + 1;
-		if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
+
+	do {
+		tkn_e = strchr(sb_mountdata + off, sep);
+		if (tkn_e == NULL)
+			noff = strlen(sb_mountdata + off);
+		else
+			noff = tkn_e - (sb_mountdata + off) + 1;
+
+		if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "prefixpath=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
 			off += noff;
 			continue;
 		}
-		strncat(mountdata, sb_mountdata+off, noff);
+		strncat(mountdata, sb_mountdata + off, noff);
 		off += noff;
-	}
-	strcat(mountdata, sb_mountdata+off);
+	} while (tkn_e);
+	strcat(mountdata, sb_mountdata + off);
 	mountdata[md_len] = '\0';
 
 	/* copy new IP and ref share name */
-	strcat(mountdata, ",ip=");
+	if (mountdata[strlen(mountdata) - 1] != sep)
+		strncat(mountdata, &sep, 1);
+	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
-	strcat(mountdata, ",unc=");
+	strncat(mountdata, &sep, 1);
+	strcat(mountdata, "unc=");
 	strcat(mountdata, *devname);
 
 	/* find & copy prefixpath */
-	tkn_e = strchr(ref_unc+2, '\\');
-	if (tkn_e) {
-		tkn_e = strchr(tkn_e+1, '\\');
-		if (tkn_e) {
-			strcat(mountdata, ",prefixpath=");
-			strcat(mountdata, tkn_e+1);
-		}
+	tkn_e = strchr(ref->node_name + 2, '\\');
+	if (tkn_e == NULL) /* invalid unc, missing share name*/
+		goto compose_mount_options_out;
+
+	fullpath = build_path_from_dentry(dentry);
+	tkn_e = strchr(tkn_e + 1, '\\');
+	if (tkn_e || strlen(fullpath) - (ref->path_consumed)) {
+		strncat(mountdata, &sep, 1);
+		strcat(mountdata, "prefixpath=");
+		if (tkn_e)
+			strcat(mountdata, tkn_e + 1);
+		strcat(mountdata, fullpath + (ref->path_consumed));
 	}
+	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -198,7 +221,7 @@ compose_mount_options_out:
 
 
 static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
-		struct dentry *dentry, char *ref_unc)
+		struct dentry *dentry, const struct dfs_info3_param *ref)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct vfsmount *mnt;
@@ -207,7 +230,7 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
 	mountdata = compose_mount_options(cifs_sb->mountdata,
-						ref_unc, &devname);
+						dentry, ref, &devname);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
@@ -310,7 +333,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 			}
 			mnt = cifs_dfs_do_refmount(nd->path.mnt,
 						nd->path.dentry,
-						referrals[i].node_name);
+						referrals + i);
 			cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
 					 __func__,
 					referrals[i].node_name, mnt));
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e6bb2d9d5b09..bdda46dd435a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3899,6 +3899,27 @@ GetInodeNumOut:
 	return rc;
 }
 
+/* computes length of UCS string converted to host codepage
+ * @src:	UCS string
+ * @maxlen:	length of the input string in UCS characters
+ * 		(not in bytes)
+ *
+ * return:	size of input string in host codepage
+ */
+static int hostlen_fromUCS(const __le16 *src, const int maxlen,
+		const struct nls_table *nls_codepage) {
+	int i;
+	int hostlen = 0;
+	char to[4];
+	int charlen;
+	for (i = 0; (i < maxlen) && src[i]; ++i) {
+		charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
+				to, NLS_MAX_CHARSET_SIZE);
+		hostlen += charlen > 0 ? charlen : 1;
+	}
+	return hostlen;
+}
+
 /* parses DFS refferal V3 structure
  * caller is responsible for freeing target_nodes
  * returns:
@@ -3909,7 +3930,8 @@ static int
 parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		unsigned int *num_of_nodes,
 		struct dfs_info3_param **target_nodes,
-		const struct nls_table *nls_codepage)
+		const struct nls_table *nls_codepage, int remap,
+		const char *searchName)
 {
 	int i, rc = 0;
 	char *data_end;
@@ -3960,7 +3982,17 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		struct dfs_info3_param *node = (*target_nodes)+i;
 
 		node->flags = le16_to_cpu(pSMBr->DFSFlags);
-		node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+		if (is_unicode) {
+			__le16 *tmp = kmalloc(strlen(searchName)*2, GFP_KERNEL);
+			cifsConvertToUCS((__le16 *) tmp, searchName,
+					PATH_MAX, nls_codepage, remap);
+			node->path_consumed = hostlen_fromUCS(tmp,
+					le16_to_cpu(pSMBr->PathConsumed)/2,
+					nls_codepage);
+			kfree(tmp);
+		} else
+			node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+
 		node->server_type = le16_to_cpu(ref->ServerType);
 		node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
 
@@ -4093,7 +4125,8 @@ getDFSRetry:
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
-				 target_nodes, nls_codepage);
+				 target_nodes, nls_codepage, remap,
+				 searchName);
 
 GetDFSRefExit:
 	cifs_buf_release(pSMB);
-- 
cgit v1.2.2


From b066a48c9532243894f93a06ca5a0ee2cc21a8dc Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Tue, 18 Nov 2008 03:49:05 +0000
Subject: prevent cifs_writepages() from skipping unwritten pages

Fixes a data corruption under heavy stress in which pages could be left
dirty after all open instances of a inode have been closed.

In order to write contiguous pages whenever possible, cifs_writepages()
asks pagevec_lookup_tag() for more pages than it may write at one time.
Normally, it then resets index just past the last page written before calling
pagevec_lookup_tag() again.

If cifs_writepages() can't write the first page returned, it wasn't resetting
index, and the next call to pagevec_lookup_tag() resulted in skipping all of
the pages it previously returned, even though cifs_writepages() did nothing
with them.  This can result in data loss when the file descriptor is about
to be closed.

This patch ensures that index gets set back to the next returned page so
that none get skipped.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Cc: Shirish S Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1540adaa593d..6449e1aae621 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1404,7 +1404,10 @@ retry:
 			if ((wbc->nr_to_write -= n_iov) <= 0)
 				done = 1;
 			index = next;
-		}
+		} else
+			/* Need to re-find the pages we skipped */
+			index = pvec.pages[0]->index + 1;
+
 		pagevec_release(&pvec);
 	}
 	if (!scanned && !done) {
-- 
cgit v1.2.2


From eb60fa1066622ddb2278732cf61e0c4544e82c6f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:28:59 +0900
Subject: block: fix add_partition() error path

Partition stats structure was not freed on devt allocation failure
path.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633f7a0ebb2c..90bcf136a9de 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -395,7 +395,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free;
+		goto out_free_stats;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -426,6 +426,8 @@ int add_partition(struct gendisk *disk, int partno,
 
 	return 0;
 
+out_free_stats:
+	free_part_stats(p);
 out_free:
 	kfree(p);
 	return err;
-- 
cgit v1.2.2


From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:29:58 +0900
Subject: block: make add_partition() return pointer to hd_struct

Make add_partition() return pointer to the new hd_struct on success
and ERR_PTR() value on failure.  This change will be used to fix md
autodetection bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 90bcf136a9de..633025340239 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int partno,
-		  sector_t start, sector_t len, int flags)
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+				sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = disk_expand_part_tbl(disk, partno);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 	ptbl = disk->part_tbl;
 
 	if (ptbl->part[partno])
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return -ENOMEM;
+		return ERR_PTR(-EBUSY);
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
@@ -424,20 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
 	if (!ddev->uevent_suppress)
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	return 0;
+	return p;
 
 out_free_stats:
 	free_part_stats(p);
 out_free:
 	kfree(p);
-	return err;
+	return ERR_PTR(err);
 out_del:
 	kobject_put(p->holder_dir);
 	device_del(pdev);
 out_put:
 	put_device(pdev);
 	blk_free_devt(devt);
-	return err;
+	return ERR_PTR(err);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -568,10 +568,11 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			       disk->disk_name, p, (unsigned long long) size);
 			size = get_capacity(disk) - from;
 		}
-		res = add_partition(disk, p, from, size, state->parts[p].flags);
-		if (res) {
-			printk(KERN_ERR " %s: p%d could not be added: %d\n",
-				disk->disk_name, p, -res);
+		part = add_partition(disk, p, from, size,
+				     state->parts[p].flags);
+		if (IS_ERR(part)) {
+			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+			       disk->disk_name, p, -PTR_ERR(part));
 			continue;
 		}
 #ifdef CONFIG_BLK_DEV_MD
-- 
cgit v1.2.2


From 55e8e30c382d25c34f8aafcc78efec948571a941 Mon Sep 17 00:00:00 2001
From: Tejun Heo <teheo@suse.de>
Date: Mon, 10 Nov 2008 15:30:47 +0900
Subject: block/md: fix md autodetection

Block ext devt conversion missed md_autodetect_dev() call in
rescan_partitions() leaving md autodetect unable to see partitions.
Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633025340239..6d5b213b8a9b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -577,7 +577,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		}
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags & ADDPART_FLAG_RAID)
-			md_autodetect_dev(bdev->bd_dev+p);
+			md_autodetect_dev(part_to_dev(part)->devt);
 #endif
 	}
 	kfree(state);
-- 
cgit v1.2.2


From bfb59820ee46616a7bdb4af6b8f7e109646de6ec Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 18 Nov 2008 16:33:48 +0000
Subject: [CIFS] fix check for dead tcon in smb_init

This was recently changed to check for need_reconnect, but should
actually be a check for a tidStatus of CifsExiting.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 6 +++++-
 fs/cifs/cifssmb.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8855331b2fba..e078b7aea143 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -8,7 +8,11 @@ handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
 sends, and also let tcp autotune the socket send and receive buffers.
 This reduces the number of EAGAIN errors returned by TCP/IP in
 high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests).
+when sending large SMBWriteX requests).  Fix case in which a portion of
+data can in some cases not get written to the file on the server before the
+file is closed.  Fix DFS parsing to properly handle path consumed field,
+and to handle certain codepage conversions better.  Fix mount and
+umount race that can cause oops in mount or umount or reconnect.
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bdda46dd435a..2af8626ced43 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -295,7 +295,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	   check for tcp and smb session status done differently
 	   for those three - in the calling routine */
 	if (tcon) {
-		if (tcon->need_reconnect) {
+		if (tcon->tidStatus == CifsExiting) {
 			/* only tree disconnect, open, and write,
 			  (and ulogoff which does not have tcon)
 			  are allowed as we start force umount */
-- 
cgit v1.2.2


From ac97b9f9a2d0b83488e0bbcb8517b229d5c9b142 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Wed, 19 Nov 2008 15:36:28 -0800
Subject: eCryptfs: Allocate up to two scatterlists for crypto ops on keys

I have received some reports of out-of-memory errors on some older AMD
architectures.  These errors are what I would expect to see if
crypt_stat->key were split between two separate pages.  eCryptfs should
not assume that any of the memory sent through virt_to_scatterlist() is
all contained in a single page, and so this patch allocates two
scatterlist structs instead of one when processing keys.  I have received
confirmation from one person affected by this bug that this patch resolves
the issue for him, and so I am submitting it for inclusion in a future
stable release.

Note that virt_to_scatterlist() runs sg_init_table() on the scatterlist
structs passed to it, so the calls to sg_init_table() in
decrypt_passphrase_encrypted_session_key() are redundant.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Reported-by: Paulo J. S. Silva <pjssilva@ime.usp.br>
Cc: "Leon Woestenberg" <leon.woestenberg@gmail.com>
Cc: Tim Gardner <tim.gardner@canonical.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e22bc3961345..0d713b691941 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1037,17 +1037,14 @@ static int
 decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 					 struct ecryptfs_crypt_stat *crypt_stat)
 {
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex;
 	struct blkcipher_desc desc = {
 		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
 	};
 	int rc = 0;
 
-	sg_init_table(&dst_sg, 1);
-	sg_init_table(&src_sg, 1);
-
 	if (unlikely(ecryptfs_verbosity > 0)) {
 		ecryptfs_printk(
 			KERN_DEBUG, "Session key encryption key (size [%d]):\n",
@@ -1066,8 +1063,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	}
 	rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key,
 				 auth_tok->session_key.encrypted_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.encrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]. "
@@ -1079,8 +1076,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		auth_tok->session_key.encrypted_key_size;
 	rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key,
 				 auth_tok->session_key.decrypted_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.decrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]\n", rc);
@@ -1096,7 +1093,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = crypto_blkcipher_decrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
 				      auth_tok->session_key.encrypted_key_size);
 	mutex_unlock(tfm_mutex);
 	if (unlikely(rc)) {
@@ -1539,8 +1536,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	size_t i;
 	size_t encrypted_session_key_valid = 0;
 	char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex = NULL;
 	u8 cipher_code;
 	size_t packet_size_length;
@@ -1619,8 +1616,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_dump_hex(session_key_encryption_key, 16);
 	}
 	rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
 				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
@@ -1629,8 +1626,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
@@ -1651,7 +1648,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	rc = 0;
 	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
 			crypt_stat->key_size);
-	rc = crypto_blkcipher_encrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
 	mutex_unlock(tfm_mutex);
 	if (rc) {
-- 
cgit v1.2.2


From f9454548e17cd56bad081bd7d55a09b001950cbb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 19 Nov 2008 15:36:38 -0800
Subject: don't unlink an active swapfile

Peter Cordes is sorry that he rm'ed his swapfiles while they were in use,
he then had no pathname to swapoff.  It's a curious little oversight, but
not one worth a lot of hackery.  Kudos to Willy Tarreau for turning this
around from a discussion of synthetic pathnames to how to prevent unlink.
Mimic immutable: prohibit unlinking an active swapfile in may_delete()
(and don't worry my little head over the tiny race window).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Willy Tarreau <w@1wt.eu>
Acked-by: Christoph Hellwig <hch@infradead.org>
Cc: Peter Cordes <peter@cordes.ca>
Cc: Bodo Eggert <7eggert@gmx.de>
Cc: David Newall <davidn@davidnewall.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 09ce58e49e72..d34e0f9681c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1378,7 +1378,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 	if (IS_APPEND(dir))
 		return -EPERM;
 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode))
+	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
 		if (!S_ISDIR(victim->d_inode->i_mode))
-- 
cgit v1.2.2


From ea7e743e49b94749fc739baaf160809ed279aeda Mon Sep 17 00:00:00 2001
From: WANG Cong <wangcong@zeuux.org>
Date: Wed, 19 Nov 2008 15:36:46 -0800
Subject: hostfs: fix a duplicated global function name

fs/hostfs/hostfs_user.c defines do_readlink() as non-static, and so does
fs/xfs/linux-2.6/xfs_ioctl.c when CONFIG_XFS_DEBUG=y.  So rename
do_readlink() in hostfs to hostfs_do_readlink().

I think it's better if XFS guys will also rename their do_readlink(),
it's not necessary to use such a general name.

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs.h      | 2 +-
 fs/hostfs/hostfs_kern.c | 4 ++--
 fs/hostfs/hostfs_user.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6ae9011b95eb..2f34f8f2134b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -81,7 +81,7 @@ extern int do_rmdir(const char *file);
 extern int do_mknod(const char *file, int mode, unsigned int major,
 		    unsigned int minor);
 extern int link_file(const char *from, const char *to);
-extern int do_readlink(char *file, char *buf, int size);
+extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7f34f4385de0..3a31451ac170 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -168,7 +168,7 @@ static char *follow_link(char *link)
 		if (name == NULL)
 			goto out;
 
-		n = do_readlink(link, name, len);
+		n = hostfs_do_readlink(link, name, len);
 		if (n < len)
 			break;
 		len *= 2;
@@ -943,7 +943,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	name = inode_name(page->mapping->host, 0);
 	if (name == NULL)
 		return -ENOMEM;
-	err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
+	err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE);
 	kfree(name);
 	if (err == PAGE_CACHE_SIZE)
 		err = -E2BIG;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 53fd0a67c11a..b79424f93282 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -377,7 +377,7 @@ int link_file(const char *to, const char *from)
 	return 0;
 }
 
-int do_readlink(char *file, char *buf, int size)
+int hostfs_do_readlink(char *file, char *buf, int size)
 {
 	int n;
 
-- 
cgit v1.2.2


From ddb4cbfc53aa0913ee8da059fcbf628d14f40f63 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 20 Nov 2008 20:00:44 +0000
Subject: [CIFS] Do not attempt to close invalidated file handles

If a connection with open file handles has gone down
and come back up and reconnected without reopening
the file handle yet, do not attempt to send an SMB close
request for this handle in cifs_close.  We were
checking for the connection being invalid in cifs_close
but since the connection may have been reconnected
we also need to check whether the file handle
was marked invalid (otherwise we could close the
wrong file handle by accident).

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 10 +++++++++-
 fs/cifs/file.c     | 21 ++++++++++++++-------
 fs/cifs/misc.c     |  3 +++
 fs/cifs/readdir.c  |  5 ++++-
 4 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f1ae1f57c30d..c57c0565547f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -606,7 +606,15 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
  * changes to the tcon->tidStatus should be done while holding this lock.
  */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
+
+/*
+ * This lock protects the cifs_file->llist and cifs_file->flist
+ * list operations, and updates to some flags (cifs_file->invalidHandle)
+ * It will be moved to either use the tcon->stat_lock or equivalent later.
+ * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
+ * the cifs_tcp_ses_lock must be grabbed first and released last.
+ */
+GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6449e1aae621..b691b893a848 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -488,12 +488,13 @@ int cifs_close(struct inode *inode, struct file *file)
 	pTcon = cifs_sb->tcon;
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
-
+		write_lock(&GlobalSMBSeslock);
 		pSMBFile->closePend = true;
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
 			if (!pTcon->need_reconnect) {
+				write_unlock(&GlobalSMBSeslock);
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->wrtPending) != 0)
 					&& (timeout <= 2048)) {
@@ -510,12 +511,15 @@ int cifs_close(struct inode *inode, struct file *file)
 					timeout *= 4;
 				}
 				if (atomic_read(&pSMBFile->wrtPending))
-					cERROR(1,
-						("close with pending writes"));
-				rc = CIFSSMBClose(xid, pTcon,
+					cERROR(1, ("close with pending write"));
+				if (!pTcon->need_reconnect &&
+				    !pSMBFile->invalidHandle)
+					rc = CIFSSMBClose(xid, pTcon,
 						  pSMBFile->netfid);
-			}
-		}
+			} else
+				write_unlock(&GlobalSMBSeslock);
+		} else
+			write_unlock(&GlobalSMBSeslock);
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
@@ -587,15 +591,18 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		pTcon = cifs_sb->tcon;
 
 		cFYI(1, ("Freeing private data in close dir"));
+		write_lock(&GlobalSMBSeslock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
 			cFYI(1, ("Closing uncompleted readdir with rc %d",
 				 rc));
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
 			cFYI(1, ("closedir free smb buf in srch struct"));
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index addd1dcc2d79..9ee3f689c2b0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -555,12 +555,14 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				continue;
 
 			cifs_stats_inc(&tcon->num_oplock_brks);
+			write_lock(&GlobalSMBSeslock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
 				if (pSMB->Fid != netfile->netfid)
 					continue;
 
+				write_unlock(&GlobalSMBSeslock);
 				read_unlock(&cifs_tcp_ses_lock);
 				cFYI(1, ("file id match, oplock break"));
 				pCifsInode = CIFS_I(netfile->pInode);
@@ -576,6 +578,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 
 				return true;
 			}
+			write_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 58d57299f2a0..9f51f9bf0292 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -741,11 +741,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
 		cFYI(1, ("search backing up - close and restart search"));
+		write_lock(&GlobalSMBSeslock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
 			cifsFile->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1, ("freeing SMB ff cache buf on search rewind"));
 			if (cifsFile->srch_inf.smallBuf)
-- 
cgit v1.2.2