From 3c6af7fa787f21f8873a050568ed892312899eb5 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 24 Nov 2005 13:41:33 +0000
Subject: NTFS: Fix a potential overflow by casting (index + 1) to s64 before
 doing a       left shift using PAGE_CACHE_SHIFT in fs/ntfs/file.c.  Thanks to
 Andrew       Morton pointing this out to.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 727533891813..c73c8864cbad 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -248,7 +248,7 @@ do_non_resident_extend:
 		 * enough to make ntfs_writepage() work.
 		 */
 		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
+		ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
 		if (ni->initialized_size > new_init_size)
 			ni->initialized_size = new_init_size;
 		write_unlock_irqrestore(&ni->size_lock, flags);
-- 
cgit v1.2.2


From 64419d93a5906600af5817ad0cae3c6ecf7fb389 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 5 Feb 2006 21:43:57 +0000
Subject: NTFS: We have struct kmem_cache now so use it instead of the typedef.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ntfs.h  | 10 +++++-----
 fs/ntfs/super.c | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index 446b5014115c..653d2a5c4899 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -50,11 +50,11 @@ typedef enum {
 /* Global variables. */
 
 /* Slab caches (from super.c). */
-extern kmem_cache_t *ntfs_name_cache;
-extern kmem_cache_t *ntfs_inode_cache;
-extern kmem_cache_t *ntfs_big_inode_cache;
-extern kmem_cache_t *ntfs_attr_ctx_cache;
-extern kmem_cache_t *ntfs_index_ctx_cache;
+extern struct kmem_cache *ntfs_name_cache;
+extern struct kmem_cache *ntfs_inode_cache;
+extern struct kmem_cache *ntfs_big_inode_cache;
+extern struct kmem_cache *ntfs_attr_ctx_cache;
+extern struct kmem_cache *ntfs_index_ctx_cache;
 
 /* The various operations structs defined throughout the driver files. */
 extern struct address_space_operations ntfs_aops;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c3a3f1a8310b..e9c0d80dfab1 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2987,14 +2987,14 @@ err_out_now:
  * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
  * (255) Unicode characters + a terminating NULL Unicode character.
  */
-kmem_cache_t *ntfs_name_cache;
+struct kmem_cache *ntfs_name_cache;
 
 /* Slab caches for efficient allocation/deallocation of inodes. */
-kmem_cache_t *ntfs_inode_cache;
-kmem_cache_t *ntfs_big_inode_cache;
+struct kmem_cache *ntfs_inode_cache;
+struct kmem_cache *ntfs_big_inode_cache;
 
 /* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
+static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep,
 		unsigned long flags)
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
@@ -3008,8 +3008,8 @@ static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep,
  * Slab caches to optimize allocations and deallocations of attribute search
  * contexts and index contexts, respectively.
  */
-kmem_cache_t *ntfs_attr_ctx_cache;
-kmem_cache_t *ntfs_index_ctx_cache;
+struct kmem_cache *ntfs_attr_ctx_cache;
+struct kmem_cache *ntfs_index_ctx_cache;
 
 /* Driver wide semaphore. */
 DECLARE_MUTEX(ntfs_lock);
-- 
cgit v1.2.2


From 3672b638ec1d5b1020ea27986060b830f09c96c1 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Fri, 24 Feb 2006 09:55:07 +0000
Subject: NTFS: - Cope with attribute list attribute having invalid flags. 
 Windows copes with this and even chkdsk does not detect or fix this 	so we
 have to cope with it, too.  Thanks to Pawel Kot for reporting 	the problem.  
     - Miscellaneous updates to layout.h.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog | 28 +++++++++++++++++++---------
 fs/ntfs/inode.c   | 49 +++++++++++++++++++++++++++++++++++++++----------
 fs/ntfs/layout.h  | 25 ++++++++++++++++++-------
 3 files changed, 76 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 02f44094bda9..4a62201e8441 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,9 +1,9 @@
 ToDo/Notes:
 	- Find and fix bugs.
 	- The only places in the kernel where a file is resized are
-	  ntfs_file_write*() and ntfs_truncate() for both of which i_sem is
+	  ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
 	  held.  Just have to be careful in read-/writepage and other helpers
-	  not running under i_sem that we play nice...  Also need to be careful
+	  not running under i_mutex that we play nice.  Also need to be careful
 	  with initialized_size extension in ntfs_file_write*() and writepage.
 	  UPDATE: The only things that need to be checked are the compressed
 	  write and the other attribute resize/write cases like index
@@ -19,6 +19,16 @@ ToDo/Notes:
 	- Enable the code for setting the NT4 compatibility flag when we start
 	  making NTFS 1.2 specific modifications.
 
+2.1.26 - Minor bug fixes and updates.
+
+	- We have struct kmem_cache now so use it instead of the typedef
+	  kmem_cache_t.  (Pekka Enberg)
+	- Miscellaneous updates to layout.h.
+	- Cope with attribute list attribute having invalid flags.  Windows
+	  copes with this and even chkdsk does not detect or fix this so we
+	  have to cope with it, too.  Thanks to Pawel Kot for reporting the
+	  problem.
+
 2.1.25 - (Almost) fully implement write(2) and truncate(2).
 
 	- Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
@@ -373,7 +383,7 @@ ToDo/Notes:
 	  single one of them had an mst error.  (Thanks to Ken MacFerrin for
 	  the bug report.)
 	- Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
-	  where we failed to release i_sem on the $Quota/$Q attribute inode.
+	  where we failed to release i_mutex on the $Quota/$Q attribute inode.
 	- Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
 	- Add mapping of unmapped buffers to all remaining code paths, i.e.
 	  fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
@@ -874,7 +884,7 @@ ToDo/Notes:
 	  clusters. (Philipp Thomas)
 	- attrib.c::load_attribute_list(): Fix bug when initialized_size is a
 	  multiple of the block_size but not the cluster size. (Szabolcs
-	  Szakacsits <szaka@sienet.hu>)
+	  Szakacsits)
 
 2.1.2 - Important bug fixes aleviating the hangs in statfs.
 
@@ -884,7 +894,7 @@ ToDo/Notes:
 
 	- Add handling for initialized_size != data_size in compressed files.
 	- Reduce function local stack usage from 0x3d4 bytes to just noise in
-	  fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>)
+	  fs/ntfs/upcase.c. (Randy Dunlap)
 	- Remove compiler warnings for newer gcc.
 	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
 	  around calls to ->{prepare,commit}_write.  Adapt NTFS appropriately
@@ -1201,11 +1211,11 @@ ToDo/Notes:
 	  the kernel. We probably want a kernel generic init_address_space()
 	  function...
 	- Drop BKL from ntfs_readdir() after consultation with Al Viro. The
-	  only caller of ->readdir() is vfs_readdir() which holds i_sem during
-	  the call, and i_sem is sufficient protection against changes in the
-	  directory inode (including ->i_size).
+	  only caller of ->readdir() is vfs_readdir() which holds i_mutex
+	  during the call, and i_mutex is sufficient protection against changes
+	  in the directory inode (including ->i_size).
 	- Use generic_file_llseek() for directories (as opposed to
-	  default_llseek()) as this downs i_sem instead of the BKL which is
+	  default_llseek()) as this downs i_mutex instead of the BKL which is
 	  what we now need for exclusion against ->f_pos changes considering we
 	  no longer take the BKL in ntfs_readdir().
 
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ea1bd3feea1b..55263b7de9c0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -677,13 +677,28 @@ static int ntfs_read_locked_inode(struct inode *vi)
 		ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
 		NInoSetAttrList(ni);
 		a = ctx->attr;
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_COMPRESSION_MASK ||
-				a->flags & ATTR_IS_SPARSE) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			ntfs_error(vi->i_sb, "Attribute list attribute is "
-					"compressed/encrypted/sparse.");
+					"compressed.");
 			goto unm_err_out;
 		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(vi->i_sb, "Non-resident attribute "
+						"list attribute is encrypted/"
+						"sparse.");
+				goto unm_err_out;
+			}
+			ntfs_warning(vi->i_sb, "Resident attribute list "
+					"attribute in inode 0x%lx is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.",
+					vi->i_ino);
+		}
 		/* Now allocate memory for the attribute list. */
 		ni->attr_list_size = (u32)ntfs_attr_size(a);
 		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
@@ -1809,19 +1824,33 @@ int ntfs_read_inode_mount(struct inode *vi)
 	} else /* if (!err) */ {
 		ATTR_LIST_ENTRY *al_entry, *next_al_entry;
 		u8 *al_end;
+		static const char *es = "  Not allowed.  $MFT is corrupt.  "
+				"You should run chkdsk.";
 
 		ntfs_debug("Attribute list attribute found in $MFT.");
 		NInoSetAttrList(ni);
 		a = ctx->attr;
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_COMPRESSION_MASK ||
-				a->flags & ATTR_IS_SPARSE) {
+		if (a->flags & ATTR_COMPRESSION_MASK) {
 			ntfs_error(sb, "Attribute list attribute is "
-					"compressed/encrypted/sparse. Not "
-					"allowed. $MFT is corrupt. You should "
-					"run chkdsk.");
+					"compressed.%s", es);
 			goto put_err_out;
 		}
+		if (a->flags & ATTR_IS_ENCRYPTED ||
+				a->flags & ATTR_IS_SPARSE) {
+			if (a->non_resident) {
+				ntfs_error(sb, "Non-resident attribute list "
+						"attribute is encrypted/"
+						"sparse.%s", es);
+				goto put_err_out;
+			}
+			ntfs_warning(sb, "Resident attribute list attribute "
+					"in $MFT system file is marked "
+					"encrypted/sparse which is not true.  "
+					"However, Windows allows this and "
+					"chkdsk does not detect or correct it "
+					"so we will just ignore the invalid "
+					"flags and pretend they are not set.");
+		}
 		/* Now allocate memory for the attribute list. */
 		ni->attr_list_size = (u32)ntfs_attr_size(a);
 		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index f5678d5d7919..bb408d4dcbb0 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -838,15 +838,19 @@ enum {
 	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
 	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
 	   is used to to obtain all flags that are valid for setting. */
-
 	/*
-	 * The following flags are only present in the FILE_NAME attribute (in
+	 * The following flag is only present in the FILE_NAME attribute (in
 	 * the field file_attributes).
 	 */
 	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= const_cpu_to_le32(0x10000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this is a directory or not, i.e. whether it has
 	   an index root attribute or not. */
+	/*
+	 * The following flag is present both in the STANDARD_INFORMATION
+	 * attribute and in the FILE_NAME attribute (in the field
+	 * file_attributes).
+	 */
 	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= const_cpu_to_le32(0x20000000),
 	/* Note, this is a copy of the corresponding bit from the mft record,
 	   telling us whether this file has a view index present (eg. object id
@@ -1071,9 +1075,15 @@ typedef struct {
 					   modified. */
 /* 20*/	sle64 last_access_time;		/* Time this mft record was last
 					   accessed. */
-/* 28*/	sle64 allocated_size;		/* Byte size of allocated space for the
-					   data attribute. NOTE: Is a multiple
-					   of the cluster size. */
+/* 28*/	sle64 allocated_size;		/* Byte size of on-disk allocated space
+					   for the data attribute.  So for
+					   normal $DATA, this is the
+					   allocated_size from the unnamed
+					   $DATA attribute and for compressed
+					   and/or sparse $DATA, this is the
+					   compressed_size from the unnamed
+					   $DATA attribute.  NOTE: This is a
+					   multiple of the cluster size. */
 /* 30*/	sle64 data_size;		/* Byte size of actual data in data
 					   attribute. */
 /* 38*/	FILE_ATTR_FLAGS file_attributes;	/* Flags describing the file. */
@@ -1904,12 +1914,13 @@ enum {
 	VOLUME_DELETE_USN_UNDERWAY	= const_cpu_to_le16(0x0010),
 	VOLUME_REPAIR_OBJECT_ID		= const_cpu_to_le16(0x0020),
 
+	VOLUME_CHKDSK_UNDERWAY		= const_cpu_to_le16(0x4000),
 	VOLUME_MODIFIED_BY_CHKDSK	= const_cpu_to_le16(0x8000),
 
-	VOLUME_FLAGS_MASK		= const_cpu_to_le16(0x803f),
+	VOLUME_FLAGS_MASK		= const_cpu_to_le16(0xc03f),
 
 	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= const_cpu_to_le16(0x8027),
+	VOLUME_MUST_MOUNT_RO_MASK	= const_cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 
 typedef le16 VOLUME_FLAGS;
-- 
cgit v1.2.2


From 78af34f03d33d2ba179c9d35685860170b94a285 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Fri, 24 Feb 2006 10:32:33 +0000
Subject: NTFS: Implement support for sector sizes above 512 bytes (up to the
 maximum       supported by NTFS which is 4096 bytes).

---
 fs/ntfs/ChangeLog |   6 +++
 fs/ntfs/aops.c    |  18 +++----
 fs/ntfs/file.c    |   8 +--
 fs/ntfs/mft.c     |   8 +--
 fs/ntfs/super.c   | 151 +++++++++++++++++++++++++++++++++++-------------------
 5 files changed, 121 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 4a62201e8441..e66b4ac2fade 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -21,8 +21,14 @@ ToDo/Notes:
 
 2.1.26 - Minor bug fixes and updates.
 
+	- Fix a potential overflow in file.c where a cast to s64 was missing in
+	  a left shift of a page index.
+	- The struct inode has had its i_sem semaphore changed to a mutex named
+	  i_mutex.
 	- We have struct kmem_cache now so use it instead of the typedef
 	  kmem_cache_t.  (Pekka Enberg)
+	- Implement support for sector sizes above 512 bytes (up to the maximum
+	  supported by NTFS which is 4096 bytes).
 	- Miscellaneous updates to layout.h.
 	- Cope with attribute list attribute having invalid flags.  Windows
 	  copes with this and even chkdsk does not detect or fix this so we
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 1c0a4315876a..7e361da770b3 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
  * aops.c - NTFS kernel address space operations and page cache handling.
  *	    Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -200,8 +200,8 @@ static int ntfs_read_block(struct page *page)
 	/* $MFT/$DATA must have its complete runlist in memory at all times. */
 	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
 
-	blocksize_bits = VFS_I(ni)->i_blkbits;
-	blocksize = 1 << blocksize_bits;
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
 
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, blocksize, 0);
@@ -569,10 +569,8 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(NInoMstProtected(ni));
-
-	blocksize_bits = vi->i_blkbits;
-	blocksize = 1 << blocksize_bits;
-
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
 	if (!page_has_buffers(page)) {
 		BUG_ON(!PageUptodate(page));
 		create_empty_buffers(page, blocksize,
@@ -949,8 +947,8 @@ static int ntfs_write_mst_block(struct page *page,
 	 */
 	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
 			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
-	bh_size_bits = vi->i_blkbits;
-	bh_size = 1 << bh_size_bits;
+	bh_size = vol->sb->s_blocksize;
+	bh_size_bits = vol->sb->s_blocksize_bits;
 	max_bhs = PAGE_CACHE_SIZE / bh_size;
 	BUG_ON(!max_bhs);
 	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
@@ -1596,7 +1594,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 
 	BUG_ON(!PageUptodate(page));
 	end = ofs + ni->itype.index.block_size;
-	bh_size = 1 << VFS_I(ni)->i_blkbits;
+	bh_size = VFS_I(ni)->i_sb->s_blocksize;
 	spin_lock(&mapping->private_lock);
 	if (unlikely(!page_has_buffers(page))) {
 		spin_unlock(&mapping->private_lock);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3a119a87686a..5027d3d1b3fe 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
  * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -529,8 +529,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
 			vi->i_ino, ni->type, pages[0]->index, nr_pages,
 			(long long)pos, bytes);
-	blocksize_bits = vi->i_blkbits;
-	blocksize = 1 << blocksize_bits;
+	blocksize = vol->sb->s_blocksize;
+	blocksize_bits = vol->sb->s_blocksize_bits;
 	u = 0;
 	do {
 		struct page *page = pages[u];
@@ -1525,7 +1525,7 @@ static inline int ntfs_commit_pages_after_non_resident_write(
 
 	vi = pages[0]->mapping->host;
 	ni = NTFS_I(vi);
-	blocksize = 1 << vi->i_blkbits;
+	blocksize = vi->i_sb->s_blocksize;
 	end = pos + bytes;
 	u = 0;
 	do {
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0c65cbb8c5cf..6499aafc2258 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -473,7 +473,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	runlist_element *rl;
 	unsigned int block_start, block_end, m_start, m_end, page_ofs;
 	int i_bhs, nr_bhs, err = 0;
-	unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
 
 	ntfs_debug("Entering for inode 0x%lx.", mft_no);
 	BUG_ON(!max_bhs);
@@ -672,8 +672,8 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 {
 	ntfs_volume *vol = ni->vol;
 	struct page *page = ni->page;
-	unsigned char blocksize_bits = vol->mft_ino->i_blkbits;
-	unsigned int blocksize = 1 << blocksize_bits;
+	unsigned int blocksize = vol->sb->s_blocksize;
+	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
 	int max_bhs = vol->mft_record_size / blocksize;
 	struct buffer_head *bhs[max_bhs];
 	struct buffer_head *bh, *head;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index e9c0d80dfab1..489f7049146b 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
 /*
  * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2001,2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -22,6 +22,7 @@
 
 #include <linux/stddef.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>	/* For bdev_hardsect_size(). */
@@ -641,7 +642,7 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
 {
 	const char *read_err_str = "Unable to read %s boot sector.";
 	struct buffer_head *bh_primary, *bh_backup;
-	long nr_blocks = NTFS_SB(sb)->nr_blocks;
+	sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
 
 	/* Try to read primary boot sector. */
 	if ((bh_primary = sb_bread(sb, 0))) {
@@ -688,13 +689,18 @@ hotfix_primary_boot_sector:
 		/*
 		 * If we managed to read sector zero and the volume is not
 		 * read-only, copy the found, valid backup boot sector to the
-		 * primary boot sector.
+		 * primary boot sector.  Note we only copy the actual boot
+		 * sector structure, not the actual whole device sector as that
+		 * may be bigger and would potentially damage the $Boot system
+		 * file (FIXME: Would be nice to know if the backup boot sector
+		 * on a large sector device contains the whole boot loader or
+		 * just the first 512 bytes).
 		 */
 		if (!(sb->s_flags & MS_RDONLY)) {
 			ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
 					"boot sector from backup copy.");
 			memcpy(bh_primary->b_data, bh_backup->b_data,
-					sb->s_blocksize);
+					NTFS_BLOCK_SIZE);
 			mark_buffer_dirty(bh_primary);
 			sync_dirty_buffer(bh_primary);
 			if (buffer_uptodate(bh_primary)) {
@@ -733,9 +739,13 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 			vol->sector_size);
 	ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
 			vol->sector_size_bits);
-	if (vol->sector_size != vol->sb->s_blocksize)
-		ntfs_warning(vol->sb, "The boot sector indicates a sector size "
-				"different from the device sector size.");
+	if (vol->sector_size < vol->sb->s_blocksize) {
+		ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
+				"device block size (%lu).  This is not "
+				"supported.  Sorry.", vol->sector_size,
+				vol->sb->s_blocksize);
+		return FALSE;
+	}
 	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
 	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
 	ntfs_debug("sectors_per_cluster_bits = 0x%x",
@@ -748,16 +758,11 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
 			vol->cluster_size);
 	ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
-	ntfs_debug("vol->cluster_size_bits = %i (0x%x)",
-			vol->cluster_size_bits, vol->cluster_size_bits);
-	if (vol->sector_size > vol->cluster_size) {
-		ntfs_error(vol->sb, "Sector sizes above the cluster size are "
-				"not supported.  Sorry.");
-		return FALSE;
-	}
-	if (vol->sb->s_blocksize > vol->cluster_size) {
-		ntfs_error(vol->sb, "Cluster sizes smaller than the device "
-				"sector size are not supported.  Sorry.");
+	ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
+	if (vol->cluster_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->cluster_size, vol->sector_size);
 		return FALSE;
 	}
 	clusters_per_mft_record = b->clusters_per_mft_record;
@@ -786,11 +791,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	 * we store $MFT/$DATA, the table of mft records in the page cache.
 	 */
 	if (vol->mft_record_size > PAGE_CACHE_SIZE) {
-		ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the "
-				"page cache size on your system %lu (0x%lx).  "
+		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
+				"PAGE_CACHE_SIZE on your system (%lu).  "
 				"This is not supported.  Sorry.",
-				vol->mft_record_size, vol->mft_record_size,
-				PAGE_CACHE_SIZE, PAGE_CACHE_SIZE);
+				vol->mft_record_size, PAGE_CACHE_SIZE);
+		return FALSE;
+	}
+	/* We cannot support mft record sizes below the sector size. */
+	if (vol->mft_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
+				"sector size (%i).  This is not supported.  "
+				"Sorry.", vol->mft_record_size,
+				vol->sector_size);
 		return FALSE;
 	}
 	clusters_per_index_record = b->clusters_per_index_record;
@@ -816,6 +828,14 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
 			vol->index_record_size_bits,
 			vol->index_record_size_bits);
+	/* We cannot support index record sizes below the sector size. */
+	if (vol->index_record_size < vol->sector_size) {
+		ntfs_error(vol->sb, "Index record size (%i) is smaller than "
+				"the sector size (%i).  This is not "
+				"supported.  Sorry.", vol->index_record_size,
+				vol->sector_size);
+		return FALSE;
+	}
 	/*
 	 * Get the size of the volume in clusters and check for 64-bit-ness.
 	 * Windows currently only uses 32 bits to save the clusters so we do
@@ -845,15 +865,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	}
 	ll = sle64_to_cpu(b->mft_lcn);
 	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFT LCN is beyond end of volume.  Weird.");
+		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
+				"volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
 		return FALSE;
 	}
 	vol->mft_lcn = ll;
 	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
 	ll = sle64_to_cpu(b->mftmirr_lcn);
 	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume.  "
-				"Weird.");
+		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
+				"of volume.  Weird.", (unsigned long long)ll,
+				(unsigned long long)ll);
 		return FALSE;
 	}
 	vol->mftmirr_lcn = ll;
@@ -2685,7 +2708,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	ntfs_volume *vol;
 	struct buffer_head *bh;
 	struct inode *tmp_ino;
-	int result;
+	int blocksize, result;
 
 	ntfs_debug("Entering.");
 #ifndef NTFS_RW
@@ -2724,60 +2747,85 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	if (!parse_options(vol, (char*)opt))
 		goto err_out_now;
 
+	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
+	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+		if (!silent)
+			ntfs_error(sb, "Device has unsupported sector size "
+					"(%i).  The maximum supported sector "
+					"size on this architecture is %lu "
+					"bytes.",
+					bdev_hardsect_size(sb->s_bdev),
+					PAGE_CACHE_SIZE);
+		goto err_out_now;
+	}
 	/*
-	 * TODO: Fail safety check. In the future we should really be able to
-	 * cope with this being the case, but for now just bail out.
+	 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
+	 * sector size, whichever is bigger.
 	 */
-	if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) {
+	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
+	if (blocksize < NTFS_BLOCK_SIZE) {
 		if (!silent)
-			ntfs_error(sb, "Device has unsupported hardsect_size.");
+			ntfs_error(sb, "Unable to set device block size.");
 		goto err_out_now;
 	}
-
-	/* Setup the device access block size to NTFS_BLOCK_SIZE. */
-	if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) {
+	BUG_ON(blocksize != sb->s_blocksize);
+	ntfs_debug("Set device block size to %i bytes (block size bits %i).",
+			blocksize, sb->s_blocksize_bits);
+	/* Determine the size of the device in units of block_size bytes. */
+	if (!i_size_read(sb->s_bdev->bd_inode)) {
 		if (!silent)
-			ntfs_error(sb, "Unable to set block size.");
+			ntfs_error(sb, "Unable to determine device size.");
 		goto err_out_now;
 	}
-
-	/* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */
 	vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
-			NTFS_BLOCK_SIZE_BITS;
-
+			sb->s_blocksize_bits;
 	/* Read the boot sector and return unlocked buffer head to it. */
 	if (!(bh = read_ntfs_boot_sector(sb, silent))) {
 		if (!silent)
 			ntfs_error(sb, "Not an NTFS volume.");
 		goto err_out_now;
 	}
-
 	/*
-	 * Extract the data from the boot sector and setup the ntfs super block
+	 * Extract the data from the boot sector and setup the ntfs volume
 	 * using it.
 	 */
 	result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
-
-	/* Initialize the cluster and mft allocators. */
-	ntfs_setup_allocators(vol);
-
 	brelse(bh);
-
 	if (!result) {
 		if (!silent)
 			ntfs_error(sb, "Unsupported NTFS filesystem.");
 		goto err_out_now;
 	}
-
 	/*
-	 * TODO: When we start coping with sector sizes different from
-	 * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the
-	 * device (probably to NTFS_BLOCK_SIZE).
+	 * If the boot sector indicates a sector size bigger than the current
+	 * device block size, switch the device block size to the sector size.
+	 * TODO: It may be possible to support this case even when the set
+	 * below fails, we would just be breaking up the i/o for each sector
+	 * into multiple blocks for i/o purposes but otherwise it should just
+	 * work.  However it is safer to leave disabled until someone hits this
+	 * error message and then we can get them to try it without the setting
+	 * so we know for sure that it works.
 	 */
-
+	if (vol->sector_size > blocksize) {
+		blocksize = sb_set_blocksize(sb, vol->sector_size);
+		if (blocksize != vol->sector_size) {
+			if (!silent)
+				ntfs_error(sb, "Unable to set device block "
+						"size to sector size (%i).",
+						vol->sector_size);
+			goto err_out_now;
+		}
+		BUG_ON(blocksize != sb->s_blocksize);
+		vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >>
+				sb->s_blocksize_bits;
+		ntfs_debug("Changed device block size to %i bytes (block size "
+				"bits %i) to match volume sector size.",
+				blocksize, sb->s_blocksize_bits);
+	}
+	/* Initialize the cluster and mft allocators. */
+	ntfs_setup_allocators(vol);
 	/* Setup remaining fields in the super block. */
 	sb->s_magic = NTFS_SB_MAGIC;
-
 	/*
 	 * Ntfs allows 63 bits for the file size, i.e. correct would be:
 	 *	sb->s_maxbytes = ~0ULL >> 1;
@@ -2787,9 +2835,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 	 * without overflowing the index or to 2^63 - 1, whichever is smaller.
 	 */
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-
+	/* Ntfs measures time in 100ns intervals. */
 	sb->s_time_gran = 100;
-
 	/*
 	 * Now load the metadata required for the page cache and our address
 	 * space operations to function. We do this by setting up a specialised
-- 
cgit v1.2.2


From 1cf3109ffb26a6ea572fd02436bd10458b4b2187 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Fri, 24 Feb 2006 10:48:14 +0000
Subject: NTFS: Do more detailed reporting of why we cannot mount read-write by
      special casing the VOLUME_MODIFIED_BY_CHKDSK flag.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |  2 ++
 fs/ntfs/Makefile  |  2 +-
 fs/ntfs/super.c   | 34 +++++++++++++++++++++++++++-------
 fs/ntfs/upcase.c  | 10 ++++------
 fs/ntfs/volume.h  | 28 +++++++++++++---------------
 5 files changed, 47 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index e66b4ac2fade..9d8ffa89e2c2 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -29,6 +29,8 @@ ToDo/Notes:
 	  kmem_cache_t.  (Pekka Enberg)
 	- Implement support for sector sizes above 512 bytes (up to the maximum
 	  supported by NTFS which is 4096 bytes).
+	- Do more detailed reporting of why we cannot mount read-write by
+	  special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
 	- Miscellaneous updates to layout.h.
 	- Cope with attribute list attribute having invalid flags.  Windows
 	  copes with this and even chkdsk does not detect or fix this so we
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index d0d45d1c853a..d95fac7fdeb6 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.26\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 489f7049146b..368a8ec10668 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -472,9 +472,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
 			return -EROFS;
 		}
+		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			ntfs_error(sb, "Volume has been modified by chkdsk "
+					"and is read-only%s", es);
+			return -EROFS;
+		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-			ntfs_error(sb, "Volume has unsupported flags set and "
-					"is read-only%s", es);
+			ntfs_error(sb, "Volume has unsupported flags set "
+					"(0x%x) and is read-only%s",
+					(unsigned)le16_to_cpu(vol->vol_flags),
+					es);
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
@@ -1845,11 +1852,24 @@ get_ctx_vol_failed:
 	/* Make sure that no unsupported volume flags are set. */
 	if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
 		static const char *es1a = "Volume is dirty";
-		static const char *es1b = "Volume has unsupported flags set";
-		static const char *es2 = ".  Run chkdsk and mount in Windows.";
-		const char *es1;
-		
-		es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b;
+		static const char *es1b = "Volume has been modified by chkdsk";
+		static const char *es1c = "Volume has unsupported flags set";
+		static const char *es2a = ".  Run chkdsk and mount in Windows.";
+		static const char *es2b = ".  Mount in Windows.";
+		const char *es1, *es2;
+
+		es2 = es2a;
+		if (vol->vol_flags & VOLUME_IS_DIRTY)
+			es1 = es1a;
+		else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+			es1 = es1b;
+			es2 = es2b;
+		} else {
+			es1 = es1c;
+			ntfs_warning(sb, "Unsupported volume flags 0x%x "
+					"encountered.",
+					(unsigned)le16_to_cpu(vol->vol_flags));
+		}
 		/* If a read-write mount, convert it to a read-only mount. */
 		if (!(sb->s_flags & MS_RDONLY)) {
 			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 879cdf1d5bd3..9101807dc81a 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -3,10 +3,7 @@
  *	      Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2004 Anton Altaparmakov
- *
- * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov.
- * Modified for kernel inclusion 10 September 2001 by Anton Altparmakov.
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -75,12 +72,13 @@ ntfschar *generate_default_upcase(void)
 	if (!uc)
 		return uc;
 	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+	/* Generate the little endian Unicode upcase table used by ntfs. */
 	for (i = 0; i < default_upcase_len; i++)
 		uc[i] = cpu_to_le16(i);
 	for (r = 0; uc_run_table[r][0]; r++)
 		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
-			uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) +
-					uc_run_table[r][2]));
+			uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) +
+					uc_run_table[r][2]);
 	for (r = 0; uc_dup_table[r][0]; r++)
 		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
 			uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 375cd20a9f61..406ab55dfb32 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -2,7 +2,7 @@
  * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
  *	      of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2006 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -41,10 +41,8 @@ typedef struct {
 	 * structure has stabilized... (AIA)
 	 */
 	/* Device specifics. */
-	struct super_block *sb;		/* Pointer back to the super_block,
-					   so we don't have to get the offset
-					   every time. */
-	LCN nr_blocks;			/* Number of NTFS_BLOCK_SIZE bytes
+	struct super_block *sb;		/* Pointer back to the super_block. */
+	LCN nr_blocks;			/* Number of sb->s_blocksize bytes
 					   sized blocks on the device. */
 	/* Configuration provided by user at mount time. */
 	unsigned long flags;		/* Miscellaneous flags, see below. */
@@ -141,8 +139,8 @@ typedef enum {
 	NV_ShowSystemFiles,	/* 1: Return system files in ntfs_readdir(). */
 	NV_CaseSensitive,	/* 1: Treat file names as case sensitive and
 				      create filenames in the POSIX namespace.
-				      Otherwise be case insensitive and create
-				      file names in WIN32 namespace. */
+				      Otherwise be case insensitive but still
+				      create file names in POSIX namespace. */
 	NV_LogFileEmpty,	/* 1: $LogFile journal is empty. */
 	NV_QuotaOutOfDate,	/* 1: $Quota is out of date. */
 	NV_UsnJrnlStamped,	/* 1: $UsnJrnl has been stamped. */
@@ -153,7 +151,7 @@ typedef enum {
  * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
  * functions.
  */
-#define NVOL_FNS(flag)					\
+#define DEFINE_NVOL_BIT_OPS(flag)					\
 static inline int NVol##flag(ntfs_volume *vol)		\
 {							\
 	return test_bit(NV_##flag, &(vol)->flags);	\
@@ -168,12 +166,12 @@ static inline void NVolClear##flag(ntfs_volume *vol)	\
 }
 
 /* Emit the ntfs volume bitops functions. */
-NVOL_FNS(Errors)
-NVOL_FNS(ShowSystemFiles)
-NVOL_FNS(CaseSensitive)
-NVOL_FNS(LogFileEmpty)
-NVOL_FNS(QuotaOutOfDate)
-NVOL_FNS(UsnJrnlStamped)
-NVOL_FNS(SparseEnabled)
+DEFINE_NVOL_BIT_OPS(Errors)
+DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
+DEFINE_NVOL_BIT_OPS(CaseSensitive)
+DEFINE_NVOL_BIT_OPS(LogFileEmpty)
+DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
+DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
+DEFINE_NVOL_BIT_OPS(SparseEnabled)
 
 #endif /* _LINUX_NTFS_VOLUME_H */
-- 
cgit v1.2.2


From c04030e16dbea2f7581f82cc6688695927f6ac5b Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Fri, 24 Feb 2006 13:04:21 -0800
Subject: [PATCH] flags parameter for linkat

I'm currently at the POSIX meeting and one thing covered was the
incompatibility of Linux's link() with the POSIX definition.  The name.
Linux does not follow symlinks, POSIX requires it does.

Even if somebody thinks this is a good default behavior we cannot change this
because it would break the ABI.  But the fact remains that some application
might want this behavior.

We have one chance to help implementing this without breaking the behavior.
 For this we could use the new linkat interface which would need a new
flags parameter.  If the new parameter is AT_SYMLINK_FOLLOW the new
behavior could be invoked.

I do not want to introduce such a patch now.  But we could add the
parameter now, just don't use it.  The patch below would do this.  Can we
get this late patch applied before the release more or less fixes the
syscall API?

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e28de846c591..557dcf395ca1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2224,13 +2224,17 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * and other special files.  --ADM
  */
 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
-			   int newdfd, const char __user *newname)
+			   int newdfd, const char __user *newname,
+			   int flags)
 {
 	struct dentry *new_dentry;
 	struct nameidata nd, old_nd;
 	int error;
 	char * to;
 
+	if (flags != 0)
+		return -EINVAL;
+
 	to = getname(newname);
 	if (IS_ERR(to))
 		return PTR_ERR(to);
@@ -2263,7 +2267,7 @@ exit:
 
 asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
 {
-	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname);
+	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
 /*
-- 
cgit v1.2.2


From 8dde0509e74ff6044cf1788c917a22facce9f68d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Feb 2006 13:04:23 -0800
Subject: [PATCH] ramfs: update dir mtime and ctime

Phil Marek <philipp.marek@bmlv.gv.at> points out that ramfs forgets to update
a directory's mtime and ctime when it is modified.

Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c66bd5e4c05c..cde5d48994ae 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
@@ -104,6 +105,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 		d_instantiate(dentry, inode);
 		dget(dentry);	/* Extra count - pin the dentry in core */
 		error = 0;
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	}
 	return error;
 }
-- 
cgit v1.2.2


From 5342fba5412cead88b61ead07168615dbeba1ee3 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Sun, 26 Feb 2006 04:18:28 +0100
Subject: [PATCH] x86_64: Check for bad elf entry address.

Fixes a local DOS on Intel systems that lead to an endless
recursive fault.  AMD machines don't seem to be affected.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b117a441298..c2eac2a50bd2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -938,6 +938,11 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		kfree(elf_interpreter);
 	} else {
 		elf_entry = loc->elf_ex.e_entry;
+		if (BAD_ADDR(elf_entry)) {
+			send_sig(SIGSEGV, current, 0);
+			retval = -ENOEXEC; /* Nobody gets to see this, but.. */
+			goto out_free_dentry;
+		}
 	}
 
 	kfree(elf_phdata);
-- 
cgit v1.2.2


From fc5870f66279fabedc9dbba7c28451bbb8f47778 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sun, 26 Feb 2006 04:18:55 +0100
Subject: [PATCH] x86_64: Fix ioctl compat code for /dev/rtc

RTC_IRQP_SET/RTC_EPOCH_SET don't take a pointer to an argument, but the
argument itself.  This actually simplifies the code and makes it work.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat_ioctl.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 057e60217fc5..537ac70edfe5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2531,18 +2531,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
 		val32 = kval;
 		return put_user(val32, (unsigned int __user *)arg);
 	case RTC_IRQP_SET32:
+		return sys_ioctl(fd, RTC_IRQP_SET, arg); 
 	case RTC_EPOCH_SET32:
-		ret = get_user(val32, (unsigned int __user *)arg);
-		if (ret)
-			return ret;
-		kval = val32;
-
-		set_fs(KERNEL_DS);
-		ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ?
-				RTC_IRQP_SET : RTC_EPOCH_SET,
-				(unsigned long)&kval);
-		set_fs(oldfs);
-		return ret;
+		return sys_ioctl(fd, RTC_EPOCH_SET, arg);
 	default:
 		/* unreached */
 		return -ENOIOCTLCMD;
-- 
cgit v1.2.2


From 07ff2fa8fcb3d9207f1c16e5acf9086d5731ed8b Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 28 Feb 2006 12:29:51 +1100
Subject: [XFS] Fix a realtime allocator regression introduced by an old iget
 race fix.  Noticed by Roger Willcocks.

SGI-PV: 949821
SGI-Modid: xfs-linux-melb:xfs-kern:25257a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_rtalloc.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 06fc061c50fc..5b413946b1c5 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -130,7 +130,8 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Lock the inode.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, ino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		XFS_BMAP_INIT(&flist, &firstblock);
 		/*
@@ -170,8 +171,8 @@ xfs_growfs_rt_alloc(
 			/*
 			 * Lock the bitmap inode.
 			 */
-			if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL,
-					&ip)))
+			if ((error = xfs_trans_iget(mp, tp, ino, 0,
+							XFS_ILOCK_EXCL, &ip)))
 				goto error_exit;
 			/*
 			 * Get a buffer for the block.
@@ -2023,8 +2024,8 @@ xfs_growfs_rt(
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino,
-				XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		ASSERT(ip == mp->m_rbmip);
 		/*
@@ -2037,8 +2038,8 @@ xfs_growfs_rt(
 		/*
 		 * Get the summary inode into the transaction.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
-				0, XFS_ILOCK_EXCL, &ip)))
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
+						XFS_ILOCK_EXCL, &ip)))
 			goto error_exit;
 		ASSERT(ip == mp->m_rsumip);
 		/*
@@ -2158,10 +2159,9 @@ xfs_rtallocate_extent(
 	/*
 	 * Lock out other callers by grabbing the bitmap inode lock.
 	 */
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error) {
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
-	}
 	sumbp = NULL;
 	/*
 	 * Allocate by size, or near another block, or exactly at some block.
@@ -2221,10 +2221,9 @@ xfs_rtfree_extent(
 	/*
 	 * Synchronize by locking the bitmap inode.
 	 */
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error) {
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
-	}
 #if defined(__KERNEL__) && defined(DEBUG)
 	/*
 	 * Check to see that this whole range is currently allocated.
@@ -2365,8 +2364,8 @@ xfs_rtpick_extent(
 	__uint64_t	seq;		/* sequence number of file creation */
 	__uint64_t	*seqp;		/* pointer to seqno in inode */
 
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
-	if (error)
+	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+					XFS_ILOCK_EXCL, &ip)))
 		return error;
 	ASSERT(ip == mp->m_rbmip);
 	seqp = (__uint64_t *)&ip->i_d.di_atime;
-- 
cgit v1.2.2


From dae81d4774ecbeb7d24bb9a6a4db9f9baee54d85 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 28 Feb 2006 12:30:13 +1100
Subject: [XFS] Reduce stack use during quota mounts (caused a panic).  This
 regressed recently via the fix for inherited quota inode attributes.

SGI-PV: 947312
SGI-Modid: xfs-linux-melb:xfs-kern:25318a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 53a00fb217fa..7c0e39dc6189 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -68,6 +68,9 @@ kmem_zone_t	*qm_dqzone;
 kmem_zone_t	*qm_dqtrxzone;
 STATIC kmem_shaker_t	xfs_qm_shaker;
 
+STATIC cred_t	xfs_zerocr;
+STATIC xfs_inode_t	xfs_zeroino;
+
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
@@ -1393,8 +1396,6 @@ xfs_qm_qino_alloc(
 	xfs_trans_t	*tp;
 	int		error;
 	unsigned long	s;
-	cred_t		zerocr;
-	xfs_inode_t	zeroino;
 	int		committed;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
@@ -1406,11 +1407,9 @@ xfs_qm_qino_alloc(
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
-	memset(&zerocr, 0, sizeof(zerocr));
-	memset(&zeroino, 0, sizeof(zeroino));
 
-	if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0,
-				   &zerocr, 0, 1, ip, &committed))) {
+	if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0,
+				   &xfs_zerocr, 0, 1, ip, &committed))) {
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
 				 XFS_TRANS_ABORT);
 		return error;
-- 
cgit v1.2.2


From 2353e8e9b6ae29aad77935f21735a30f5cc419b4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sgi.com>
Date: Tue, 28 Feb 2006 12:30:30 +1100
Subject: [XFS] Don't map non-uptodate buffers in xfs_probe_cluster; also fixes
 obscure corruption case

SGI-PV: 942658
SGI-Modid: xfs-linux-melb:xfs-kern:207119a

Signed-off-by: Eric Sandeen <sandeen@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8f2beec526cf..74d8be87f983 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -540,7 +540,7 @@ xfs_probe_cluster(
 
 	/* First sum forwards in this page */
 	do {
-		if (mapped != buffer_mapped(bh))
+		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
 			return total;
 		total += bh->b_size;
 	} while ((bh = bh->b_this_page) != head);
-- 
cgit v1.2.2


From 50322fe7d46b544d5649edb58bdbe5c95dd44b98 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 28 Feb 2006 16:59:03 -0800
Subject: [PATCH] fuse: fix bug in negative lookup

If negative entries (nodeid == 0) were sent in reply to LOOKUP requests,
two bugs could be triggered:

- looking up a negative entry would return -EIO,

- revaildate on an entry which turned negative would send a FORGET
  request with zero nodeid, which would cause an abort() in the
  library.

The above would only happen if the 'negative_timeout=N' option was used,
otherwise lookups reply -ENOENT, which worked correctly.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 21fd59c7bc24..c72a8a97935c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -111,6 +111,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 
 		/* Doesn't hurt to "reset" the validity timeout */
 		fuse_invalidate_entry_cache(entry);
+
+		/* For negative dentries, always do a fresh lookup */
 		if (!inode)
 			return 0;
 
@@ -122,6 +124,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
 		request_send(fc, req);
 		err = req->out.h.error;
+		/* Zero nodeid is same as -ENOENT */
+		if (!err && !outarg.nodeid)
+			err = -ENOENT;
 		if (!err) {
 			struct fuse_inode *fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
@@ -190,8 +195,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
-		     !valid_mode(outarg.attr.mode)))
+	/* Zero nodeid is same as -ENOENT, but with valid timeout */
+	if (!err && outarg.nodeid &&
+	    (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode)))
 		err = -EIO;
 	if (!err && outarg.nodeid) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-- 
cgit v1.2.2


From 0551fbd29e16fccd46e41b7d01bf0f8f39b14212 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 28 Feb 2006 16:59:19 -0800
Subject: [PATCH] Add mm->task_size and fix powerpc vdso

This patch adds mm->task_size to keep track of the task size of a given mm
and uses that to fix the powerpc vdso so that it uses the mm task size to
decide what pages to fault in instead of the current thread flags (which
broke when ptracing).

(akpm: I expect that mm_struct.task_size will become the way in which we
finally sort out the confusion between 32-bit processes and 32-bit mm's.  It
may need tweaks, but at this stage this patch is powerpc-only.)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 0e1c95074d42..0b515ac53134 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 	current->flags &= ~PF_RANDOMIZE;
 	flush_thread();
 
+	/* Set the new mm task size. We have to do that late because it may
+	 * depend on TIF_32BIT which is only updated in flush_thread() on
+	 * some architectures like powerpc
+	 */
+	current->mm->task_size = TASK_SIZE;
+
 	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
 	    file_permission(bprm->file, MAY_READ) ||
 	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
-- 
cgit v1.2.2


From 6b7a6c94c9c15b2664b568ead83e6b3aaf60d65c Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 21 Feb 2006 11:57:30 -0500
Subject: [PATCH] ocfs2: fix -Wformat warnings when building UML on x86-64

 The check to determine which format string is appopriate for u64 and
 friends works in most cases, but UML on x86_64 doesn't define CONFIG_X86_64,
 so it results in screen fulls of compile-time warnings.

 This patch fixes it to handle that case.

 fs/ocfs2/cluster/masklog.h |    2 +-
 1 files changed, 1 insertion(+), 1 deletion(-)

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/masklog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index e8c56a3d9c64..2cadc3009c83 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -256,7 +256,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 	}								\
 } while (0)
 
-#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT))
 #define MLFi64 "lld"
 #define MLFu64 "llu"
 #define MLFx64 "llx"
-- 
cgit v1.2.2


From d3178bcdd41b050e221337d7f5e30b3c58d4015a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 24 Feb 2006 17:23:36 -0800
Subject: [PATCH] ocfs2: remove pointless max journal size limit

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ocfs2_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index dfb8a5bedfc8..c5b1ac547c15 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -138,7 +138,6 @@
 
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
-#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
 
 struct ocfs2_system_inode_info {
 	char	*si_name;
-- 
cgit v1.2.2


From d267a56c883b350a2fa80f1daf4636809e3f8e67 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 23 Feb 2006 13:23:39 -0800
Subject: [PATCH] ocfs2: remove unused code

Remove some #ifdef'd out code which was inadvertantly introduced in our
initial merge.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c  | 51 +--------------------------------------------------
 fs/ocfs2/ocfs2.h |  3 ---
 2 files changed, 1 insertion(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1715bc90e705..8a4048b55fdc 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -933,9 +933,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	loff_t newsize, saved_pos;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -951,14 +948,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		return -EIO;
 	}
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	/* ugh, work around some applications which open everything O_DIRECT +
-	 * O_APPEND and really don't mean to use O_DIRECT. */
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 
-		filp->f_flags &= ~O_DIRECT;
-#endif
-
 	mutex_lock(&inode->i_mutex);
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
 	if (filp->f_flags & O_DIRECT) {
@@ -1079,27 +1068,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    filp->f_flags & O_DIRECT) {
-		unsigned int saved_flags = filp->f_flags;
-		int sector_size = 1 << osb->s_sectsize_bits;
-
-		if ((saved_pos & (sector_size - 1)) ||
-		    (count & (sector_size - 1)) ||
-		    ((unsigned long)buf & (sector_size - 1))) {
-			filp->f_flags |= O_SYNC;
-			filp->f_flags &= ~O_DIRECT;
-		}
-
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-		filp->f_flags = saved_flags;
-	} else
-#endif
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
+	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -1140,9 +1109,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	int ret = 0, rw_level = -1, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
@@ -1155,21 +1121,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		goto bail;
 	}
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		if (filp->f_flags & O_DIRECT) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((pos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1)) ||
-			    (i_size_read(inode) & (sector_size -1))) {
-				filp->f_flags &= ~O_DIRECT;
-			}
-		}
-	}
-#endif
-
 	/* 
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
 	 * need locks to protect pending reads from racing with truncate.
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8d8e4779df92..19360e3d842e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -174,9 +174,6 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
-#endif
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
-- 
cgit v1.2.2


From 362342f68e331f080d0438f08af1e2c570b0b5fe Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 21 Feb 2006 16:46:33 -0800
Subject: [PATCH] ocfs2: remove non existing function prototypes

Remove some prototypes from tcp.h for functions which have long been gone.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index a6f4585501c8..616ff2b8434a 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -85,13 +85,10 @@ enum {
 	O2NET_DRIVER_READY,
 };
 
-int o2net_init_tcp_sock(struct inode *inode);
 int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
 		       u8 target_node, int *status);
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
 			   size_t veclen, u8 target_node, int *status);
-int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
-			    struct inode *group);
 
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 			   o2net_msg_handler_func *func, void *data,
@@ -107,7 +104,5 @@ void o2net_disconnect_node(struct o2nm_node *node);
 
 int o2net_init(void);
 void o2net_exit(void);
-int o2net_proc_init(struct proc_dir_entry *parent);
-void o2net_proc_exit(struct proc_dir_entry *parent);
 
 #endif /* O2CLUSTER_TCP_H */
-- 
cgit v1.2.2


From 895928b8380cc697ac56e9732cedf549c0a4f79c Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 21 Feb 2006 16:54:00 -0800
Subject: [PATCH] ocfs2: complete failure recovery for nodemanager init

 This patch finishes cleaning up the node manager allocations if it fails
 to initialize.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/nodemanager.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cf7828f23361..e1fceb8aa32d 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -756,7 +756,7 @@ static int __init init_o2nm(void)
 	if (!ocfs2_table_header) {
 		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
 		ret = -ENOMEM; /* or something. */
-		goto out;
+		goto out_o2net;
 	}
 
 	ret = o2net_register_hb_callbacks();
@@ -780,6 +780,8 @@ out_callbacks:
 	o2net_unregister_hb_callbacks();
 out_sysctl:
 	unregister_sysctl_table(ocfs2_table_header);
+out_o2net:
+	o2net_exit();
 out:
 	return ret;
 }
-- 
cgit v1.2.2


From b4df6ed8db0c387d38292e31f00adc4cd297ed5a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 22 Feb 2006 17:35:08 -0800
Subject: [PATCH] ocfs2: fix orphan recovery deadlock

Orphan dir recovery can deadlock with another process in
ocfs2_delete_inode() in some corner cases. Fix this by tracking recovery
state more closely and allowing it to handle inode wipes which might
deadlock.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/heartbeat.c |   1 +
 fs/ocfs2/inode.c     |  46 ++++++++++++++++++-
 fs/ocfs2/journal.c   | 124 ++++++++++++++++++++++++++++++++++++++-------------
 fs/ocfs2/ocfs2.h     |   4 ++
 fs/ocfs2/super.c     |  11 +++++
 5 files changed, 154 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0bbd22f46c80..cbfd45a97a63 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb)
 	ocfs2_node_map_init(&osb->mounted_map);
 	ocfs2_node_map_init(&osb->recovery_map);
 	ocfs2_node_map_init(&osb->umount_map);
+	ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
 static void ocfs2_do_node_down(int node_num,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8122489c5762..315472a5c192 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -41,6 +41,7 @@
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
+#include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
@@ -544,6 +545,42 @@ bail:
 	return status;
 }
 
+/* 
+ * Serialize with orphan dir recovery. If the process doing
+ * recovery on this orphan dir does an iget() with the dir
+ * i_mutex held, we'll deadlock here. Instead we detect this
+ * and exit early - recovery will wipe this inode for us.
+ */
+static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
+					     int slot)
+{
+	int ret = 0;
+
+	spin_lock(&osb->osb_lock);
+	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
+		mlog(0, "Recovery is happening on orphan dir %d, will skip "
+		     "this inode\n", slot);
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* This signals to the orphan recovery process that it should
+	 * wait for us to handle the wipe. */
+	osb->osb_orphan_wipes[slot]++;
+out:
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
+					 int slot)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_orphan_wipes[slot]--;
+	spin_unlock(&osb->osb_lock);
+
+	wake_up(&osb->osb_wipe_event);
+}
+
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
@@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	/* We've already voted on this so it should be readonly - no
 	 * spinlock needed. */
 	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+
+	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+	if (status)
+		return status;
+
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
 						       orphaned_slot);
@@ -597,6 +639,7 @@ bail_unlock_dir:
 	brelse(orphan_dir_bh);
 bail:
 	iput(orphan_dir_inode);
+	ocfs2_signal_wipe_completion(osb, orphaned_slot);
 
 	return status;
 }
@@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode)
 
 	status = ocfs2_wipe_inode(inode, di_bh);
 	if (status < 0) {
-		mlog_errno(status);
+		if (status != -EDEADLK)
+			mlog_errno(status);
 		goto bail_unlock_inode;
 	}
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d329c9df90ae..4be801f4559b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1408,21 +1408,17 @@ bail:
 	return status;
 }
 
-static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot)
+static int ocfs2_queue_orphans(struct ocfs2_super *osb,
+			       int slot,
+			       struct inode **head)
 {
-	int status = 0;
-	int have_disk_lock = 0;
-	struct inode *inode = NULL;
-	struct inode *iter;
+	int status;
 	struct inode *orphan_dir_inode = NULL;
+	struct inode *iter;
 	unsigned long offset, blk, local;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = osb->sb;
-	struct ocfs2_inode_info *oi;
-
-	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
@@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	if  (!orphan_dir_inode) {
 		status = -ENOENT;
 		mlog_errno(status);
-		goto out;
-	}
+		return status;
+	}	
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
 	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
 		mlog_errno(status);
 		goto out;
 	}
-	have_disk_lock = 1;
 
 	offset = 0;
 	iter = NULL;
@@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		if (!bh)
 			status = -EINVAL;
 		if (status < 0) {
-			mutex_unlock(&orphan_dir_inode->i_mutex);
 			if (bh)
 				brelse(bh);
 			mlog_errno(status);
-			goto out;
+			goto out_unlock;
 		}
 
 		local = 0;
@@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 			if (!ocfs2_check_dir_entry(orphan_dir_inode,
 						  de, bh, local)) {
-				mutex_unlock(&orphan_dir_inode->i_mutex);
 				status = -EINVAL;
 				mlog_errno(status);
 				brelse(bh);
-				goto out;
+				goto out_unlock;
 			}
 
 			local += le16_to_cpu(de->rec_len);
@@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 			mlog(0, "queue orphan %"MLFu64"\n",
 			     OCFS2_I(iter)->ip_blkno);
-			OCFS2_I(iter)->ip_next_orphan = inode;
-			inode = iter;
+			/* No locking is required for the next_orphan
+			 * queue as there is only ever a single
+			 * process doing orphan recovery. */
+			OCFS2_I(iter)->ip_next_orphan = *head;
+			*head = iter;
 		}
 		brelse(bh);
 	}
-	mutex_unlock(&orphan_dir_inode->i_mutex);
 
+out_unlock:
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
-	have_disk_lock = 0;
-
+out:
+	mutex_unlock(&orphan_dir_inode->i_mutex);
 	iput(orphan_dir_inode);
-	orphan_dir_inode = NULL;
+	return status;
+}
+
+static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
+					      int slot)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = !osb->osb_orphan_wipes[slot];
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
+					     int slot)
+{
+	spin_lock(&osb->osb_lock);
+	/* Mark ourselves such that new processes in delete_inode()
+	 * know to quit early. */
+	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+	while (osb->osb_orphan_wipes[slot]) {
+		/* If any processes are already in the middle of an
+		 * orphan wipe on this dir, then we need to wait for
+		 * them. */
+		spin_unlock(&osb->osb_lock);
+		wait_event_interruptible(osb->osb_wipe_event,
+					 ocfs2_orphan_recovery_can_continue(osb, slot));
+		spin_lock(&osb->osb_lock);
+	}
+	spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
+					      int slot)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+}
+
+/*
+ * Orphan recovery. Each mounted node has it's own orphan dir which we
+ * must run during recovery. Our strategy here is to build a list of
+ * the inodes in the orphan dir and iget/iput them. The VFS does
+ * (most) of the rest of the work.
+ *
+ * Orphan recovery can happen at any time, not just mount so we have a
+ * couple of extra considerations.
+ *
+ * - We grab as many inodes as we can under the orphan dir lock -
+ *   doing iget() outside the orphan dir risks getting a reference on
+ *   an invalid inode.
+ * - We must be sure not to deadlock with other processes on the
+ *   system wanting to run delete_inode(). This can happen when they go
+ *   to lock the orphan dir and the orphan recovery process attempts to
+ *   iget() inside the orphan dir lock. This can be avoided by
+ *   advertising our state to ocfs2_delete_inode().
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	ocfs2_mark_recovering_orphan_dir(osb, slot);
+	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ocfs2_clear_recovering_orphan_dir(osb, slot);
+
+	/* Error here should be noted, but we want to continue with as
+	 * many queued inodes as we've got. */
+	if (ret)
+		mlog_errno(ret);
 
 	while (inode) {
 		oi = OCFS2_I(inode);
@@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		inode = iter;
 	}
 
-out:
-	if (have_disk_lock)
-		ocfs2_meta_unlock(orphan_dir_inode, 0);
-
-	if (orphan_dir_inode)
-		iput(orphan_dir_inode);
-
-	return status;
+	return ret;
 }
 
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 19360e3d842e..e89de9b6e491 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -287,6 +287,10 @@ struct ocfs2_super
 	struct inode			*osb_tl_inode;
 	struct buffer_head		*osb_tl_bh;
 	struct work_struct		osb_truncate_log_wq;
+
+	struct ocfs2_node_map		osb_recovering_orphan_dirs;
+	unsigned int			*osb_orphan_wipes;
+	wait_queue_head_t		osb_wipe_event;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 046824b6b625..8dd3aafec499 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1325,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
 
+	init_waitqueue_head(&osb->osb_wipe_event);
+	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
+					sizeof(*osb->osb_orphan_wipes),
+					GFP_KERNEL);
+	if (!osb->osb_orphan_wipes) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->s_feature_compat =
 		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat =
@@ -1638,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	if (osb->slot_info)
 		ocfs2_free_slot_info(osb->slot_info);
 
+	kfree(osb->osb_orphan_wipes);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
-- 
cgit v1.2.2


From 93cc9ac4555a9b95c78b2f5dfe536fe8196002a7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Feb 2006 16:53:05 -0800
Subject: ocfs2: Set .owner on masklog sysfs attributes.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/masklog.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index fd741cea5705..636593bf4d17 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -74,6 +74,7 @@ struct mlog_attribute {
 #define define_mask(_name) {			\
 	.attr = {				\
 		.name = #_name,			\
+		.owner = THIS_MODULE,		\
 		.mode = S_IRUGO | S_IWUSR,	\
 	},					\
 	.mask = ML_##_name,			\
-- 
cgit v1.2.2


From 110ba90858a7f619ff26c6b9b43c27b3c0872335 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 28 Feb 2006 17:58:36 -0800
Subject: ocfs2: Respond to on-disk corruption in the extent map code.

The extent map code has long noticed when the on-disk extent information
is corrupt.  However, so far it has only returned an error.  We should
take the filesystem read-only, as it is corrupt.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index b6ba292e9544..e6f207eebab4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -181,6 +181,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 			ret = -EBADR;
 			if (rec_end > OCFS2_I(inode)->ip_clusters) {
 				mlog_errno(ret);
+				ocfs2_error(inode->i_sb,
+					    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+					    i,
+					    le64_to_cpu(rec->e_blkno),
+					    OCFS2_I(inode)->ip_blkno,
+					    OCFS2_I(inode)->ip_clusters);
 				goto out_free;
 			}
 
@@ -226,6 +232,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 			ret = -EBADR;
 			if (blkno) {
 				mlog_errno(ret);
+				ocfs2_error(inode->i_sb,
+					    "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
+					    cpos, clusters,
+					    OCFS2_I(inode)->ip_blkno,
+					    blkno, i,
+					    le64_to_cpu(rec->e_blkno));
 				goto out_free;
 			}
 
@@ -238,6 +250,10 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 		 */
 		ret = -EBADR;
 		if (!blkno) {
+			ocfs2_error(inode->i_sb,
+				    "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
+				    cpos, clusters,
+				    OCFS2_I(inode)->ip_blkno);
 			mlog_errno(ret);
 			goto out_free;
 		}
@@ -266,6 +282,20 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 
 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
+
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+		    OCFS2_I(inode)->ip_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+				    i,
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno,
+				    OCFS2_I(inode)->ip_clusters);
+			return ret;
+		}
+
 		ret = ocfs2_extent_map_insert(inode, rec,
 					      le16_to_cpu(el->l_tree_depth));
 		if (ret) {
@@ -526,6 +556,10 @@ static int ocfs2_extent_map_insert(struct inode *inode,
 		    OCFS2_I(inode)->ip_map.em_clusters) {
 			ret = -EBADR;
 			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n",
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno);
 			return ret;
 		}
 
@@ -588,12 +622,12 @@ static int ocfs2_extent_map_insert(struct inode *inode,
  * Existing record in the extent map:
  *
  *	cpos = 10, len = 10
- * 	|---------|
+ *	|---------|
  *
  * New Record:
  *
  *	cpos = 10, len = 20
- * 	|------------------|
+ *	|------------------|
  *
  * The passed record is the new on-disk record.  The new_clusters value
  * is how many clusters were added to the file.  If the append is a
-- 
cgit v1.2.2


From b7668c72d2ae004363fb0588600bfa942e1b245c Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 28 Feb 2006 23:28:01 -0800
Subject: [PATCH] ocfs2: added source addr to bind() in o2net_start_connect()

to prevent confusion when a virtual ip is created on the same interface

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d22d4cf08db1..0f60cc0d3985 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1318,7 +1318,7 @@ static void o2net_start_connect(void *arg)
 {
 	struct o2net_node *nn = arg;
 	struct o2net_sock_container *sc = NULL;
-	struct o2nm_node *node = NULL;
+	struct o2nm_node *node = NULL, *mynode = NULL;
 	struct socket *sock = NULL;
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0;
@@ -1334,6 +1334,12 @@ static void o2net_start_connect(void *arg)
 		goto out;
 	}
 
+	mynode = o2nm_get_node_by_num(o2nm_this_node());
+	if (mynode == NULL) {
+		ret = 0;
+		goto out;
+	}
+
 	spin_lock(&nn->nn_lock);
 	/* see if we already have one pending or have given up */
 	if (nn->nn_sc || nn->nn_persistent_error)
@@ -1361,12 +1367,14 @@ static void o2net_start_connect(void *arg)
 	sock->sk->sk_allocation = GFP_ATOMIC;
 
 	myaddr.sin_family = AF_INET;
+	myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
 	myaddr.sin_port = (__force u16)htons(0); /* any port */
 
 	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
 			      sizeof(myaddr));
 	if (ret) {
-		mlog(0, "bind failed: %d\n", ret);
+		mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n",
+		     ret, NIPQUAD(mynode->nd_ipv4_address));
 		goto out;
 	}
 
@@ -1407,6 +1415,8 @@ out:
 		sc_put(sc);
 	if (node)
 		o2nm_node_put(node);
+	if (mynode)
+		o2nm_node_put(mynode);
 
 	return;
 }
-- 
cgit v1.2.2


From 81f2094a631df1ba275f4d4bd7ea5bacfd8dbcfc Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 28 Feb 2006 17:31:22 -0800
Subject: [PATCH] ocfs2: use hlists for lockres hash

Switch from list_head to hlist_head. Make the size of the hash dependent
upon the allocated area, rather than a constant.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  8 +++-----
 fs/ocfs2/dlm/dlmdebug.c    | 12 +++++-------
 fs/ocfs2/dlm/dlmdomain.c   | 39 +++++++++++++++++++--------------------
 fs/ocfs2/dlm/dlmmaster.c   |  4 ++--
 fs/ocfs2/dlm/dlmrecovery.c | 23 ++++++++++++-----------
 5 files changed, 41 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 23ceaa7127b4..9c772583744a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,9 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_BITS     7
-#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
-#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+#define DLM_HASH_BUCKETS     (PAGE_SIZE / sizeof(struct hlist_head))
 
 enum dlm_ast_type {
 	DLM_AST = 0,
@@ -87,7 +85,7 @@ enum dlm_ctxt_state {
 struct dlm_ctxt
 {
 	struct list_head list;
-	struct list_head *resources;
+	struct hlist_head *lockres_hash;
 	struct list_head dirty_list;
 	struct list_head purge_list;
 	struct list_head pending_asts;
@@ -217,7 +215,7 @@ struct dlm_lock_resource
 {
 	/* WARNING: Please see the comment in dlm_init_lockres before
 	 * adding fields here. */
-	struct list_head list;
+	struct hlist_node hash_node;
 	struct kref      refs;
 
 	/* please keep these next 3 in this order
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index f339fe27975a..54f61b76ab51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
 	struct dlm_lock_resource *res;
-	struct list_head *iter;
-	struct list_head *bucket;
+	struct hlist_node *iter;
+	struct hlist_head *bucket;
 	int i;
 
 	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
@@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 	}
 
 	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry(iter, struct dlm_lock_resource, list);
+	for (i=0; i<DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, iter, bucket, hash_node)
 			dlm_print_one_lock_resource(res);
-		}
 	}
 	spin_unlock(&dlm->spinlock);
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ee30837389c..8f3a9e3106fd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
 
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-	list_del_init(&lockres->list);
+	hlist_del_init(&lockres->hash_node);
 	dlm_lockres_put(lockres);
 }
 
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 		       struct dlm_lock_resource *res)
 {
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct qstr *q;
 
 	assert_spin_locked(&dlm->spinlock);
 
 	q = &res->lockname;
 	q->hash = full_name_hash(q->name, q->len);
-	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
 
 	/* get a reference for our hashtable */
 	dlm_lockres_get(res);
 
-	list_add_tail(&res->list, bucket);
+	hlist_add_head(&res->hash_node, bucket);
 }
 
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					 unsigned int len)
 {
 	unsigned int hash;
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *tmpres=NULL;
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 
 	mlog_entry("%.*s\n", len, name);
 
@@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 
 	hash = full_name_hash(name, len);
 
-	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
 
 	/* check for pre-existing lock */
-	list_for_each(iter, bucket) {
-		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+	hlist_for_each(iter, bucket) {
+		tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
 		if (tmpres->lockname.len == len &&
 		    memcmp(tmpres->lockname.name, name, len) == 0) {
 			dlm_lockres_get(tmpres);
@@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
 
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
-	if (dlm->resources)
-		free_page((unsigned long) dlm->resources);
+	if (dlm->lockres_hash)
+		free_page((unsigned long) dlm->lockres_hash);
 
 	if (dlm->name)
 		kfree(dlm->name);
@@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 	mlog(0, "Migrating locks from domain %s\n", dlm->name);
 restart:
 	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		while (!list_empty(&dlm->resources[i])) {
-			res = list_entry(dlm->resources[i].next,
-				     struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		while (!hlist_empty(&dlm->lockres_hash[i])) {
+			res = hlist_entry(dlm->lockres_hash[i].first,
+					  struct dlm_lock_resource, hash_node);
 			/* need reference when manually grabbing lockres */
 			dlm_lockres_get(res);
 			/* this should unhash the lockres
@@ -1191,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
-	if (!dlm->resources) {
+	dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->lockres_hash) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm->name);
 		kfree(dlm);
 		dlm = NULL;
 		goto leave;
 	}
-	memset(dlm->resources, 0, PAGE_SIZE);
 
-	for (i=0; i<DLM_HASH_SIZE; i++)
-		INIT_LIST_HEAD(&dlm->resources[i]);
+	for (i=0; i<DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
 
 	strcpy(dlm->name, domain);
 	dlm->key = key;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 2e2e95e69499..847dd3cc4cf5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref)
 
 	/* By the time we're ready to blow this guy away, we shouldn't
 	 * be on any lists. */
-	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!hlist_unhashed(&res->hash_node));
 	BUG_ON(!list_empty(&res->granted));
 	BUG_ON(!list_empty(&res->converting));
 	BUG_ON(!list_empty(&res->blocked));
@@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	init_waitqueue_head(&res->wq);
 	spin_lock_init(&res->spinlock);
-	INIT_LIST_HEAD(&res->list);
+	INIT_HLIST_NODE(&res->hash_node);
 	INIT_LIST_HEAD(&res->granted);
 	INIT_LIST_HEAD(&res->converting);
 	INIT_LIST_HEAD(&res->blocked);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ed76bda1a534..1e232000f3f7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1693,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct list_head *iter, *iter2, *bucket;
+	struct list_head *iter, *iter2;
+	struct hlist_node *hash_iter;
+	struct hlist_head *bucket;
+
 	struct dlm_lock_resource *res;
 
 	mlog_entry_void();
@@ -1717,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * for now we need to run the whole hash, clear
 	 * the RECOVERING state and set the owner
 	 * if necessary */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
 					mlog(0, "(this=%u) res %.*s owner=%u "
@@ -1852,10 +1854,10 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *res;
 	int i;
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct dlm_lock *lock;
 
 
@@ -1876,10 +1878,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 *    can be kicked again to see if any ASTs or BASTs
 	 *    need to be fired as a result.
 	 */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, iter, bucket, hash_node) {
  			/* always prune any $RECOVERY entries for dead nodes,
  			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
-- 
cgit v1.2.2


From 6a3124a3946c16159c3faf83e62ffdb5d1134b3a Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Thu, 2 Mar 2006 02:54:30 -0800
Subject: [PATCH] v9fs: fix atomic create open

In order to assure atomic create+open v9fs stores the open fid produced by
v9fs_vfs_create in the dentry, from where v9fs_file_open retrieves it and
associates it with the open file.

This patch modifies v9fs to use nameidata.intent.open values to do the atomic
create+open.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/fid.c       |  73 ++-------
 fs/9p/fid.h       |   5 +-
 fs/9p/v9fs_vfs.h  |   1 +
 fs/9p/vfs_file.c  | 106 +++++-------
 fs/9p/vfs_inode.c | 478 +++++++++++++++++++++++++++++++++++-------------------
 fs/9p/vfs_super.c |  12 +-
 6 files changed, 379 insertions(+), 296 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index eda449778fa5..c27f546dd25b 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -40,7 +40,7 @@
  *
  */
 
-static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
+int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
 {
 	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
 	dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid,
@@ -68,14 +68,11 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
  *
  */
 
-struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
-	struct v9fs_session_info *v9ses, int fid, int create)
+struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
 {
 	struct v9fs_fid *new;
 
-	dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n",
-		dentry, fid, create);
-
+	dprintk(DEBUG_9P, "fid create fid %d\n", fid);
 	new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
 	if (new == NULL) {
 		dprintk(DEBUG_ERROR, "Out of Memory\n");
@@ -85,19 +82,13 @@ struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
 	new->fid = fid;
 	new->v9ses = v9ses;
 	new->fidopen = 0;
-	new->fidcreate = create;
 	new->fidclunked = 0;
 	new->iounit = 0;
 	new->rdir_pos = 0;
 	new->rdir_fcall = NULL;
+	INIT_LIST_HEAD(&new->list);
 
-	if (v9fs_fid_insert(new, dentry) == 0)
 		return new;
-	else {
-		dprintk(DEBUG_ERROR, "Problems inserting to dentry\n");
-		kfree(new);
-		return NULL;
-	}
 }
 
 /**
@@ -119,7 +110,7 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
 static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
 {
 	int fidnum, cfidnum, err;
-	struct v9fs_fid *cfid;
+	struct v9fs_fid *cfid, *fid;
 	struct dentry *cde;
 	struct v9fs_session_info *v9ses;
 
@@ -158,7 +149,16 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
 		cde = cde->d_parent;
 	}
 
-	return v9fs_fid_create(dentry, v9ses, fidnum, 0);
+	fid = v9fs_fid_create(v9ses, fidnum);
+	if (fid) {
+		err = v9fs_fid_insert(fid, dentry);
+		if (err < 0) {
+			kfree(fid);
+			goto clunk_fid;
+		}
+	}
+
+	return fid;
 
 clunk_fid:
 	v9fs_t_clunk(v9ses, fidnum);
@@ -179,29 +179,12 @@ clunk_fid:
 struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
-	struct v9fs_fid *current_fid = NULL;
-	struct v9fs_fid *temp = NULL;
 	struct v9fs_fid *return_fid = NULL;
 
 	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
 
-	if (fid_list) {
-		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
-			if (!current_fid->fidcreate) {
-				return_fid = current_fid;
-				break;
-			}
-		}
-
-		if (!return_fid)
-			return_fid = current_fid;
-	}
-
-	/* we are at the root but didn't match */
-	if ((!return_fid) && (dentry->d_parent == dentry)) {
-		/* TODO: clone attach with new uid */
-		return_fid = current_fid;
-	}
+	if (fid_list)
+		return_fid = list_entry(fid_list->next, struct v9fs_fid, list);
 
 	if (!return_fid) {
 		struct dentry *par = current->fs->pwd->d_parent;
@@ -228,25 +211,3 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 
 	return return_fid;
 }
-
-struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry)
-{
-	struct list_head *fid_list;
-	struct v9fs_fid *fid, *ftmp, *ret;
-
-	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
-	fid_list = (struct list_head *)dentry->d_fsdata;
-	ret = NULL;
-	if (fid_list) {
-		list_for_each_entry_safe(fid, ftmp, fid_list, list) {
-			if (fid->fidcreate && fid->pid == current->pid) {
-				list_del(&fid->list);
-				ret = fid;
-				break;
-			}
-		}
-	}
-
-	dprintk(DEBUG_9P, "return %p\n", ret);
-	return ret;
-}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 84c673a44c83..7ccf0d064e25 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -33,7 +33,6 @@ struct v9fs_fid {
 
 	u32 fid;
 	unsigned char fidopen;	  /* set when fid is opened */
-	unsigned char fidcreate;  /* set when fid was just created */
 	unsigned char fidclunked; /* set when fid has already been clunked */
 
 	struct v9fs_qid qid;
@@ -56,5 +55,5 @@ struct v9fs_fid {
 struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct v9fs_fid *v9fs_fid_get_created(struct dentry *);
 void v9fs_fid_destroy(struct v9fs_fid *fid);
-struct v9fs_fid *v9fs_fid_create(struct dentry *,
-	struct v9fs_session_info *v9ses, int fid, int create);
+struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid);
+int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 69cf2905dc90..a759278acaae 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,3 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat);
 void v9fs_dentry_release(struct dentry *);
+int v9fs_uflags2omode(int uflags);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c7e14d917215..de3a129698da 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -53,94 +53,70 @@
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-	struct v9fs_fid *v9fid, *fid;
+	struct v9fs_fid *vfid;
 	struct v9fs_fcall *fcall = NULL;
-	int open_mode = 0;
-	unsigned int iounit = 0;
-	int newfid = -1;
-	long result = -1;
+	int omode;
+	int fid = V9FS_NOFID;
+	int err;
 
 	dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file);
 
-	v9fid = v9fs_fid_get_created(file->f_dentry);
-	if (!v9fid)
-		v9fid = v9fs_fid_lookup(file->f_dentry);
-
-	if (!v9fid) {
+	vfid = v9fs_fid_lookup(file->f_dentry);
+	if (!vfid) {
 		dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
 		return -EBADF;
 	}
 
-	if (!v9fid->fidcreate) {
-		fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
-		if (fid == NULL) {
-			dprintk(DEBUG_ERROR, "Out of Memory\n");
-			return -ENOMEM;
-		}
-
-		fid->fidopen = 0;
-		fid->fidcreate = 0;
-		fid->fidclunked = 0;
-		fid->iounit = 0;
-		fid->v9ses = v9ses;
-
-		newfid = v9fs_get_idpool(&v9ses->fidpool);
-		if (newfid < 0) {
+	fid = v9fs_get_idpool(&v9ses->fidpool);
+	if (fid < 0) {
 			eprintk(KERN_WARNING, "newfid fails!\n");
 			return -ENOSPC;
 		}
 
-		result =
-		    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL);
-
-		if (result < 0) {
-			v9fs_put_idpool(newfid, &v9ses->fidpool);
+	err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL);
+	if (err < 0) {
 			dprintk(DEBUG_ERROR, "rewalk didn't work\n");
-			return -EBADF;
+		goto put_fid;
+	}
+
+	vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+	if (vfid == NULL) {
+		dprintk(DEBUG_ERROR, "out of memory\n");
+		goto clunk_fid;
 		}
 
-		fid->fid = newfid;
-		v9fid = fid;
 		/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
 		/* translate open mode appropriately */
-		open_mode = file->f_flags & 0x3;
+	omode = v9fs_uflags2omode(file->f_flags);
+	err = v9fs_t_open(v9ses, fid, omode, &fcall);
+	if (err < 0) {
+		PRINT_FCALL_ERROR("open failed", fcall);
+		goto destroy_vfid;
+	}
 
-		if (file->f_flags & O_EXCL)
-			open_mode |= V9FS_OEXCL;
+	file->private_data = vfid;
+	vfid->fid = fid;
+	vfid->fidopen = 1;
+	vfid->fidclunked = 0;
+	vfid->iounit = fcall->params.ropen.iounit;
+	vfid->rdir_pos = 0;
+	vfid->rdir_fcall = NULL;
+	vfid->filp = file;
+	kfree(fcall);
 
-		if (v9ses->extended) {
-			if (file->f_flags & O_TRUNC)
-				open_mode |= V9FS_OTRUNC;
+	return 0;
 
-			if (file->f_flags & O_APPEND)
-				open_mode |= V9FS_OAPPEND;
-		}
+destroy_vfid:
+	v9fs_fid_destroy(vfid);
 
-		result = v9fs_t_open(v9ses, newfid, open_mode, &fcall);
-		if (result < 0) {
-			PRINT_FCALL_ERROR("open failed", fcall);
-			kfree(fcall);
-			return result;
-		}
+clunk_fid:
+	v9fs_t_clunk(v9ses, fid);
 
-		iounit = fcall->params.ropen.iounit;
+put_fid:
+	v9fs_put_idpool(fid, &v9ses->fidpool);
 		kfree(fcall);
-	} else {
-		/* create case */
-		newfid = v9fid->fid;
-		iounit = v9fid->iounit;
-		v9fid->fidcreate = 0;
-	}
-
-	file->private_data = v9fid;
-
-	v9fid->rdir_pos = 0;
-	v9fid->rdir_fcall = NULL;
-	v9fid->fidopen = 1;
-	v9fid->filp = file;
-	v9fid->iounit = iounit;
 
-	return 0;
+	return err;
 }
 
 /**
@@ -289,9 +265,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		total += result;
 	} while (count);
 
-	if(inode->i_mapping->nrpages)
 		invalidate_inode_pages2(inode->i_mapping);
-
 	return total;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 63e5b0398e8b..dce729d42869 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -125,6 +125,38 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 	return res;
 }
 
+int v9fs_uflags2omode(int uflags)
+{
+	int ret;
+
+	ret = 0;
+	switch (uflags&3) {
+	default:
+	case O_RDONLY:
+		ret = V9FS_OREAD;
+		break;
+
+	case O_WRONLY:
+		ret = V9FS_OWRITE;
+		break;
+
+	case O_RDWR:
+		ret = V9FS_ORDWR;
+		break;
+	}
+
+	if (uflags & O_EXCL)
+		ret |= V9FS_OEXCL;
+
+	if (uflags & O_TRUNC)
+		ret |= V9FS_OTRUNC;
+
+	if (uflags & O_APPEND)
+		ret |= V9FS_OAPPEND;
+
+	return ret;
+}
+
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
  * @v9ses: 9P session info (for determining extended mode)
@@ -163,7 +195,7 @@ v9fs_blank_wstat(struct v9fs_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
@@ -222,171 +254,135 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
-/**
- * v9fs_create - helper function to create files and directories
- * @dir: directory inode file is being created in
- * @file_dentry: dentry file is being created in
- * @perm: permissions file is being created with
- * @open_mode: resulting open mode for file
- *
- */
-
 static int
-v9fs_create(struct inode *dir,
-	    struct dentry *file_dentry,
-	    unsigned int perm, unsigned int open_mode)
+v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name,
+	u32 perm, u8 mode, u32 *fidp, struct v9fs_qid *qid, u32 *iounit)
 {
-	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
-	struct super_block *sb = dir->i_sb;
-	struct v9fs_fid *dirfid =
-	    v9fs_fid_lookup(file_dentry->d_parent);
-	struct v9fs_fid *fid = NULL;
-	struct inode *file_inode = NULL;
-	struct v9fs_fcall *fcall = NULL;
-	struct v9fs_qid qid;
-	int dirfidnum = -1;
-	long newfid = -1;
-	int result = 0;
-	unsigned int iounit = 0;
-	int wfidno = -1;
+	u32 fid;
 	int err;
+	struct v9fs_fcall *fcall;
 
-	perm = unixmode2p9mode(v9ses, perm);
-
-	dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir,
-		file_dentry, perm, open_mode);
-
-	if (!dirfid)
-		return -EBADF;
-
-	dirfidnum = dirfid->fid;
-	if (dirfidnum < 0) {
-		dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n",
-			dir->i_ino);
-		return -EBADF;
-	}
-
-	if (file_dentry->d_inode) {
-		dprintk(DEBUG_ERROR,
-			"Odd. There is an inode for dir %lu, name :%s:\n",
-			dir->i_ino, file_dentry->d_name.name);
-		return -EEXIST;
-	}
-
-	newfid = v9fs_get_idpool(&v9ses->fidpool);
-	if (newfid < 0) {
+	fid = v9fs_get_idpool(&v9ses->fidpool);
+	if (fid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		return -ENOSPC;
+		err = -ENOSPC;
+		goto error;
 	}
 
-	result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall);
-	if (result < 0) {
+	err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
+	if (err < 0) {
 		PRINT_FCALL_ERROR("clone error", fcall);
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
-		newfid = -1;
-		goto CleanUpFid;
+		goto error;
 	}
-
 	kfree(fcall);
-	fcall = NULL;
 
-	result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name,
-			       perm, open_mode, &fcall);
-	if (result < 0) {
+	err = v9fs_t_create(v9ses, fid, name, perm, mode, &fcall);
+	if (err < 0) {
 		PRINT_FCALL_ERROR("create fails", fcall);
-		goto CleanUpFid;
+		goto error;
 	}
 
-	iounit = fcall->params.rcreate.iounit;
-	qid = fcall->params.rcreate.qid;
+	if (iounit)
+		*iounit = fcall->params.rcreate.iounit;
+
+	if (qid)
+		*qid = fcall->params.rcreate.qid;
+
+	if (fidp)
+		*fidp = fid;
+
 	kfree(fcall);
-	fcall = NULL;
+	return 0;
 
-	if (!(perm&V9FS_DMDIR)) {
-		fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1);
-		dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate);
-		if (!fid) {
-			result = -ENOMEM;
-			goto CleanUpFid;
-		}
+error:
+	if (fid >= 0)
+		v9fs_put_idpool(fid, &v9ses->fidpool);
 
-		fid->qid = qid;
-		fid->iounit = iounit;
-	} else {
-		err = v9fs_t_clunk(v9ses, newfid);
-		newfid = -1;
-		if (err < 0)
-			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err);
-	}
+	kfree(fcall);
+	return err;
+}
+
+static struct v9fs_fid*
+v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
+{
+	int err;
+	u32 nfid;
+	struct v9fs_fid *ret;
+	struct v9fs_fcall *fcall;
 
-	/* walk to the newly created file and put the fid in the dentry */
-	wfidno = v9fs_get_idpool(&v9ses->fidpool);
-	if (wfidno < 0) {
+	nfid = v9fs_get_idpool(&v9ses->fidpool);
+	if (nfid < 0) {
 		eprintk(KERN_WARNING, "no free fids available\n");
-		return -ENOSPC;
+		err = -ENOSPC;
+		goto error;
 	}
 
-	result = v9fs_t_walk(v9ses, dirfidnum, wfidno,
-		(char *) file_dentry->d_name.name, &fcall);
-	if (result < 0) {
-		PRINT_FCALL_ERROR("clone error", fcall);
-		v9fs_put_idpool(wfidno, &v9ses->fidpool);
-		wfidno = -1;
-		goto CleanUpFid;
+	err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name,
+		&fcall);
+
+	if (err < 0) {
+		PRINT_FCALL_ERROR("walk error", fcall);
+		v9fs_put_idpool(nfid, &v9ses->fidpool);
+		goto error;
 	}
+
 	kfree(fcall);
 	fcall = NULL;
+	ret = v9fs_fid_create(v9ses, nfid);
+	if (!ret) {
+		err = -ENOMEM;
+		goto clunk_fid;
+	}
 
-	if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) {
-		v9fs_put_idpool(wfidno, &v9ses->fidpool);
-
-		goto CleanUpFid;
+	err = v9fs_fid_insert(ret, dentry);
+	if (err < 0) {
+		v9fs_fid_destroy(ret);
+		goto clunk_fid;
 	}
 
-	if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
-	    (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
-	    (perm & V9FS_DMDEVICE))
-		return 0;
+	return ret;
 
-	result = v9fs_t_stat(v9ses, wfidno, &fcall);
-	if (result < 0) {
-		PRINT_FCALL_ERROR("stat error", fcall);
-		goto CleanUpFid;
-	}
+clunk_fid:
+	v9fs_t_clunk(v9ses, nfid);
+
+error:
+	kfree(fcall);
+	return ERR_PTR(err);
+}
 
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid,
+	struct super_block *sb)
+{
+	int err, umode;
+	struct inode *ret;
+	struct v9fs_fcall *fcall;
 
-	file_inode = v9fs_get_inode(sb,
-		p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode));
+	ret = NULL;
+	err = v9fs_t_stat(v9ses, fid, &fcall);
+	if (err) {
+		PRINT_FCALL_ERROR("stat error", fcall);
+		goto error;
+	}
 
-	if ((!file_inode) || IS_ERR(file_inode)) {
-		dprintk(DEBUG_ERROR, "create inode failed\n");
-		result = -EBADF;
-		goto CleanUpFid;
+	umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode);
+	ret = v9fs_get_inode(sb, umode);
+	if (IS_ERR(ret)) {
+		err = PTR_ERR(ret);
+		ret = NULL;
+		goto error;
 	}
 
-	v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb);
+	v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb);
 	kfree(fcall);
-	fcall = NULL;
-	file_dentry->d_op = &v9fs_dentry_operations;
-	d_instantiate(file_dentry, file_inode);
-
-	return 0;
+	return ret;
 
-      CleanUpFid:
+error:
 	kfree(fcall);
-	fcall = NULL;
+	if (ret)
+		iput(ret);
 
-	if (newfid >= 0) {
- 		err = v9fs_t_clunk(v9ses, newfid);
- 		if (err < 0)
- 			dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-	}
-	if (wfidno >= 0) {
- 		err = v9fs_t_clunk(v9ses, wfidno);
- 		if (err < 0)
- 			dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-	}
-	return result;
+	return ERR_PTR(err);
 }
 
 /**
@@ -440,20 +436,97 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	return result;
 }
 
+static int
+v9fs_open_created(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
 /**
  * v9fs_vfs_create - VFS hook to create files
  * @inode: directory inode that is being deleted
  * @dentry:  dentry that is being deleted
- * @perm: create permissions
+ * @mode: create permissions
  * @nd: path information
  *
  */
 
 static int
-v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
+v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		struct nameidata *nd)
 {
-	return v9fs_create(inode, dentry, perm, O_RDWR);
+	int err;
+	u32 fid, perm, iounit;
+	int flags;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid, *ffid;
+	struct inode *inode;
+	struct v9fs_qid qid;
+	struct file *filp;
+
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode);
+
+	if (nd && nd->flags & LOOKUP_OPEN)
+		flags = nd->intent.open.flags - 1;
+	else
+		flags = O_RDWR;
+
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, v9fs_uflags2omode(flags), &fid, &qid, &iounit);
+
+	if (err)
+		goto error;
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
+	}
+
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+
+	if (nd && nd->flags & LOOKUP_OPEN) {
+		ffid = v9fs_fid_create(v9ses, fid);
+		if (!ffid)
+			return -ENOMEM;
+
+		filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+		if (IS_ERR(filp)) {
+			v9fs_fid_destroy(ffid);
+			return PTR_ERR(filp);
+		}
+
+		ffid->rdir_pos = 0;
+		ffid->rdir_fcall = NULL;
+		ffid->fidopen = 1;
+		ffid->iounit = iounit;
+		ffid->filp = filp;
+		filp->private_data = ffid;
+	}
+
+	return 0;
+
+error:
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	if (inode)
+		iput(inode);
+
+	return err;
 }
 
 /**
@@ -464,9 +537,57 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm,
  *
  */
 
-static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode)
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY);
+	int err;
+	u32 fid, perm;
+	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid;
+	struct inode *inode;
+
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
+
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, V9FS_OREAD, &fid, NULL, NULL);
+
+	if (err) {
+		dprintk(DEBUG_ERROR, "create error %d\n", err);
+		goto error;
+	}
+
+	err = v9fs_t_clunk(v9ses, fid);
+	if (err) {
+		dprintk(DEBUG_ERROR, "clunk error %d\n", err);
+		goto error;
+	}
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
+	}
+
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+	return 0;
+
+error:
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	return err;
 }
 
 /**
@@ -516,9 +637,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(-ENOSPC);
 	}
 
-	result =
-	    v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name,
-			NULL);
+	result = v9fs_t_walk(v9ses, dirfidnum, newfid,
+		(char *)dentry->d_name.name, NULL);
 	if (result < 0) {
 		v9fs_put_idpool(newfid, &v9ses->fidpool);
 		if (result == -ENOENT) {
@@ -551,13 +671,17 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid);
 
-	fid = v9fs_fid_create(dentry, v9ses, newfid, 0);
+	fid = v9fs_fid_create(v9ses, newfid);
 	if (fid == NULL) {
 		dprintk(DEBUG_ERROR, "couldn't insert\n");
 		result = -ENOMEM;
 		goto FreeFcall;
 	}
 
+	result = v9fs_fid_insert(fid, dentry);
+	if (result < 0)
+		goto FreeFcall;
+
 	fid->qid = fcall->params.rstat.stat.qid;
 
 	dentry->d_op = &v9fs_dentry_operations;
@@ -886,8 +1010,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	if (fcall->params.rstat.stat.extension.len+1 < buflen)
-		buflen = fcall->params.rstat.stat.extension.len + 1;
+	if (fcall->params.rstat.stat.extension.len < buflen)
+		buflen = fcall->params.rstat.stat.extension.len;
 
 	memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1);
 	buffer[buflen-1] = 0;
@@ -951,7 +1075,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
 	else {
-		len = v9fs_readlink(dentry, link, PATH_MAX);
+		len = v9fs_readlink(dentry, link, strlen(link));
 
 		if (len < 0) {
 			__putname(link);
@@ -983,53 +1107,75 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
 static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	int mode, const char *extension)
 {
-	int err, retval;
+	int err;
+	u32 fid, perm;
 	struct v9fs_session_info *v9ses;
+	struct v9fs_fid *dfid, *vfid;
+	struct inode *inode;
 	struct v9fs_fcall *fcall;
-	struct v9fs_fid *fid;
 	struct v9fs_wstat wstat;
 
-	v9ses = v9fs_inode2v9ses(dir);
-	retval = -EPERM;
 	fcall = NULL;
+	inode = NULL;
+	vfid = NULL;
+	v9ses = v9fs_inode2v9ses(dir);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
+	perm = unixmode2p9mode(v9ses, mode);
 
 	if (!v9ses->extended) {
 		dprintk(DEBUG_ERROR, "not extended\n");
-		goto free_mem;
+		return -EPERM;
 	}
 
-	/* issue a create */
-	retval = v9fs_create(dir, dentry, mode, 0);
-	if (retval != 0)
-		goto free_mem;
+	err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name,
+		perm, V9FS_OREAD, &fid, NULL, NULL);
 
-	fid = v9fs_fid_get_created(dentry);
-	if (!fid) {
-		dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
-		goto free_mem;
+	if (err)
+		goto error;
+
+	err = v9fs_t_clunk(v9ses, fid);
+	if (err)
+		goto error;
+
+	vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry);
+	if (IS_ERR(vfid)) {
+		err = PTR_ERR(vfid);
+		vfid = NULL;
+		goto error;
+	}
+
+	inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		inode = NULL;
+		goto error;
 	}
 
 	/* issue a Twstat */
 	v9fs_blank_wstat(&wstat);
 	wstat.muid = v9ses->name;
 	wstat.extension = (char *) extension;
-	retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall);
-	if (retval < 0) {
-		PRINT_FCALL_ERROR("wstat error", fcall);
-		goto free_mem;
-	}
-
-	err = v9fs_t_clunk(v9ses, fid->fid);
+	err = v9fs_t_wstat(v9ses, vfid->fid, &wstat, &fcall);
 	if (err < 0) {
-		dprintk(DEBUG_ERROR, "clunk failed: %d\n", err);
-		goto free_mem;
+		PRINT_FCALL_ERROR("wstat error", fcall);
+		goto error;
 	}
 
-	d_drop(dentry);		/* FID - will this also clunk? */
+	kfree(fcall);
+	dentry->d_op = &v9fs_dentry_operations;
+	d_instantiate(dentry, inode);
+	return 0;
 
-free_mem:
+error:
 	kfree(fcall);
-	return retval;
+	if (vfid)
+		v9fs_fid_destroy(vfid);
+
+	if (inode)
+		iput(inode);
+
+	return err;
+
 }
 
 /**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2c4fa75be025..0c85872be51a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -146,7 +146,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	inode->i_gid = gid;
 
 	root = d_alloc_root(inode);
-
 	if (!root) {
 		retval = -ENOMEM;
 		goto put_back_sb;
@@ -157,24 +156,27 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (stat_result < 0) {
 		dprintk(DEBUG_ERROR, "stat error\n");
+		kfree(fcall);
 		v9fs_t_clunk(v9ses, newfid);
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
 	} else {
 		/* Setup the Root Inode */
-		root_fid = v9fs_fid_create(root, v9ses, newfid, 0);
+		kfree(fcall);
+		root_fid = v9fs_fid_create(v9ses, newfid);
 		if (root_fid == NULL) {
 			retval = -ENOMEM;
 			goto put_back_sb;
 		}
 
+		retval = v9fs_fid_insert(root_fid, root);
+		if (retval < 0)
+			goto put_back_sb;
+
 		root_fid->qid = fcall->params.rstat.stat.qid;
 		root->d_inode->i_ino =
 		    v9fs_qid2ino(&fcall->params.rstat.stat.qid);
 		v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
 	}
 
-	kfree(fcall);
-
 	if (stat_result < 0) {
 		retval = stat_result;
 		goto put_back_sb;
-- 
cgit v1.2.2


From 74b8054c730785cd9db093e48f53337e521b6270 Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 2 Mar 2006 02:54:32 -0800
Subject: [PATCH] v9fs: fix bug in atomic create open fix

Lucho's atomic create+open fix had a bug in the super block initialization
causing all mounts to fail.  He was freeing an fcall too early.  This patch
fixes that oversight.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_super.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 0c85872be51a..cdf787ee08de 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -160,7 +160,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		v9fs_t_clunk(v9ses, newfid);
 	} else {
 		/* Setup the Root Inode */
-		kfree(fcall);
 		root_fid = v9fs_fid_create(v9ses, newfid);
 		if (root_fid == NULL) {
 			retval = -ENOMEM;
@@ -168,8 +167,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		}
 
 		retval = v9fs_fid_insert(root_fid, root);
-		if (retval < 0)
+		if (retval < 0) {
+			kfree(fcall);
 			goto put_back_sb;
+		}
 
 		root_fid->qid = fcall->params.rstat.stat.qid;
 		root->d_inode->i_ino =
@@ -177,6 +178,8 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb);
 	}
 
+	kfree(fcall);
+
 	if (stat_result < 0) {
 		retval = stat_result;
 		goto put_back_sb;
-- 
cgit v1.2.2


From 46f6dac259717551916405ee3388de89fb152bca Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@gmail.com>
Date: Thu, 2 Mar 2006 02:54:33 -0800
Subject: [PATCH] v9fs: simplify fid mapping

v9fs has been plagued by an over-complicated approach trying to map Linux
dentry semantics to Plan 9 fid semantics.  Our previous approach called for
aggressive flushing of the dcache resulting in several problems (including
wierd cwd behavior when running /bin/pwd).

This patch dramatically simplifies our handling of this fid management.  Fids
will not be clunked as promptly, but the new approach is more functionally
correct.  We now clunk un-open fids only when their dentry ref_count reaches 0
(and d_delete is called).

Another simplification is we no longer seek to match fids to the process-id or
uid of the action initiator.  The uid-matching will need to be revisited when
we fix the security model.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/fid.c        | 94 ++++--------------------------------------------------
 fs/9p/fid.h        |  1 -
 fs/9p/v9fs.c       |  1 +
 fs/9p/vfs_dentry.c | 45 +++++---------------------
 4 files changed, 15 insertions(+), 126 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index c27f546dd25b..c4d13bf904d2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -1,7 +1,7 @@
 /*
  * V9FS FID Management
  *
- *  Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -57,7 +57,6 @@ int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
 	}
 
 	fid->uid = current->uid;
-	fid->pid = current->pid;
 	list_add(&fid->list, fid_list);
 	return 0;
 }
@@ -88,7 +87,7 @@ struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid)
 	new->rdir_fcall = NULL;
 	INIT_LIST_HEAD(&new->list);
 
-		return new;
+	return new;
 }
 
 /**
@@ -103,76 +102,14 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
 	kfree(fid);
 }
 
-/**
- * v9fs_fid_walk_up - walks from the process current directory
- * 	up to the specified dentry.
- */
-static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
-{
-	int fidnum, cfidnum, err;
-	struct v9fs_fid *cfid, *fid;
-	struct dentry *cde;
-	struct v9fs_session_info *v9ses;
-
-	v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode);
-	cfid = v9fs_fid_lookup(current->fs->pwd);
-	if (cfid == NULL) {
-		dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n");
-		return ERR_PTR(-ENOENT);
-	}
-
-	cfidnum = cfid->fid;
-	cde = current->fs->pwd;
-	/* TODO: take advantage of multiwalk */
-
-	fidnum = v9fs_get_idpool(&v9ses->fidpool);
-	if (fidnum < 0) {
-		dprintk(DEBUG_ERROR, "could not get a new fid num\n");
-		err = -ENOENT;
-		goto clunk_fid;
-	}
-
-	while (cde != dentry) {
-		if (cde == cde->d_parent) {
-			dprintk(DEBUG_ERROR, "can't find dentry\n");
-			err = -ENOENT;
-			goto clunk_fid;
-		}
-
-		err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL);
-		if (err < 0) {
-			dprintk(DEBUG_ERROR, "problem walking to parent\n");
-			goto clunk_fid;
-		}
-
-		cfidnum = fidnum;
-		cde = cde->d_parent;
-	}
-
-	fid = v9fs_fid_create(v9ses, fidnum);
-	if (fid) {
-		err = v9fs_fid_insert(fid, dentry);
-		if (err < 0) {
-			kfree(fid);
-			goto clunk_fid;
-		}
-	}
-
-	return fid;
-
-clunk_fid:
-	v9fs_t_clunk(v9ses, fidnum);
-	return ERR_PTR(err);
-}
-
 /**
  * v9fs_fid_lookup - retrieve the right fid from a  particular dentry
  * @dentry: dentry to look for fid in
  * @type: intent of lookup (operation or traversal)
  *
- * search list of fids associated with a dentry for a fid with a matching
- * thread id or uid.  If that fails, look up the dentry's parents to see if you
- * can find a matching fid.
+ * find a fid in the dentry
+ *
+ * TODO: only match fids that have the same uid as current user
  *
  */
 
@@ -187,26 +124,7 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 		return_fid = list_entry(fid_list->next, struct v9fs_fid, list);
 
 	if (!return_fid) {
-		struct dentry *par = current->fs->pwd->d_parent;
-		int count = 1;
-		while (par != NULL) {
-			if (par == dentry)
-				break;
-			count++;
-			if (par == par->d_parent) {
-				dprintk(DEBUG_ERROR,
-					"got to root without finding dentry\n");
-				break;
-			}
-			par = par->d_parent;
-		}
-
-/* XXX - there may be some duplication we can get rid of */
-		if (par == dentry) {
-			return_fid = v9fs_fid_walk_up(dentry);
-			if (IS_ERR(return_fid))
-				return_fid = NULL;
-		}
+		dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n");
 	}
 
 	return return_fid;
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 7ccf0d064e25..1fc2dd08d75a 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -44,7 +44,6 @@ struct v9fs_fid {
 	struct v9fs_fcall *rdir_fcall;
 
 	/* management stuff */
-	pid_t pid;		/* thread associated with this fid */
 	uid_t uid;		/* user associated with this fid */
 
 	/* private data */
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef3386549140..61352491ba36 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -397,6 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 
 	if (v9ses->afid != ~0) {
+		dprintk(DEBUG_ERROR, "afid not equal to ~0\n");
 		if (v9fs_t_clunk(v9ses, v9ses->afid))
 			dprintk(DEBUG_ERROR, "clunk failed\n");
 	}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 2dd806dac9f1..12c9cc926b71 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,47 +43,18 @@
 #include "fid.h"
 
 /**
- * v9fs_dentry_validate - VFS dcache hook to validate cache
- * @dentry:  dentry that is being validated
- * @nd: path data
+ * v9fs_dentry_delete - called when dentry refcount equals 0
+ * @dentry:  dentry in question
  *
- * dcache really shouldn't be used for 9P2000 as at all due to
- * potential attached semantics to directory traversal (walk).
- *
- * FUTURE: look into how to use dcache to allow multi-stage
- * walks in Plan 9 & potential for better dcache operation which
- * would remain valid for Plan 9 semantics.  Older versions
- * had validation via stat for those interested.  However, since
- * stat has the same approximate overhead as walk there really
- * is no difference.  The only improvement would be from a
- * time-decay cache like NFS has and that undermines the
- * synchronous nature of 9P2000.
+ * By returning 1 here we should remove cacheing of unused
+ * dentry components.
  *
  */
 
-static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
+int v9fs_dentry_delete(struct dentry *dentry)
 {
-	struct dentry *dc = current->fs->pwd;
-
-	dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
-	if (v9fs_fid_lookup(dentry)) {
-		dprintk(DEBUG_VFS, "VALID\n");
-		return 1;
-	}
-
-	while (dc != NULL) {
-		if (dc == dentry) {
-			dprintk(DEBUG_VFS, "VALID\n");
-			return 1;
-		}
-		if (dc == dc->d_parent)
-			break;
-
-		dc = dc->d_parent;
-	}
-
-	dprintk(DEBUG_VFS, "INVALID\n");
-	return 0;
+	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	return 1;
 }
 
 /**
@@ -118,6 +89,6 @@ void v9fs_dentry_release(struct dentry *dentry)
 }
 
 struct dentry_operations v9fs_dentry_operations = {
-	.d_revalidate = v9fs_dentry_validate,
+	.d_delete = v9fs_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
-- 
cgit v1.2.2


From c499ec24c31edf270e777a868ffd0daddcfe7ebd Mon Sep 17 00:00:00 2001
From: "Vladimir V. Saveliev" <vs@namesys.com>
Date: Thu, 2 Mar 2006 02:54:39 -0800
Subject: [PATCH] reiserfs: do not check if unsigned < 0

This patch fixes bugs in reiserfs where unsigned integers were checked
whether they are less then 0.

Signed-off-by: Vladimir V. Saveliev <vs@namesys.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Hans Reiser <reiser@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c    | 14 +++++++-------
 fs/reiserfs/inode.c   |  8 ++------
 fs/reiserfs/journal.c |  3 +--
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index f3473176c83a..be12879bb179 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1464,13 +1464,11 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 		   partially overwritten pages, if needed. And lock the pages,
 		   so that nobody else can access these until we are done.
 		   We get number of actual blocks needed as a result. */
-		blocks_to_allocate =
-		    reiserfs_prepare_file_region_for_write(inode, pos,
-							   num_pages,
-							   write_bytes,
-							   prepared_pages);
-		if (blocks_to_allocate < 0) {
-			res = blocks_to_allocate;
+		res = reiserfs_prepare_file_region_for_write(inode, pos,
+							     num_pages,
+							     write_bytes,
+							     prepared_pages);
+		if (res < 0) {
 			reiserfs_release_claimed_blocks(inode->i_sb,
 							num_pages <<
 							(PAGE_CACHE_SHIFT -
@@ -1478,6 +1476,8 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 			break;
 		}
 
+		blocks_to_allocate = res;
+
 		/* First we correct our estimate of how many blocks we need */
 		reiserfs_release_claimed_blocks(inode->i_sb,
 						(num_pages <<
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index b33d67bba2fd..d60f6238c66a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -627,11 +627,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 	reiserfs_write_lock(inode->i_sb);
 	version = get_inode_item_key_version(inode);
 
-	if (block < 0) {
-		reiserfs_write_unlock(inode->i_sb);
-		return -EIO;
-	}
-
 	if (!file_capable(inode, block)) {
 		reiserfs_write_unlock(inode->i_sb);
 		return -EFBIG;
@@ -934,12 +929,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
 				     //pos_in_item * inode->i_sb->s_blocksize,
 				     TYPE_INDIRECT, 3);	// key type is unimportant
 
+			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
+			       "green-805: invalid offset");
 			blocks_needed =
 			    1 +
 			    ((cpu_key_k_offset(&key) -
 			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
 			     s_blocksize_bits);
-			RFALSE(blocks_needed < 0, "green-805: invalid offset");
 
 			if (blocks_needed == 1) {
 				un = &unf_single;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b7a179560ab4..5a9d2722fa0a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2319,8 +2319,7 @@ static int journal_read(struct super_block *p_s_sb)
 		return 1;
 	}
 	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
-	if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 &&
-	    le32_to_cpu(jh->j_first_unflushed_offset) <
+	if (le32_to_cpu(jh->j_first_unflushed_offset) <
 	    SB_ONDISK_JOURNAL_SIZE(p_s_sb)
 	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
 		oldest_start =
-- 
cgit v1.2.2