From 6f80dfe55fcbde1e588bc2cfa1273403b710872d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Sun, 7 Nov 2010 23:01:17 +0100
Subject: hfsplus: fix option parsing during remount

hfsplus only actually uses the force option during remount, but it uses
the full option parser with a fake superblock to do so.  This means remount
will fail if any nls option is set (which happens frequently with older
mount tools), even if it is the same.

Fix this by adding a simpler version of the parser that only parses the force
option for remount.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h |  1 +
 fs/hfsplus/options.c    | 26 ++++++++++++++++++++++++++
 fs/hfsplus/super.c      |  8 +++-----
 3 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index cb3653efb57a..f07aa640c27d 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -362,6 +362,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* options.c */
 int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
+int hfsplus_parse_options_remount(char *input, int *force);
 void hfsplus_fill_defaults(struct hfsplus_sb_info *);
 int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index f9ab276a4d8d..43b02b5525eb 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -65,6 +65,32 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
 	return 0;
 }
 
+int hfsplus_parse_options_remount(char *input, int *force)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+
+	if (!input)
+		return 0;
+
+	while ((p = strsep(&input, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case opt_force:
+			*force = 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return 1;
+}
+
 /* Parse options from mount. Returns 0 on failure */
 /* input is the options passed to mount() as a string */
 int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 52cc746d3ba3..1c356a2fba01 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -263,11 +263,9 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
-		struct hfsplus_sb_info sbi;
+		int force = 0;
 
-		memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-		sbi.nls = HFSPLUS_SB(sb)->nls;
-		if (!hfsplus_parse_options(data, &sbi))
+		if (!hfsplus_parse_options_remount(data, &force))
 			return -EINVAL;
 
 		if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
@@ -275,7 +273,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			       "running fsck.hfsplus is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
-		} else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
+		} else if (force) {
 			/* nothing */
 		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
-- 
cgit v1.2.2


From 23308ba54dcdb54481163bfb07dd8aeca76a7a2e Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 4 Nov 2010 16:20:24 +0100
Subject: console: add /proc/consoles

It allows users to see what consoles are currently known to the system
and with what flags.

It is based on Werner's patch, the part about traversing fds was
removed, the code was moved to kernel/printk.c, where consoles are
handled and it makes more sense to me.

Signed-off-by: Jiri Slaby <jslaby@suse.cz> [cleanups]
Signed-off-by: "Dr. Werner Fink" <werner@suse.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/proc/Makefile       |   1 +
 fs/proc/proc_console.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 fs/proc/proc_console.c

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc518..288a49e098bf 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -15,6 +15,7 @@ proc-y	+= devices.o
 proc-y	+= interrupts.o
 proc-y	+= loadavg.o
 proc-y	+= meminfo.o
+proc-y	+= proc_console.o
 proc-y	+= stat.o
 proc-y	+= uptime.o
 proc-y	+= version.o
diff --git a/fs/proc/proc_console.c b/fs/proc/proc_console.c
new file mode 100644
index 000000000000..8a707609f528
--- /dev/null
+++ b/fs/proc/proc_console.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ *
+ * Licensed under GPLv2
+ */
+
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+
+/*
+ * This is handler for /proc/consoles
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+	static const struct {
+		short flag;
+		char name;
+	} con_flags[] = {
+		{ CON_ENABLED,		'E' },
+		{ CON_CONSDEV,		'C' },
+		{ CON_BOOT,		'B' },
+		{ CON_PRINTBUFFER,	'p' },
+		{ CON_BRL,		'b' },
+		{ CON_ANYTIME,		'a' },
+	};
+	char flags[ARRAY_SIZE(con_flags) + 1];
+	struct console *con = v;
+	unsigned int a;
+	int len;
+	dev_t dev = 0;
+
+	if (con->device) {
+		const struct tty_driver *driver;
+		int index;
+		driver = con->device(con, &index);
+		if (driver) {
+			dev = MKDEV(driver->major, driver->minor_start);
+			dev += index;
+		}
+	}
+
+	for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+		flags[a] = (con->flags & con_flags[a].flag) ?
+			con_flags[a].name : ' ';
+	flags[a] = 0;
+
+	seq_printf(m, "%s%d%n", con->name, con->index, &len);
+	len = 21 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+			con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+			flags);
+	if (dev)
+		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+
+	seq_printf(m, "\n");
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	struct console *con;
+	loff_t off = 0;
+
+	acquire_console_sem();
+	for_each_console(con)
+		if (off++ == *pos)
+			break;
+
+	return con;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct console *con = v;
+	++*pos;
+	return con->next;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+	release_console_sem();
+}
+
+static const struct seq_operations consoles_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= show_console_dev
+};
+
+static int consoles_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &consoles_op);
+}
+
+static const struct file_operations proc_consoles_operations = {
+	.open		= consoles_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int register_proc_consoles(void)
+{
+	proc_create("consoles", 0, NULL, &proc_consoles_operations);
+	return 0;
+}
+module_init(register_proc_consoles);
-- 
cgit v1.2.2


From 6d1bbfc4c0458c514126ccf7d6ce9232d9dbc872 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:37:40 +0100
Subject: hfsplus: silence a few debug printks

Turn a few noisy debug printks that show up during xfstests into
complied out debug print statements.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bnode.c | 2 +-
 fs/hfsplus/brec.c  | 2 +-
 fs/hfsplus/btree.c | 2 +-
 fs/hfsplus/super.c | 2 --
 4 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 29da6574ba77..c8aa1659b838 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -358,7 +358,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 
 	// move down?
 	if (!node->prev && !node->next) {
-		printk(KERN_DEBUG "hfs_btree_del_level\n");
+		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
 	}
 	if (!node->parent) {
 		tree->root = 0;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2f39d05443e1..81f7e6e51d5a 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -375,7 +375,7 @@ again:
 		end_off = hfs_bnode_read_u16(parent, end_rec_off);
 		if (end_rec_off - end_off < diff) {
 
-			printk(KERN_DEBUG "hfs: splitting index node...\n");
+			dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
 			fd->bnode = parent;
 			new_node = hfs_bnode_split(fd);
 			if (IS_ERR(new_node))
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 22e4d4e32999..97556f932d57 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -287,7 +287,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		kunmap(*pagep);
 		nidx = node->next;
 		if (!nidx) {
-			printk(KERN_DEBUG "hfs: create new bmap node...\n");
+			dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
 			next_node = hfs_bmap_new_bmap(node, idx);
 		} else
 			next_node = hfs_bnode_find(tree, nidx);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1c356a2fba01..9bda9fd35511 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -451,8 +451,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sync_dirty_buffer(sbi->s_vhbh);
 
 	if (!sbi->hidden_dir) {
-		printk(KERN_DEBUG "hfs: create hidden dir...\n");
-
 		mutex_lock(&sbi->vh_mutex);
 		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
 		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
-- 
cgit v1.2.2


From 3b5ce8ae31e3c66655207907527476bbd3e5063b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:37:43 +0100
Subject: hfsplus: always use hfsplus_sync_fs to write the volume header

Remove opencoded writing of the volume header in hfsplus_fill_super
and hfsplus_put_super and offload it to hfsplus_sync_fs.  In the
put_super case this means we only write the superblock once instead
of twice.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9bda9fd35511..6a2349058618 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -215,16 +215,14 @@ static void hfsplus_put_super(struct super_block *sb)
 	if (!sb->s_fs_info)
 		return;
 
-	if (sb->s_dirt)
-		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
 		struct hfsplus_vh *vhdr = sbi->s_vhdr;
 
 		vhdr->modify_date = hfsp_now2mt();
 		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
 		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-		mark_buffer_dirty(sbi->s_vhbh);
-		sync_dirty_buffer(sbi->s_vhbh);
+
+		hfsplus_sync_fs(sb, 1);
 	}
 
 	hfs_btree_close(sbi->cat_tree);
@@ -447,8 +445,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	be32_add_cpu(&vhdr->write_count, 1);
 	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
 	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	mark_buffer_dirty(sbi->s_vhbh);
-	sync_dirty_buffer(sbi->s_vhbh);
+	hfsplus_sync_fs(sb, 1);
 
 	if (!sbi->hidden_dir) {
 		mutex_lock(&sbi->vh_mutex);
-- 
cgit v1.2.2


From 52399b171dfaea02b6944cd6feba49b624147126 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:37:47 +0100
Subject: hfsplus: use raw bio access for the volume headers

The hfsplus backup volume header is located two blocks from the end of
the device.  In case of device sizes that are not 4k aligned this means
we can't access it using buffer_heads when using the default 4k block
size.

Switch to using raw bios to read/write all buffer headers.  We were not
relying on any caching behaviour of the buffer heads anyway.  Additionally
always read in the backup volume header during mount to verify that we
can actually read it.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h |   8 ++-
 fs/hfsplus/super.c      |  48 +++++++-------
 fs/hfsplus/wrapper.c    | 163 ++++++++++++++++++++++++++++++------------------
 3 files changed, 131 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f07aa640c27d..276ddb0fd0fd 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -107,8 +107,8 @@ struct hfsplus_vh;
 struct hfs_btree;
 
 struct hfsplus_sb_info {
-	struct buffer_head *s_vhbh;
 	struct hfsplus_vh *s_vhdr;
+	struct hfsplus_vh *s_backup_vhdr;
 	struct hfs_btree *ext_tree;
 	struct hfs_btree *cat_tree;
 	struct hfs_btree *attr_tree;
@@ -118,7 +118,8 @@ struct hfsplus_sb_info {
 
 	/* Runtime variables */
 	u32 blockoffset;
-	u32 sect_count;
+	sector_t part_start;
+	sector_t sect_count;
 	int fs_shift;
 
 	/* immutable data from the volume header */
@@ -385,8 +386,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 
 /* wrapper.c */
 int hfsplus_read_wrapper(struct super_block *);
-
 int hfs_part_find(struct super_block *, sector_t *, sector_t *);
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
+		void *data, int rw);
 
 /* access macros */
 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 6a2349058618..fe8f7bffbea5 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -157,45 +157,40 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct hfsplus_vh *vhdr = sbi->s_vhdr;
+	int write_backup = 0;
+	int error, error2;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
-	mutex_lock(&sbi->vh_mutex);
-	mutex_lock(&sbi->alloc_mutex);
 	sb->s_dirt = 0;
 
+	mutex_lock(&sbi->vh_mutex);
+	mutex_lock(&sbi->alloc_mutex);
 	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
 	vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
 	vhdr->folder_count = cpu_to_be32(sbi->folder_count);
 	vhdr->file_count = cpu_to_be32(sbi->file_count);
 
-	mark_buffer_dirty(sbi->s_vhbh);
 	if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
-		if (sbi->sect_count) {
-			struct buffer_head *bh;
-			u32 block, offset;
-
-			block = sbi->blockoffset;
-			block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
-			offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
-			printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
-					  sbi->blockoffset, sbi->sect_count,
-					  block, offset);
-			bh = sb_bread(sb, block);
-			if (bh) {
-				vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
-				if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-					memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
-					mark_buffer_dirty(bh);
-					brelse(bh);
-				} else
-					printk(KERN_WARNING "hfs: backup not found!\n");
-			}
-		}
+		memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr));
+		write_backup = 1;
 	}
+
+	error = hfsplus_submit_bio(sb->s_bdev,
+				   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
+				   sbi->s_vhdr, WRITE_SYNC);
+	if (!write_backup)
+		goto out;
+
+	error2 = hfsplus_submit_bio(sb->s_bdev,
+				  sbi->part_start + sbi->sect_count - 2,
+				  sbi->s_backup_vhdr, WRITE_SYNC);
+	if (!error)
+		error2 = error;
+out:
 	mutex_unlock(&sbi->alloc_mutex);
 	mutex_unlock(&sbi->vh_mutex);
-	return 0;
+	return error;
 }
 
 static void hfsplus_write_super(struct super_block *sb)
@@ -229,7 +224,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	hfs_btree_close(sbi->ext_tree);
 	iput(sbi->alloc_file);
 	iput(sbi->hidden_dir);
-	brelse(sbi->s_vhbh);
+	kfree(sbi->s_vhdr);
+	kfree(sbi->s_backup_vhdr);
 	unload_nls(sbi->nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 8972c20b3216..15e0eabb489e 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,6 +24,40 @@ struct hfsplus_wd {
 	u16 embed_count;
 };
 
+static void hfsplus_end_io_sync(struct bio *bio, int err)
+{
+	if (err)
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	complete(bio->bi_private);
+}
+
+int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
+		void *data, int rw)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_end_io = hfsplus_end_io_sync;
+	bio->bi_private = &wait;
+
+	/*
+	 * We always submit one sector at a time, so bio_add_page must not fail.
+	 */
+	if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE,
+			 offset_in_page(data)) != HFSPLUS_SECTOR_SIZE)
+		BUG();
+
+	submit_bio(rw, bio);
+	wait_for_completion(&wait);
+
+	if (!bio_flagged(bio, BIO_UPTODATE))
+		return -EIO;
+	return 0;
+}
+
 static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
 {
 	u32 extent;
@@ -88,100 +122,111 @@ static int hfsplus_get_last_session(struct super_block *sb,
 int hfsplus_read_wrapper(struct super_block *sb)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
-	struct buffer_head *bh;
-	struct hfsplus_vh *vhdr;
 	struct hfsplus_wd wd;
 	sector_t part_start, part_size;
 	u32 blocksize;
+	int error = 0;
 
+	error = -EINVAL;
 	blocksize = sb_min_blocksize(sb, HFSPLUS_SECTOR_SIZE);
 	if (!blocksize)
-		return -EINVAL;
+		goto out;
 
 	if (hfsplus_get_last_session(sb, &part_start, &part_size))
-		return -EINVAL;
+		goto out;
 	if ((u64)part_start + part_size > 0x100000000ULL) {
 		pr_err("hfs: volumes larger than 2TB are not supported yet\n");
-		return -EINVAL;
+		goto out;
 	}
-	while (1) {
-		bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-		if (!bh)
-			return -EIO;
-
-		if (vhdr->signature == cpu_to_be16(HFSP_WRAP_MAGIC)) {
-			if (!hfsplus_read_mdb(vhdr, &wd))
-				goto error;
-			wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
-			part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
-			part_size = wd.embed_count * wd.ablk_size;
-			brelse(bh);
-			bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-			if (!bh)
-				return -EIO;
-		}
-		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-			break;
-		if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-			set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
-			break;
-		}
-		brelse(bh);
 
-		/* check for a partition block
+	error = -ENOMEM;
+	sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+	if (!sbi->s_vhdr)
+		goto out;
+	sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+	if (!sbi->s_backup_vhdr)
+		goto out_free_vhdr;
+
+reread:
+	error = hfsplus_submit_bio(sb->s_bdev,
+				   part_start + HFSPLUS_VOLHEAD_SECTOR,
+				   sbi->s_vhdr, READ);
+	if (error)
+		goto out_free_backup_vhdr;
+
+	error = -EINVAL;
+	switch (sbi->s_vhdr->signature) {
+	case cpu_to_be16(HFSPLUS_VOLHEAD_SIGX):
+		set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
+		/*FALLTHRU*/
+	case cpu_to_be16(HFSPLUS_VOLHEAD_SIG):
+		break;
+	case cpu_to_be16(HFSP_WRAP_MAGIC):
+		if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
+			goto out;
+		wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
+		part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
+		part_size = wd.embed_count * wd.ablk_size;
+		goto reread;
+	default:
+		/*
+		 * Check for a partition block.
+		 *
 		 * (should do this only for cdrom/loop though)
 		 */
 		if (hfs_part_find(sb, &part_start, &part_size))
-			return -EINVAL;
+			goto out;
+		goto reread;
+	}
+
+	error = hfsplus_submit_bio(sb->s_bdev,
+				   part_start + part_size - 2,
+				   sbi->s_backup_vhdr, READ);
+	if (error)
+		goto out_free_backup_vhdr;
+
+	error = -EINVAL;
+	if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
+		printk(KERN_WARNING
+			"hfs: invalid secondary volume header\n");
+		goto out_free_backup_vhdr;
 	}
 
-	blocksize = be32_to_cpu(vhdr->blocksize);
-	brelse(bh);
+	blocksize = be32_to_cpu(sbi->s_vhdr->blocksize);
 
-	/* block size must be at least as large as a sector
-	 * and a multiple of 2
+	/*
+	 * Block size must be at least as large as a sector and a multiple of 2.
 	 */
-	if (blocksize < HFSPLUS_SECTOR_SIZE ||
-	    ((blocksize - 1) & blocksize))
-		return -EINVAL;
+	if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
+		goto out_free_backup_vhdr;
 	sbi->alloc_blksz = blocksize;
 	sbi->alloc_blksz_shift = 0;
 	while ((blocksize >>= 1) != 0)
 		sbi->alloc_blksz_shift++;
 	blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
 
-	/* align block size to block offset */
+	/*
+	 * Align block size to block offset.
+	 */
 	while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
 		blocksize >>= 1;
 
 	if (sb_set_blocksize(sb, blocksize) != blocksize) {
 		printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
-		return -EINVAL;
+		goto out_free_backup_vhdr;
 	}
 
 	sbi->blockoffset =
 		part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+	sbi->part_start = part_start;
 	sbi->sect_count = part_size;
 	sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
-
-	bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
-	if (!bh)
-		return -EIO;
-
-	/* should still be the same... */
-	if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
-		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
-			goto error;
-	} else {
-		if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
-			goto error;
-	}
-
-	sbi->s_vhbh = bh;
-	sbi->s_vhdr = vhdr;
-
 	return 0;
- error:
-	brelse(bh);
-	return -EINVAL;
+
+out_free_backup_vhdr:
+	kfree(sbi->s_backup_vhdr);
+out_free_vhdr:
+	kfree(sbi->s_vhdr);
+out:
+	return error;
 }
-- 
cgit v1.2.2


From 358f26d52680cb150907302d4334359de7dd2d59 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:37:51 +0100
Subject: hfsplus: use raw bio access for partition tables

Switch the hfsplus partition table reding for cdroms to use our bio
helpers.  Again we don't rely on any caching in the buffer_heads, and
this gets rid of the last buffer_head use in hfsplus.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h |  17 -------
 fs/hfsplus/part_tbl.c   | 124 +++++++++++++++++++++++++++---------------------
 2 files changed, 71 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 276ddb0fd0fd..0d8532d96694 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -401,23 +401,6 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
 	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
 }
 
-#define sb_bread512(sb, sec, data) ({			\
-	struct buffer_head *__bh;			\
-	sector_t __block;				\
-	loff_t __start;					\
-	int __offset;					\
-							\
-	__start = (loff_t)(sec) << HFSPLUS_SECTOR_SHIFT;\
-	__block = __start >> (sb)->s_blocksize_bits;	\
-	__offset = __start & ((sb)->s_blocksize - 1);	\
-	__bh = sb_bread((sb), __block);			\
-	if (likely(__bh != NULL))			\
-		data = (void *)(__bh->b_data + __offset);\
-	else						\
-		data = NULL;				\
-	__bh;						\
-})
-
 /* time macros */
 #define __hfsp_mt2ut(t)		(be32_to_cpu(t) - 2082844800U)
 #define __hfsp_ut2mt(t)		(cpu_to_be32(t + 2082844800U))
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 208b16c645cc..58918222a604 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -13,6 +13,7 @@
  *
  */
 
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 
 /* offsets to various blocks */
@@ -65,70 +66,87 @@ struct old_pmap {
 	}	pdEntry[42];
 } __packed;
 
+static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm,
+		sector_t *part_start, sector_t *part_size)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	int i;
+
+	for (i = 0; i < 42; i++) {
+		struct old_pmap_entry *p = &pm->pdEntry[i];
+
+		if (p->pdStart && p->pdSize &&
+		    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
+		    (sbi->part < 0 || sbi->part == i)) {
+			*part_start += be32_to_cpu(p->pdStart);
+			*part_size = be32_to_cpu(p->pdSize);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
+		sector_t *part_start, sector_t *part_size)
+{
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+	int size = be32_to_cpu(pm->pmMapBlkCnt);
+	int res;
+	int i = 0;
+
+	do {
+		if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
+		    (sbi->part < 0 || sbi->part == i)) {
+			*part_start += be32_to_cpu(pm->pmPyPartStart);
+			*part_size = be32_to_cpu(pm->pmPartBlkCnt);
+			return 0;
+		}
+
+		if (++i >= size)
+			return -ENOENT;
+
+		res = hfsplus_submit_bio(sb->s_bdev,
+					 *part_start + HFS_PMAP_BLK + i,
+					 pm, READ);
+		if (res)
+			return res;
+	} while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC));
+
+	return -ENOENT;
+}
+
 /*
- * hfs_part_find()
- *
- * Parse the partition map looking for the
- * start and length of the 'part'th HFS partition.
+ * Parse the partition map looking for the start and length of a
+ * HFS/HFS+ partition.
  */
 int hfs_part_find(struct super_block *sb,
-		  sector_t *part_start, sector_t *part_size)
+		sector_t *part_start, sector_t *part_size)
 {
-	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
-	struct buffer_head *bh;
-	__be16 *data;
-	int i, size, res;
+	void *data;
+	int res;
+
+	data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
 
-	res = -ENOENT;
-	bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data);
-	if (!bh)
-		return -EIO;
+	res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
+				 data, READ);
+	if (res)
+		return res;
 
-	switch (be16_to_cpu(*data)) {
+	switch (be16_to_cpu(*((__be16 *)data))) {
 	case HFS_OLD_PMAP_MAGIC:
-	  {
-		struct old_pmap *pm;
-		struct old_pmap_entry *p;
-
-		pm = (struct old_pmap *)bh->b_data;
-		p = pm->pdEntry;
-		size = 42;
-		for (i = 0; i < size; p++, i++) {
-			if (p->pdStart && p->pdSize &&
-			    p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-			    (sbi->part < 0 || sbi->part == i)) {
-				*part_start += be32_to_cpu(p->pdStart);
-				*part_size = be32_to_cpu(p->pdSize);
-				res = 0;
-			}
-		}
+		res = hfs_parse_old_pmap(sb, data, part_start, part_size);
 		break;
-	  }
 	case HFS_NEW_PMAP_MAGIC:
-	  {
-		struct new_pmap *pm;
-
-		pm = (struct new_pmap *)bh->b_data;
-		size = be32_to_cpu(pm->pmMapBlkCnt);
-		for (i = 0; i < size;) {
-			if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-			    (sbi->part < 0 || sbi->part == i)) {
-				*part_start += be32_to_cpu(pm->pmPyPartStart);
-				*part_size = be32_to_cpu(pm->pmPartBlkCnt);
-				res = 0;
-				break;
-			}
-			brelse(bh);
-			bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm);
-			if (!bh)
-				return -EIO;
-			if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC))
-				break;
-		}
+		res = hfs_parse_new_pmap(sb, data, part_start, part_size);
+		break;
+	default:
+		res = -ENOENT;
 		break;
-	  }
 	}
-	brelse(bh);
 
+	kfree(data);
 	return res;
 }
-- 
cgit v1.2.2


From 7dc4f001123f9ebe3b010a6c26acd18698ad205f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:37:57 +0100
Subject: hfsplus: make sure sync writes out all metadata

hfsplus stores all metadata except for the volume headers in special
inodes.  While these are marked hashed and periodically written out
by the flusher threads, we can't rely on that for sync.  For the case
of a data integrity sync the VM has life-lock avoidance code that
avoids writing inodes again that are redirtied during the sync,
which is something that can happen easily for hfsplus.  So make sure
we explicitly write out the metadata inodes at the beginning of
hfsplus_sync_fs.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index fe8f7bffbea5..df6bea02b2b4 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -164,6 +164,22 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 
 	sb->s_dirt = 0;
 
+	/*
+	 * Explicitly write out the special metadata inodes.
+	 *
+	 * While these special inodes are marked as hashed and written
+	 * out peridocically by the flusher threads we redirty them
+	 * during writeout of normal inodes, and thus the life lock
+	 * prevents us from getting the latest state to disk.
+	 */
+	error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+	if (!error)
+		error = error2;
+	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+	if (!error)
+		error = error2;
+
 	mutex_lock(&sbi->vh_mutex);
 	mutex_lock(&sbi->alloc_mutex);
 	vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
@@ -176,9 +192,11 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 		write_backup = 1;
 	}
 
-	error = hfsplus_submit_bio(sb->s_bdev,
+	error2 = hfsplus_submit_bio(sb->s_bdev,
 				   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
 				   sbi->s_vhdr, WRITE_SYNC);
+	if (!error)
+		error = error2;
 	if (!write_backup)
 		goto out;
 
-- 
cgit v1.2.2


From f02e26f8d90f8cde98314c72c2e890bc281a8346 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:02 +0100
Subject: hfsplus: avoid useless work in hfsplus_sync_fs

There is no reason to write out the metadata inodes or volume headers
during a non-blocking sync, as we are almost guaranteed to dirty them
again during the inode writeouts.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index df6bea02b2b4..56e6cf80c5e0 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -160,6 +160,9 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 	int write_backup = 0;
 	int error, error2;
 
+	if (!wait)
+		return 0;
+
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
 
 	sb->s_dirt = 0;
-- 
cgit v1.2.2


From 281469766bdde2d14bc73e1fec347e6dd7f63319 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:06 +0100
Subject: hfsplus: simplify fsync

Remove lots of code we don't need from fsync, we just need to call
->write_inode on the inode if it's dirty, for which sync_inode_metadata
is a lot more efficient than write_inode_now, and we need to write
out the various metadata inodes, which we now do explicitly instead
of by calling ->sync_fs.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/inode.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8afd7e84f98d..be68961da997 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -302,29 +302,28 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int hfsplus_file_fsync(struct file *filp, int datasync)
+static int hfsplus_file_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = filp->f_mapping->host;
-	struct super_block * sb;
-	int ret, err;
-
-	/* sync the inode to buffers */
-	ret = write_inode_now(inode, 0);
-
-	/* sync the superblock to buffers */
-	sb = inode->i_sb;
-	if (sb->s_dirt) {
-		if (!(sb->s_flags & MS_RDONLY))
-			hfsplus_sync_fs(sb, 1);
-		else
-			sb->s_dirt = 0;
-	}
+	struct inode *inode = file->f_mapping->host;
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+	int error, error2;
 
-	/* .. finally sync the buffers to disk */
-	err = sync_blockdev(sb->s_bdev);
-	if (!ret)
-		ret = err;
-	return ret;
+	/*
+	 * Sync inode metadata into the catalog and extent trees.
+	 */
+	sync_inode_metadata(inode, 1);
+
+	/*
+	 * And explicitly write out the btrees.
+	 */
+	error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+	if (!error)
+		error = error2;
+	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+	if (!error)
+		error = error2;
+	return error;
 }
 
 static const struct inode_operations hfsplus_file_inode_operations = {
-- 
cgit v1.2.2


From eb29d66d4f2dc98a81ae590bbdddc8cfa8964d73 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:10 +0100
Subject: hfsplus: write up fsync for directories

fsync is supposed to not just work on regular files, but also on
directories.  Fortunately enough hfsplus_file_fsync works just fine
for directories, so we can just wire it up.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c        | 1 +
 fs/hfsplus/hfsplus_fs.h | 1 +
 fs/hfsplus/inode.c      | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9d59c0571f59..e44c78a837e8 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -485,6 +485,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 };
 
 const struct file_operations hfsplus_dir_operations = {
+	.fsync		= hfsplus_file_fsync,
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
 	.unlocked_ioctl = hfsplus_ioctl,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 0d8532d96694..625549579e9e 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -352,6 +352,7 @@ int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
 int hfsplus_cat_write_inode(struct inode *);
 struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
+int hfsplus_file_fsync(struct file *file, int datasync);
 
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index be68961da997..8cd8dc2e7fdd 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -302,7 +302,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
-static int hfsplus_file_fsync(struct file *file, int datasync)
+int hfsplus_file_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
-- 
cgit v1.2.2


From b33b7921db14abcd10c30d0ccfc68e364f5ef7fe Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:13 +0100
Subject: hfsplus: split up inode flags

Split the flags field in the hfsplus inode into an extent_state
flag that is locked by the extent_lock, and a new flags field
that uses atomic bitops.  The second will grow more flags in the
next patch.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/extents.c    | 20 ++++++++++----------
 fs/hfsplus/hfsplus_fs.h | 18 ++++++++++++------
 fs/hfsplus/inode.c      |  9 ++++++---
 fs/hfsplus/super.c      |  1 +
 4 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0c9cb1820a52..06b4fc3151a3 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -95,24 +95,24 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
 				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
 
 	res = hfs_brec_find(fd);
-	if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
+	if (hip->extent_state & HFSPLUS_EXT_NEW) {
 		if (res != -ENOENT)
 			return;
 		hfs_brec_insert(fd, hip->cached_extents,
 				sizeof(hfsplus_extent_rec));
-		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
 	} else {
 		if (res)
 			return;
 		hfs_bnode_write(fd->bnode, hip->cached_extents,
 				fd->entryoffset, fd->entrylength);
-		hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+		hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
 	}
 }
 
 static void hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-	if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
+	if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
 		struct hfs_find_data fd;
 
 		hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
@@ -155,7 +155,7 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
 
 	WARN_ON(!mutex_is_locked(&hip->extents_lock));
 
-	if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
+	if (hip->extent_state & HFSPLUS_EXT_DIRTY)
 		__hfsplus_ext_write_extent(inode, fd);
 
 	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
@@ -167,7 +167,7 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
 		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
 	} else {
 		hip->cached_start = hip->cached_blocks = 0;
-		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
 	}
 	return res;
 }
@@ -429,7 +429,7 @@ int hfsplus_file_extend(struct inode *inode)
 					 start, len);
 		if (!res) {
 			hfsplus_dump_extent(hip->cached_extents);
-			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->extent_state |= HFSPLUS_EXT_DIRTY;
 			hip->cached_blocks += len;
 		} else if (res == -ENOSPC)
 			goto insert_extent;
@@ -450,7 +450,7 @@ insert_extent:
 	hip->cached_extents[0].start_block = cpu_to_be32(start);
 	hip->cached_extents[0].block_count = cpu_to_be32(len);
 	hfsplus_dump_extent(hip->cached_extents);
-	hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+	hip->extent_state |= HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW;
 	hip->cached_start = hip->alloc_blocks;
 	hip->cached_blocks = len;
 
@@ -513,12 +513,12 @@ void hfsplus_file_truncate(struct inode *inode)
 				     alloc_cnt - start, alloc_cnt - blk_cnt);
 		hfsplus_dump_extent(hip->cached_extents);
 		if (blk_cnt > start) {
-			hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+			hip->extent_state |= HFSPLUS_EXT_DIRTY;
 			break;
 		}
 		alloc_cnt = start;
 		hip->cached_start = hip->cached_blocks = 0;
-		hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 625549579e9e..9889d0033b8f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -171,7 +171,7 @@ struct hfsplus_inode_info {
 	u32 cached_blocks;
 	hfsplus_extent_rec first_extents;
 	hfsplus_extent_rec cached_extents;
-	unsigned long flags;
+	unsigned int extent_state;
 	struct mutex extents_lock;
 
 	/*
@@ -185,6 +185,11 @@ struct hfsplus_inode_info {
 	 */
 	u32 linkid;
 
+	/*
+	 * Accessed using atomic bitops.
+	 */
+	unsigned long flags;
+
 	/*
 	 * Protected by i_mutex.
 	 */
@@ -196,12 +201,13 @@ struct hfsplus_inode_info {
 	struct inode vfs_inode;
 };
 
-#define HFSPLUS_FLG_RSRC	0x0001
-#define HFSPLUS_FLG_EXT_DIRTY	0x0002
-#define HFSPLUS_FLG_EXT_NEW	0x0004
+#define HFSPLUS_EXT_DIRTY	0x0001
+#define HFSPLUS_EXT_NEW		0x0002
+
+#define HFSPLUS_I_RSRC		0	/* represents a resource fork */
 
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_RSRC(inode) \
+	test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
 
 struct hfs_find_data {
 	/* filled by caller */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8cd8dc2e7fdd..0946e2cdca5e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -190,7 +190,9 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 	inode->i_ino = dir->i_ino;
 	INIT_LIST_HEAD(&hip->open_dir_list);
 	mutex_init(&hip->extents_lock);
-	hip->flags = HFSPLUS_FLG_RSRC;
+	hip->extent_state = 0;
+	hip->flags = 0;
+	set_bit(HFSPLUS_I_RSRC, &hip->flags);
 
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	err = hfsplus_find_cat(sb, dir->i_ino, &fd);
@@ -369,6 +371,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	INIT_LIST_HEAD(&hip->open_dir_list);
 	mutex_init(&hip->extents_lock);
 	atomic_set(&hip->opencnt, 0);
+	hip->extent_state = 0;
 	hip->flags = 0;
 	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
 	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
@@ -498,8 +501,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
 					sizeof(struct hfsplus_cat_file));
 
-		hfsplus_inode_read_fork(inode, HFSPLUS_IS_DATA(inode) ?
-					&file->data_fork : &file->rsrc_fork);
+		hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ?
+					&file->rsrc_fork : &file->data_fork);
 		hfsplus_get_perms(inode, &file->permissions, 0);
 		inode->i_nlink = 1;
 		if (S_ISREG(inode->i_mode)) {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 56e6cf80c5e0..985423728e1d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -66,6 +66,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 	INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
 	mutex_init(&HFSPLUS_I(inode)->extents_lock);
 	HFSPLUS_I(inode)->flags = 0;
+	HFSPLUS_I(inode)->extent_state = 0;
 	HFSPLUS_I(inode)->rsrc_inode = NULL;
 	atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
 
-- 
cgit v1.2.2


From e34947056076ca5467ee8256d2d9cbc594a79b37 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:15 +0100
Subject: hfsplus: optimize fsync

Avoid doing unessecary work in fsync.  Do nothing unless the inode
was marked dirty, and only write the various metadata inodes out if
they contain any dirty state from this inode.  This is archived by
adding three new dirty bits to the hfsplus-specific inode which are
set in the correct places.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/catalog.c    | 10 ++++++----
 fs/hfsplus/extents.c    | 30 +++++++++++++++++++++++-------
 fs/hfsplus/hfsplus_fs.h | 37 ++++++++++++++++++++++++++-----------
 fs/hfsplus/inode.c      | 27 +++++++++++++++++++--------
 fs/hfsplus/ioctl.c      |  4 +++-
 fs/hfsplus/super.c      |  2 +-
 6 files changed, 78 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 8af45fc5b051..0aa40b36a9e7 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -227,7 +227,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
 
 	dir->i_size++;
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(dir);
+	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
+
 	hfs_find_exit(&fd);
 	return 0;
 
@@ -308,7 +309,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 
 	dir->i_size--;
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(dir);
+	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
 out:
 	hfs_find_exit(&fd);
 
@@ -353,7 +354,6 @@ int hfsplus_rename_cat(u32 cnid,
 		goto out;
 	dst_dir->i_size++;
 	dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(dst_dir);
 
 	/* finally remove the old entry */
 	hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
@@ -365,7 +365,6 @@ int hfsplus_rename_cat(u32 cnid,
 		goto out;
 	src_dir->i_size--;
 	src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(src_dir);
 
 	/* remove old thread entry */
 	hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
@@ -387,6 +386,9 @@ int hfsplus_rename_cat(u32 cnid,
 		goto out;
 	}
 	err = hfs_brec_insert(&dst_fd, &entry, entry_size);
+
+	hfsplus_mark_inode_dirty(dst_dir, HFSPLUS_I_CAT_DIRTY);
+	hfsplus_mark_inode_dirty(src_dir, HFSPLUS_I_CAT_DIRTY);
 out:
 	hfs_bnode_put(dst_fd.bnode);
 	hfs_find_exit(&src_fd);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 06b4fc3151a3..b1127ef26750 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -108,6 +108,14 @@ static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data
 				fd->entryoffset, fd->entrylength);
 		hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
 	}
+
+	/*
+	 * We can't just use hfsplus_mark_inode_dirty here, because we
+	 * also get called from hfsplus_write_inode, which should not
+	 * redirty the inode.  Instead the callers have to be careful
+	 * to explicily mark the inode dirty, too.
+	 */
+	set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
 }
 
 static void hfsplus_ext_write_extent_locked(struct inode *inode)
@@ -197,6 +205,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res = -EIO;
 	u32 ablock, dblock, mask;
+	int was_dirty = 0;
 	int shift;
 
 	/* Convert inode block to disk allocation block */
@@ -223,14 +232,20 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		return -EIO;
 
 	mutex_lock(&hip->extents_lock);
+
+	/*
+	 * hfsplus_ext_read_extent will write out a cached extent into
+	 * the extents btree.  In that case we may have to mark the inode
+	 * dirty even for a pure read of an extent here.
+	 */
+	was_dirty = (hip->extent_state & HFSPLUS_EXT_DIRTY);
 	res = hfsplus_ext_read_extent(inode, ablock);
-	if (!res) {
-		dblock = hfsplus_ext_find_block(hip->cached_extents,
-						ablock - hip->cached_start);
-	} else {
+	if (res) {
 		mutex_unlock(&hip->extents_lock);
 		return -EIO;
 	}
+	dblock = hfsplus_ext_find_block(hip->cached_extents,
+					ablock - hip->cached_start);
 	mutex_unlock(&hip->extents_lock);
 
 done:
@@ -242,8 +257,9 @@ done:
 		hip->phys_size += sb->s_blocksize;
 		hip->fs_blocks++;
 		inode_add_bytes(inode, sb->s_blocksize);
-		mark_inode_dirty(inode);
 	}
+	if (create || was_dirty)
+		mark_inode_dirty(inode);
 	return 0;
 }
 
@@ -438,7 +454,7 @@ out:
 	mutex_unlock(&hip->extents_lock);
 	if (!res) {
 		hip->alloc_blocks += len;
-		mark_inode_dirty(inode);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
 	}
 	return res;
 
@@ -529,5 +545,5 @@ out:
 	hip->phys_size = inode->i_size;
 	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
-	mark_inode_dirty(inode);
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 9889d0033b8f..65c698f78ef7 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -157,6 +157,11 @@ struct hfsplus_sb_info {
 #define HFSPLUS_SB_HFSX		3
 #define HFSPLUS_SB_CASEFOLD	4
 
+static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
 
 struct hfsplus_inode_info {
 	atomic_t opencnt;
@@ -205,10 +210,31 @@ struct hfsplus_inode_info {
 #define HFSPLUS_EXT_NEW		0x0002
 
 #define HFSPLUS_I_RSRC		0	/* represents a resource fork */
+#define HFSPLUS_I_CAT_DIRTY	1	/* has changes in the catalog tree */
+#define HFSPLUS_I_EXT_DIRTY	2	/* has changes in the extent tree */
+#define HFSPLUS_I_ALLOC_DIRTY	3	/* has changes in the allocation file */
 
 #define HFSPLUS_IS_RSRC(inode) \
 	test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
 
+static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
+{
+	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+}
+
+/*
+ * Mark an inode dirty, and also mark the btree in which the
+ * specific type of metadata is stored.
+ * For data or metadata that gets written back by into the catalog btree
+ * by hfsplus_write_inode a plain mark_inode_dirty call is enough.
+ */
+static inline void hfsplus_mark_inode_dirty(struct inode *inode,
+		unsigned int flag)
+{
+	set_bit(flag, &HFSPLUS_I(inode)->flags);
+	mark_inode_dirty(inode);
+}
+
 struct hfs_find_data {
 	/* filled by caller */
 	hfsplus_btree_key *search_key;
@@ -397,17 +423,6 @@ int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 int hfsplus_submit_bio(struct block_device *bdev, sector_t sector,
 		void *data, int rw);
 
-/* access macros */
-static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
-{
-	return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
-}
-
 /* time macros */
 #define __hfsp_mt2ut(t)		(be32_to_cpu(t) - 2082844800U)
 #define __hfsp_ut2mt(t)		(cpu_to_be32(t + 2082844800U))
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0946e2cdca5e..bf6535b73261 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -307,8 +307,9 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 int hfsplus_file_fsync(struct file *file, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
-	int error, error2;
+	int error = 0, error2;
 
 	/*
 	 * Sync inode metadata into the catalog and extent trees.
@@ -318,13 +319,21 @@ int hfsplus_file_fsync(struct file *file, int datasync)
 	/*
 	 * And explicitly write out the btrees.
 	 */
-	error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
-	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
-	if (!error)
-		error = error2;
-	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
-	if (!error)
-		error = error2;
+	if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags))
+		error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
+
+	if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
+		error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+		if (!error)
+			error = error2;
+	}
+
+	if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
+		error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
+		if (!error)
+			error = error2;
+	}
+
 	return error;
 }
 
@@ -590,6 +599,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
 		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
 					 sizeof(struct hfsplus_cat_file));
 	}
+
+	set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
 out:
 	hfs_find_exit(&fd);
 	return 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 40a85a3ded6e..f5a7224f4cbb 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -147,9 +147,11 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 			res = -ERANGE;
 	} else
 		res = -EOPNOTSUPP;
-	if (!res)
+	if (!res) {
 		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
 				sizeof(struct hfsplus_cat_file));
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	}
 out:
 	hfs_find_exit(&fd);
 	return res;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 985423728e1d..036650123c4c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -472,7 +472,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 				   &str, sbi->hidden_dir);
 		mutex_unlock(&sbi->vh_mutex);
 
-		mark_inode_dirty(sbi->hidden_dir);
+		hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
 	}
 out:
 	unload_nls(sbi->nls);
-- 
cgit v1.2.2


From 34a2d313c51f47cae50ccb89f4196462665f2c48 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Tue, 23 Nov 2010 14:38:21 +0100
Subject: hfsplus: flush disk caches in sync and fsync

Flush the disk cache in fsync and sync to make sure data actually is
on disk on completion of these system calls.  There is a nobarrier
mount option to disable this behaviour.  It's slightly misnamed now
that barrier actually are gone, but it matches the name used by all
major filesystems.

Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/hfsplus_fs.h |  1 +
 fs/hfsplus/inode.c      |  4 ++++
 fs/hfsplus/options.c    | 11 +++++++++++
 fs/hfsplus/super.c      |  5 +++++
 4 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 65c698f78ef7..17ac254e72cb 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -156,6 +156,7 @@ struct hfsplus_sb_info {
 #define HFSPLUS_SB_FORCE	2
 #define HFSPLUS_SB_HFSX		3
 #define HFSPLUS_SB_CASEFOLD	4
+#define HFSPLUS_SB_NOBARRIER	5
 
 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
 {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index bf6535b73261..bda7464c205d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -8,6 +8,7 @@
  * Inode handling routines
  */
 
+#include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -334,6 +335,9 @@ int hfsplus_file_fsync(struct file *file, int datasync)
 			error = error2;
 	}
 
+	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+
 	return error;
 }
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 43b02b5525eb..dbd9d0c426cb 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -23,6 +23,7 @@ enum {
 	opt_umask, opt_uid, opt_gid,
 	opt_part, opt_session, opt_nls,
 	opt_nodecompose, opt_decompose,
+	opt_barrier, opt_nobarrier,
 	opt_force, opt_err
 };
 
@@ -37,6 +38,8 @@ static const match_table_t tokens = {
 	{ opt_nls, "nls=%s" },
 	{ opt_decompose, "decompose" },
 	{ opt_nodecompose, "nodecompose" },
+	{ opt_barrier, "barrier" },
+	{ opt_nobarrier, "nobarrier" },
 	{ opt_force, "force" },
 	{ opt_err, NULL }
 };
@@ -174,6 +177,12 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 		case opt_nodecompose:
 			set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
 			break;
+		case opt_barrier:
+			clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+			break;
+		case opt_nobarrier:
+			set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+			break;
 		case opt_force:
 			set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
 			break;
@@ -212,5 +221,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",nls=%s", sbi->nls->charset);
 	if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
 		seq_printf(seq, ",nodecompose");
+	if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		seq_printf(seq, ",nobarrier");
 	return 0;
 }
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 036650123c4c..154478c71f25 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
@@ -212,6 +213,10 @@ int hfsplus_sync_fs(struct super_block *sb, int wait)
 out:
 	mutex_unlock(&sbi->alloc_mutex);
 	mutex_unlock(&sbi->vh_mutex);
+
+	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
 	return error;
 }
 
-- 
cgit v1.2.2


From d2115778c7ea0df2201f1ad9aab948c49ffa1078 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Nov 2010 19:58:53 +0000
Subject: GFS2: Change two WQ_RESCUERs into WQ_MEM_RECLAIM

The WQ_RESCUER flag should only be used internally to the
workqueue implementation.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
 fs/gfs2/glock.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f92c17704169..5a56568d289b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1782,12 +1782,13 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
 					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
-						WQ_FREEZEABLE, 0);
+	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+						WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+						0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
 		return PTR_ERR(gfs2_delete_workqueue);
-- 
cgit v1.2.2


From cc18152eb7c27653199546bd14e991a451ab8d1b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 5 Nov 2010 16:12:36 -0700
Subject: GFS2: fs/gfs2/glock.c: Convert sprintf_symbol to %pS

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5a56568d289b..13155f60b59d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1626,18 +1626,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
 	struct task_struct *gh_owner = NULL;
-	char buffer[KSYM_SYMBOL_LEN];
 	char flags_buf[32];
 
-	sprint_symbol(buffer, gh->gh_ip);
 	if (gh->gh_owner_pid)
 		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-		  state2str(gh->gh_state),
-		  hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-		  gh->gh_error, 
-		  gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-		  gh_owner ? gh_owner->comm : "(ended)", buffer);
+	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+		       state2str(gh->gh_state),
+		       hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+		       gh->gh_error,
+		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+		       gh_owner ? gh_owner->comm : "(ended)",
+		       (void *)gh->gh_ip);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 9e55cd53728719ac3a3234a6618259ab8e203a10 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 9 Nov 2010 14:09:53 +0000
Subject: GFS2: Remove unreachable calls to vmtruncate

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     | 9 ---------
 fs/gfs2/ops_inode.c | 9 ---------
 fs/gfs2/xattr.c     | 9 ---------
 3 files changed, 27 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f9217..14e682dbe8bf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 	if (error)
 		return error;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
-
-	gfs2_assert_warn(GFS2_SB(inode), !error);
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..457ecd155187 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1104,17 +1104,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		gfs2_assert_warn(sdp, !error);
-	}
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
-
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a6..ecfd40dd28ad 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1325,17 +1325,8 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 	if (error)
 		goto out_trans_end;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = vmtruncate(inode, attr->ia_size);
-		gfs2_assert_warn(GFS2_SB(inode), !error);
-	}
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
-
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
-- 
cgit v1.2.2


From 2ae51ed7b548c1d943d080da617515e801ea5c3e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Nov 2010 15:14:57 +0000
Subject: GFS2: Clean up duplicated setattr code

While preparing the last patch I noticed that the gfs2_setattr_simple
code had been duplicated into two other places. This patch updates
those to call gfs2_setattr_simple rather than open coding it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c |  9 +--------
 fs/gfs2/xattr.c     | 14 +-------------
 2 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 457ecd155187..1db6b7343229 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct buffer_head *dibh;
 	u32 ouid, ogid, nuid, ngid;
 	int error;
 
@@ -1100,16 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_gunlock_q;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
+	error = gfs2_setattr_simple(ip, attr);
 	if (error)
 		goto out_end_trans;
 
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-
 	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
 		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
 		gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index ecfd40dd28ad..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
-	struct buffer_head *dibh;
 	int error;
 
 	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,17 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 	if (error)
 		return error;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out_trans_end;
-
-	setattr_copy(inode, attr);
-	mark_inode_dirty(inode);
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-
-out_trans_end:
+	error = gfs2_setattr_simple(ip, attr);
 	gfs2_trans_end(sdp);
 	return error;
 }
-- 
cgit v1.2.2


From 5e69069c1afb655b5f1a154856ccdb4bb7327b81 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 9 Nov 2010 16:35:20 -0800
Subject: GFS2: fs/gfs2/glock.c: Use printf extension %pV

Using %pV reduces the number of printk calls and
eliminates any possible message interleaving from
other printk calls.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13155f60b59d..a9b53a48abeb 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -951,17 +951,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
 
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
+	struct va_format vaf;
 	va_list args;
 
 	va_start(args, fmt);
+
 	if (seq) {
 		struct gfs2_glock_iter *gi = seq->private;
 		vsprintf(gi->string, fmt, args);
 		seq_printf(seq, gi->string);
 	} else {
-		printk(KERN_ERR " ");
-		vprintk(fmt, args);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		printk(KERN_ERR " %pV", &vaf);
 	}
+
 	va_end(args);
 }
 
-- 
cgit v1.2.2


From edc221d00bd5c6da0e5c67701f3782b72796619f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 10 Nov 2010 13:19:06 -0800
Subject: GFS2: fs/gfs2/glock.h: Add __attribute__((format(printf,2,3)) to
 gfs2_print_dbg

Functions that use printf formatting, especially
those that use %pV, should have their uses of
printf format and arguments checked by the compiler.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d220..a12d11767752 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -212,6 +212,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
-- 
cgit v1.2.2


From 802ec9b6682349d9d9c92a9e55f44324d2954f41 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Thu, 18 Nov 2010 11:26:46 -0500
Subject: GFS2: Allow gfs2 to update quota usage values through the quotactl
 interface

With this patch the gfs2_set_dqblk() function will be able to update the
quota usage block count (FS_DQ_BCOUNT) in addition to the already supported
FS_DQ_BHARD (limit) and FS_DQ_BSOFT (warn) fields of the dquot structure.

Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index f606baf9ba72..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
 			qd->qd_qb.qb_limit = qp->qu_limit;
 		}
+		if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+			qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+			qd->qd_qb.qb_value = qp->qu_value;
+		}
 	}
 
 	/* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
 }
 
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
 
 static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 			  struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 	if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
 	    ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
 		fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
 	if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
 	    ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
 		fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+	if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+	    ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+		fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
 	if (fdq->d_fieldmask == 0)
 		goto out_i;
 
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
 	.get_dqblk	= gfs2_get_dqblk,
 	.set_dqblk	= gfs2_set_dqblk,
 };
-
-- 
cgit v1.2.2


From 921169ca2f7c8a0a2ccda2ce33c465dfe3ae30ef Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 29 Nov 2010 12:50:38 +0000
Subject: GFS2: Clean up of gdlm_lock function

The DLM never returns -EAGAIN in response to dlm_lock(), and even
if it did, the test in gdlm_lock() was wrong anyway. Once that
test is removed, it is possible to greatly simplify this code
by simply using a "normal" error return code (0 for success).

We then no longer need the LM_OUT_ASYNC return code which can
be removed.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c    | 31 +++++++++----------------------
 fs/gfs2/glock.h    | 12 ++++--------
 fs/gfs2/lock_dlm.c | 14 ++++----------
 3 files changed, 17 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a9b53a48abeb..2dd1d7238111 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
 	spin_unlock(&gl->gl_spin);
 }
 
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-				 unsigned int req_state,
-				 unsigned int flags)
-{
-	int ret = LM_OUT_ERROR;
-
-	if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-		return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-							 req_state, flags);
-	return ret;
-}
-
 /**
  * do_xmote - Calls the DLM to change the state of a lock
  * @gl: The lock state
@@ -575,8 +560,8 @@ __acquires(&gl->gl_spin)
 
 	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
 		      LM_FLAG_PRIORITY);
-	BUG_ON(gl->gl_state == target);
-	BUG_ON(gl->gl_state == gl->gl_target);
+	GLOCK_BUG_ON(gl, gl->gl_state == target);
+	GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
 	if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
 	    glops->go_inval) {
 		set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
@@ -594,15 +579,17 @@ __acquires(&gl->gl_spin)
 	    gl->gl_state == LM_ST_DEFERRED) &&
 	    !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 		lck_flags |= LM_FLAG_TRY_1CB;
-	ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
-	if (!(ret & LM_OUT_ASYNC)) {
-		finish_xmote(gl, ret);
+	if (sdp->sd_lockstruct.ls_ops->lm_lock)	{
+		/* lock_dlm */
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+		GLOCK_BUG_ON(gl, ret);
+	} else { /* lock_nolock */
+		finish_xmote(gl, target);
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 			gfs2_glock_put(gl);
-	} else {
-		GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
 	}
+
 	spin_lock(&gl->gl_spin);
 }
 
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a12d11767752..ba6865c6e7e0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -91,7 +91,7 @@ enum {
 #define GL_NOCACHE		0x00000400
   
 /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
  *
  * LM_OUT_ST_MASK
  * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +99,11 @@ enum {
  * LM_OUT_CANCELED
  * The lock request was canceled.
  *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
  */
 
 #define LM_OUT_ST_MASK		0x00000003
 #define LM_OUT_CANCELED		0x00000008
-#define LM_OUT_ASYNC		0x00000080
-#define LM_OUT_ERROR		0x00000100
+#define LM_OUT_ERROR		0x00000004
 
 /*
  * lm_recovery_done() messages
@@ -124,8 +120,8 @@ struct lm_lockops {
  	void (*lm_unmount) (struct gfs2_sbd *sdp);
 	void (*lm_withdraw) (struct gfs2_sbd *sdp);
 	void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-	unsigned int (*lm_lock) (struct gfs2_glock *gl,
-				 unsigned int req_state, unsigned int flags);
+	int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+			unsigned int flags);
 	void (*lm_cancel) (struct gfs2_glock *gl);
 	const match_table_t *lm_tokens;
 };
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45fd..f40ce34c803e 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,11 +146,10 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
 	return lkf;
 }
 
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
-			      unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+		     unsigned int flags)
 {
 	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-	int error;
 	int req;
 	u32 lkf;
 
@@ -162,13 +161,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
 	 * Submit the actual lock request.
 	 */
 
-	error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-			 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-	if (error == -EAGAIN)
-		return 0;
-	if (error)
-		return LM_OUT_ERROR;
-	return LM_OUT_ASYNC;
+	return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+			GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
 }
 
 static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
-- 
cgit v1.2.2


From 9833c39400c3e6ee19daeded6910df648741611e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@iki.fi>
Date: Fri, 19 Nov 2010 09:29:24 +0100
Subject: ARM: 6485/5: proc/vmcore - allow archs to override
 vmcore_elf_check_arch()

Allow architectures to redefine this macro if needed. This is useful for
example in architectures where 64-bit ELF vmcores are not supported.
Specifying zero vmcore_elf64_check_arch() allows compiler to optimize
away unnecessary parts of parse_crash_elf64_headers().

We also rename the macro to vmcore_elf64_check_arch() to reflect that it
is used for 64-bit vmcores only.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 fs/proc/vmcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70bc..74802bc5ded9 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
 	/* Do some basic Verification. */
 	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
 		(ehdr.e_type != ET_CORE) ||
-		!vmcore_elf_check_arch(&ehdr) ||
+		!vmcore_elf64_check_arch(&ehdr) ||
 		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
 		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
 		ehdr.e_version != EV_CURRENT ||
-- 
cgit v1.2.2


From 5091faa449ee0b7d73bc296a93bca9540fc51d0a Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 30 Nov 2010 14:18:03 +0100
Subject: sched: Add 'autogroup' scheduling feature: automated per session task
 groups

A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity.  This patch
implements an idea from Linus, to automatically create task
groups.  Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.

Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group.  When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the preveious task group is dropped.  Child
processes inherit this task group thereafter, and increase it's
refcount.  When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.

At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.

Autogroup bandwidth is controllable via setting it's nice level
through the proc filesystem:

  cat /proc/<pid>/autogroup

Displays the task's group and the group's nice level.

  echo <nice level> > /proc/<pid>/autogroup

Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.

The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:

  echo [01] > /proc/sys/kernel/sched_autogroup_enabled

... which will automatically move tasks to/from the root task group.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/base.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461ec..2fa0ce29b6dc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_autogroup_show_task(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+	    size_t count, loff_t *offset)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct task_struct *p;
+	char buffer[PROC_NUMBUF];
+	long nice;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = strict_strtol(strstrip(buffer), 0, &nice);
+	if (err)
+		return -EINVAL;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	err = nice;
+	err = proc_sched_autogroup_set_nice(p, &err);
+	if (err)
+		count = err;
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = single_open(filp, sched_autogroup_show, NULL);
+	if (!ret) {
+		struct seq_file *m = filp->private_data;
+
+		m->private = inode;
+	}
+	return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+	.open		= sched_autogroup_open,
+	.read		= seq_read,
+	.write		= sched_autogroup_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
@@ -2732,6 +2808,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("limits",	  S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-- 
cgit v1.2.2


From 0b1246e6776c79719ff4a3afd9c38fba99b99d5a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 30 Nov 2010 15:33:04 +0000
Subject: GFS2: Remove duplicate #defines from glock.h

There are a number of duplicated #defines in glock.h
plus one which is unused. This removes the extra
definitions.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index ba6865c6e7e0..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,7 +87,6 @@ enum {
 #define GL_ASYNC		0x00000040
 #define GL_EXACT		0x00000080
 #define GL_SKIP			0x00000100
-#define GL_ATIME		0x00000200
 #define GL_NOCACHE		0x00000400
   
 /*
@@ -126,19 +125,6 @@ struct lm_lockops {
 	const match_table_t *lm_tokens;
 };
 
-#define LM_FLAG_TRY		0x00000001
-#define LM_FLAG_TRY_1CB		0x00000002
-#define LM_FLAG_NOEXP		0x00000004
-#define LM_FLAG_ANY		0x00000008
-#define LM_FLAG_PRIORITY	0x00000010
-
-#define GL_ASYNC		0x00000040
-#define GL_EXACT		0x00000080
-#define GL_SKIP			0x00000100
-#define GL_NOCACHE		0x00000400
-
-#define GLR_TRYFAILED		13
-
 extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
-- 
cgit v1.2.2


From 0489b3f5eba735413ccedd425651cf41d6b1f7c5 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 30 Nov 2010 09:38:35 -0600
Subject: GFS2: reread rindex when necessary to grow rindex

When GFS2 grew the filesystem, it was never rereading the rindex file during
the grow. This is necessary for large grows when the filesystem is almost full,
and GFS2 needs to use some of the space allocated earlier in the grow to
complete it.  Now, if GFS2 fails to reserve the necessary space and the rindex
file is not uptodate, it rereads it.  Also, the only difference between
gfs2_ri_update() and gfs2_ri_update_special() was that gfs2_ri_update_special()
didn't clear out the existing resource groups, since you knew that it was only
called when there were no resource groups.  Attempting to clear out the
resource groups when there are none takes almost no time, and rarely happens,
so I simply removed gfs2_ri_update_special().

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 53 +++++++++++------------------------------------------
 1 file changed, 11 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 33c8407b876f..6375b3860a9a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -613,46 +613,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	return 0;
 }
 
-/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct inode *inode = &ip->i_inode;
-	struct file_ra_state ra_state;
-	struct gfs2_rgrpd *rgd;
-	unsigned int max_data = 0;
-	int error;
-
-	file_ra_state_init(&ra_state, inode->i_mapping);
-	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-		/* Ignore partials */
-		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    i_size_read(inode))
-			break;
-		error = read_rindex_entry(ip, &ra_state);
-		if (error) {
-			clear_rgrpdi(sdp);
-			return error;
-		}
-	}
-	list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-		if (rgd->rd_data > max_data)
-			max_data = rgd->rd_data;
-	sdp->sd_max_rg_data = max_data;
-
-	sdp->sd_rindex_uptodate = 1;
-	return 0;
-}
-
 /**
  * gfs2_rindex_hold - Grab a lock on the rindex
  * @sdp: The GFS2 superblock
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 			error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
 		else if (!sdp->sd_rgrps) /* We may not have the rindex read
 					    in, so: */
-			error = gfs2_ri_update_special(ip);
+			error = gfs2_ri_update(ip);
 		if (error)
 			return error;
 	}
 
+try_again:
 	do {
 		error = get_local_rgrp(ip, &last_unlinked);
 		/* If there is no space, flushing the log may release some */
-		if (error)
+		if (error) {
+			if (ip == GFS2_I(sdp->sd_rindex) &&
+			    !sdp->sd_rindex_uptodate) {
+				error = gfs2_ri_update(ip);
+				if (error)
+					return error;
+				goto try_again;
+			}
 			gfs2_log_flush(sdp, NULL);
+		}
 	} while (error && tries++ < 3);
 
 	if (error) {
-- 
cgit v1.2.2


From 086d8334cf73b3bb695b82dd864a7a8b00d96b7e Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 23 Nov 2010 23:52:55 -0600
Subject: GFS2: fix recursive locking during rindex truncates

When you truncate the rindex file, you need to avoid calling gfs2_rindex_hold,
since you already hold it.  However, if you haven't already read in the
resource groups, you need to do that.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 9 +++++++--
 fs/gfs2/rgrp.c | 2 +-
 fs/gfs2/rgrp.h | 1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4ee..ae7d205de0d1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (metadata)
 		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
 
-	error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+	if (ip != GFS2_I(sdp->sd_rindex))
+		error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+	else if (!sdp->sd_rgrps)
+		error = gfs2_ri_update(ip);
+
 	if (error)
 		return error;
 
@@ -879,7 +883,8 @@ out_rg_gunlock:
 out_rlist:
 	gfs2_rlist_free(&rlist);
 out:
-	gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+	if (ip != GFS2_I(sdp->sd_rindex))
+		gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6375b3860a9a..25dbe5c84dff 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
  * Returns: 0 on successful update, error code otherwise
  */
 
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9a..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
-- 
cgit v1.2.2


From e06dfc492870e1d380f02722cde084b724dc197b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 30 Nov 2010 15:46:02 +0000
Subject: GFS2: Fix uninitialised error value in previous patch

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ae7d205de0d1..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	int metadata;
 	unsigned int revokes = 0;
 	int x;
-	int error;
+	int error = 0;
 
 	if (!*top)
 		sm->sm_first = 0;
-- 
cgit v1.2.2


From 47a25380e37f44db7202093ca92e4af569c34f55 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 30 Nov 2010 15:49:31 +0000
Subject: GFS2: Merge glock state fields into a bitfield

We can only merge the fields into a bitfield if the locking
rules for them are the same. In this case gl_spin covers all
of the fields (write side) but a couple of them are used
with GLF_LOCK as the read side lock, which should be ok
since we know that the field in question won't be changing
at the time.

The gl_req setting has to be done earlier (in glock.c) in order
to place it under gl_spin. The gl_reply setting also has to be
brought under gl_spin in order to comply with the new rules.

This saves 4*sizeof(unsigned int) per glock.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c    |  9 +++++++--
 fs/gfs2/incore.h   | 12 +++++++-----
 fs/gfs2/lock_dlm.c |  1 -
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 2dd1d7238111..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -567,6 +567,7 @@ __acquires(&gl->gl_spin)
 		set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
 		do_error(gl, 0); /* Fail queued try locks */
 	}
+	gl->gl_req = target;
 	spin_unlock(&gl->gl_spin);
 	if (glops->go_xmote_th)
 		glops->go_xmote_th(gl);
@@ -1353,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
  * @gl: Pointer to the glock
  * @ret: The return value from the dlm
  *
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
  */
 
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
 	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
 
+	spin_lock(&gl->gl_spin);
 	gl->gl_reply = ret;
 
 	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-		spin_lock(&gl->gl_spin);
 		if (gfs2_should_freeze(gl)) {
 			set_bit(GLF_FROZEN, &gl->gl_flags);
 			spin_unlock(&gl->gl_spin);
 			return;
 		}
-		spin_unlock(&gl->gl_spin);
 	}
+
+	spin_unlock(&gl->gl_spin);
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	smp_wmb();
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc8..8d3d2b4a0a7d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
 
 	spinlock_t gl_spin;
 
-	unsigned int gl_state;
-	unsigned int gl_target;
-	unsigned int gl_reply;
+	/* State fields protected by gl_spin */
+	unsigned int gl_state:2,	/* Current state */
+		     gl_target:2,	/* Target state */
+		     gl_demote_state:2,	/* State requested by remote node */
+		     gl_req:2,		/* State in last dlm request */
+		     gl_reply:8;	/* Last reply from the dlm */
+
 	unsigned int gl_hash;
-	unsigned int gl_req;
-	unsigned int gl_demote_state; /* state requested by remote node */
 	unsigned long gl_demote_time; /* time of first demote request */
 	struct list_head gl_holders;
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index f40ce34c803e..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -153,7 +153,6 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
 	int req;
 	u32 lkf;
 
-	gl->gl_req = req_state;
 	req = make_mode(req_state);
 	lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
 
-- 
cgit v1.2.2


From bcd7278d8a423a255e45f4d10afe564328f1885f Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 7 Dec 2010 13:58:56 -0500
Subject: GFS2: fsck.gfs2 reported statfs error after gfs2_grow

When you do gfs2_grow it failed to take the very last
rgrp into account when adding up the new free space due
to an off-by-one error.  It was not reading the last
rgrp from the rindex because of a check for "<=" that
should have been "<".  Therefore, fsck.gfs2 was finding
(and fixing) an error with the system statfs file.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/rgrp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 25dbe5c84dff..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
-- 
cgit v1.2.2


From afe2c511fb2d75f1515081ff1be15bd79cfe722d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 14 Dec 2010 16:21:17 +0100
Subject: workqueue: convert cancel_rearming_delayed_work[queue]() users to
 cancel_delayed_work_sync()

cancel_rearming_delayed_work[queue]() has been superceded by
cancel_delayed_work_sync() quite some time ago.  Convert all the
in-kernel users.  The conversions are completely equivalent and
trivial.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: netdev@vger.kernel.org
Cc: Anton Vorontsov <cbou@mail.ru>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Alex Elder <aelder@sgi.com>
Cc: xfs-masters@oss.sgi.com
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: netfilter-devel@vger.kernel.org
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: linux-nfs@vger.kernel.org
---
 fs/nfsd/nfs4state.c    | 2 +-
 fs/xfs/xfs_mru_cache.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 116cab970e0f..fbd18c3074bb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4336,7 +4336,7 @@ __nfs4_state_shutdown(void)
 void
 nfs4_state_shutdown(void)
 {
-	cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+	cancel_delayed_work_sync(&laundromat_work);
 	destroy_workqueue(laundry_wq);
 	locks_end_grace(&nfsd4_manager);
 	nfs4_lock_state();
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..edfa178bafb6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
 	spin_lock(&mru->lock);
 	if (mru->queued) {
 		spin_unlock(&mru->lock);
-		cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+		cancel_delayed_work_sync(&mru->work);
 		spin_lock(&mru->lock);
 	}
 
-- 
cgit v1.2.2


From 846f40455276617275284a4b76b89311b4aed0b9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 16 Dec 2010 15:18:48 +0000
Subject: GFS2: Don't flush delete workqueue when releasing the transaction
 lock

There is no requirement to flush the delete workqueue before a
gfs2 filesystem is suspended. The workqueue's work will just
be suspended along with the rest of the tasks on the filesystem.

The resolves a deadlock situation where the transaction lock's
demotion code was trying to flush the delete workqueue while at
the same time, the workqueue was waiting for the transaction
lock.

The delete workqueue is flushed by gfs2_make_fs_ro() already, so
that umount/remount are correctly protected anyway.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e5..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
 
 	if (gl->gl_state != LM_ST_UNLOCKED &&
 	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		flush_workqueue(gfs2_delete_workqueue);
 		gfs2_meta_syncfs(sdp);
 		gfs2_log_shutdown(sdp);
 	}
-- 
cgit v1.2.2


From 596276c3571e2108f4b336be545ece2eacf3da59 Mon Sep 17 00:00:00 2001
From: Anton Salikhmetov <alexo@tuxera.com>
Date: Thu, 16 Dec 2010 14:44:51 +0200
Subject: hfsplus: fix an artifact in ioctl flag checking

Fix a flag checking artifact in hfsplus_ioctl_getflags() routine
found while doing clean-up against assignments inside `if's.

Signed-off-by: Anton Salikhmetov <alexo@tuxera.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f5a7224f4cbb..508ce662ce12 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -28,7 +28,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 
 	if (inode->i_flags & S_IMMUTABLE)
 		flags |= FS_IMMUTABLE_FL;
-	if (inode->i_flags |= S_APPEND)
+	if (inode->i_flags & S_APPEND)
 		flags |= FS_APPEND_FL;
 	if (hip->userflags & HFSPLUS_FLG_NODUMP)
 		flags |= FS_NODUMP_FL;
-- 
cgit v1.2.2


From 2753cc281c9a0e8a0a45ee2b8110866a9fe63bdd Mon Sep 17 00:00:00 2001
From: Anton Salikhmetov <alexo@tuxera.com>
Date: Thu, 16 Dec 2010 18:08:38 +0200
Subject: hfsplus: over 80 character lines clean-up

Match coding style line length limitation where checkpatch.pl
reported over-80-character-line warnings.

Signed-off-by: Anton Salikhmetov <alexo@tuxera.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bfind.c       |  6 ++--
 fs/hfsplus/bitmap.c      |  3 +-
 fs/hfsplus/bnode.c       | 41 +++++++++++++++++---------
 fs/hfsplus/brec.c        | 26 +++++++++++------
 fs/hfsplus/btree.c       | 31 ++++++++++++++------
 fs/hfsplus/catalog.c     | 75 ++++++++++++++++++++++++++++++++----------------
 fs/hfsplus/dir.c         | 34 +++++++++++++++-------
 fs/hfsplus/extents.c     | 28 ++++++++++++------
 fs/hfsplus/hfsplus_fs.h  | 30 ++++++++++++-------
 fs/hfsplus/hfsplus_raw.h |  3 +-
 fs/hfsplus/inode.c       | 26 +++++++++++------
 fs/hfsplus/options.c     |  7 +++--
 fs/hfsplus/part_tbl.c    |  3 +-
 fs/hfsplus/super.c       | 34 +++++++++++++++-------
 fs/hfsplus/unicode.c     | 21 ++++++++++----
 fs/hfsplus/wrapper.c     | 15 ++++++----
 16 files changed, 260 insertions(+), 123 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index d182438c7ae4..5d799c13205f 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 		return -ENOMEM;
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
-	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
+	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
 	mutex_lock(&tree->tree_lock);
 	return 0;
 }
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
-	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
+	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
 	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ad57f5991eb1..1cad80c789cb 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -15,7 +15,8 @@
 
 #define PAGE_CACHE_BITS	(PAGE_CACHE_SIZE * 8)
 
-int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
+int hfsplus_block_allocate(struct super_block *sb, u32 size,
+		u32 offset, u32 *max)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct page *page;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index c8aa1659b838..cf7dc8dac766 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -212,7 +212,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 				dst_page--;
 			}
 			src -= len;
-			memmove(kmap(*dst_page) + src, kmap(*src_page) + src, len);
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, len);
 			kunmap(*src_page);
 			set_page_dirty(*dst_page);
 			kunmap(*dst_page);
@@ -250,14 +251,16 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 
 		if (src == dst) {
 			l = min(len, (int)PAGE_CACHE_SIZE - src);
-			memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l);
+			memmove(kmap(*dst_page) + src,
+				kmap(*src_page) + src, l);
 			kunmap(*src_page);
 			set_page_dirty(*dst_page);
 			kunmap(*dst_page);
 
 			while ((len -= l) != 0) {
 				l = min(len, (int)PAGE_CACHE_SIZE);
-				memmove(kmap(*++dst_page), kmap(*++src_page), l);
+				memmove(kmap(*++dst_page),
+					kmap(*++src_page), l);
 				kunmap(*src_page);
 				set_page_dirty(*dst_page);
 				kunmap(*dst_page);
@@ -268,7 +271,8 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			do {
 				src_ptr = kmap(*src_page) + src;
 				dst_ptr = kmap(*dst_page) + dst;
-				if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
+				if (PAGE_CACHE_SIZE - src <
+						PAGE_CACHE_SIZE - dst) {
 					l = PAGE_CACHE_SIZE - src;
 					src = 0;
 					dst += l;
@@ -340,7 +344,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->next = node->next;
 		cnid = cpu_to_be32(tmp->next);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, next), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_head = node->next;
@@ -351,7 +356,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 			return;
 		tmp->prev = node->prev;
 		cnid = cpu_to_be32(tmp->prev);
-		hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
+		hfs_bnode_write(tmp, &cnid,
+			offsetof(struct hfs_bnode_desc, prev), 4);
 		hfs_bnode_put(tmp);
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_tail = node->prev;
@@ -379,7 +385,9 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
@@ -402,7 +410,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		printk(KERN_ERR "hfs: request for non-existent node "
+				"%d in B*Tree\n",
+			cnid);
 		return NULL;
 	}
 
@@ -429,7 +439,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	} else {
 		spin_unlock(&tree->hash_lock);
 		kfree(node);
-		wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
+		wait_event(node2->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node2->flags));
 		return node2;
 	}
 	spin_unlock(&tree->hash_lock);
@@ -483,7 +494,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (node) {
 		hfs_bnode_get(node);
 		spin_unlock(&tree->hash_lock);
-		wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
+		wait_event(node->lock_wq,
+			!test_bit(HFS_BNODE_NEW, &node->flags));
 		if (test_bit(HFS_BNODE_ERROR, &node->flags))
 			goto node_error;
 		return node;
@@ -497,7 +509,8 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 	if (!test_bit(HFS_BNODE_NEW, &node->flags))
 		return node;
 
-	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset);
+	desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) +
+			node->page_offset);
 	node->prev = be32_to_cpu(desc->prev);
 	node->next = be32_to_cpu(desc->next);
 	node->num_recs = be16_to_cpu(desc->num_recs);
@@ -607,7 +620,8 @@ void hfs_bnode_get(struct hfs_bnode *node)
 	if (node) {
 		atomic_inc(&node->refcnt);
 		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 	}
 }
 
@@ -619,7 +633,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		int i;
 
 		dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 		BUG_ON(!atomic_read(&node->refcnt));
 		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 81f7e6e51d5a..2312de34bd42 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -39,7 +39,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
 		retval = node->tree->max_key_len + 2;
 	} else {
-		recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
+		recoff = hfs_bnode_read_u16(node,
+			node->tree->node_size - (rec + 1) * 2);
 		if (!recoff)
 			return 0;
 
@@ -84,7 +85,8 @@ again:
 	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
 	end_off = hfs_bnode_read_u16(node, end_rec_off);
 	end_rec_off -= 2;
-	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
 	if (size > end_rec_off - end_off) {
 		if (new_node)
 			panic("not enough room!\n");
@@ -99,7 +101,9 @@ again:
 	}
 	node->num_recs++;
 	/* write new last offset */
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 	hfs_bnode_write_u16(node, end_rec_off, end_off + size);
 	data_off = end_off;
 	data_rec_off = end_rec_off + 2;
@@ -151,7 +155,8 @@ skip:
 		if (tree->attributes & HFS_TREE_VARIDXKEYS)
 			key_len = be16_to_cpu(fd->search_key->key_len) + 2;
 		else {
-			fd->search_key->key_len = cpu_to_be16(tree->max_key_len);
+			fd->search_key->key_len =
+				cpu_to_be16(tree->max_key_len);
 			key_len = tree->max_key_len + 2;
 		}
 		goto again;
@@ -180,7 +185,8 @@ again:
 		mark_inode_dirty(tree->inode);
 	}
 	hfs_bnode_dump(node);
-	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
 	if (!--node->num_recs) {
 		hfs_bnode_unlink(node);
 		if (!node->parent)
@@ -194,7 +200,9 @@ again:
 		__hfs_brec_find(node, fd);
 		goto again;
 	}
-	hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
+	hfs_bnode_write_u16(node,
+		offsetof(struct hfs_bnode_desc, num_recs),
+		node->num_recs);
 
 	if (rec_off == end_off)
 		goto skip;
@@ -364,7 +372,8 @@ again:
 		newkeylen = hfs_bnode_read_u16(node, 14) + 2;
 	else
 		fd->keylength = newkeylen = tree->max_key_len + 2;
-	dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen);
+	dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
+		rec, fd->keylength, newkeylen);
 
 	rec_off = tree->node_size - (rec + 2) * 2;
 	end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -383,7 +392,8 @@ again:
 			parent = fd->bnode;
 			rec = fd->record;
 			rec_off = tree->node_size - (rec + 2) * 2;
-			end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
+			end_rec_off = tree->node_size -
+				(parent->num_recs + 1) * 2;
 		}
 	}
 
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 97556f932d57..21023d9f8ff3 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -51,7 +51,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_inode;
 
 	/* Load the header */
-	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+	head = (struct hfs_btree_header_rec *)(kmap(page) +
+		sizeof(struct hfs_bnode_desc));
 	tree->root = be32_to_cpu(head->root);
 	tree->leaf_count = be32_to_cpu(head->leaf_count);
 	tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -115,7 +116,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 
 	tree->node_size_shift = ffs(size) - 1;
 
-	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	tree->pages_per_bnode =
+		(tree->node_size + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT;
 
 	kunmap(page);
 	page_cache_release(page);
@@ -144,8 +147,10 @@ void hfs_btree_close(struct hfs_btree *tree)
 		while ((node = tree->node_hash[i])) {
 			tree->node_hash[i] = node->next_hash;
 			if (atomic_read(&node->refcnt))
-				printk(KERN_CRIT "hfs: node %d:%d still has %d user(s)!\n",
-					node->tree->cnid, node->this, atomic_read(&node->refcnt));
+				printk(KERN_CRIT "hfs: node %d:%d "
+						"still has %d user(s)!\n",
+					node->tree->cnid, node->this,
+					atomic_read(&node->refcnt));
 			hfs_bnode_free(node);
 			tree->node_hash_cnt--;
 		}
@@ -166,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree)
 		return;
 	/* Load the header */
 	page = node->page[0];
-	head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+	head = (struct hfs_btree_header_rec *)(kmap(page) +
+		sizeof(struct hfs_bnode_desc));
 
 	head->root = cpu_to_be32(tree->root);
 	head->leaf_count = cpu_to_be32(tree->leaf_count);
@@ -272,7 +278,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 						tree->free_nodes--;
 						mark_inode_dirty(tree->inode);
 						hfs_bnode_put(node);
-						return hfs_bnode_create(tree, idx);
+						return hfs_bnode_create(tree,
+							idx);
 					}
 				}
 			}
@@ -329,7 +336,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		hfs_bnode_put(node);
 		if (!i) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
+			printk(KERN_CRIT "hfs: unable to free bnode %u. "
+					"bmap not found!\n",
+				node->this);
 			return;
 		}
 		node = hfs_bnode_find(tree, i);
@@ -337,7 +346,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
 			return;
 		if (node->type != HFS_NODE_MAP) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
+			printk(KERN_CRIT "hfs: invalid bmap found! "
+					"(%u,%d)\n",
+				node->this, node->type);
 			hfs_bnode_put(node);
 			return;
 		}
@@ -350,7 +361,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
 	m = 1 << (~nidx & 7);
 	byte = data[off];
 	if (!(byte & m)) {
-		printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
+		printk(KERN_CRIT "hfs: trying to free free bnode "
+				"%u(%d)\n",
+			node->this, node->type);
 		kunmap(page);
 		hfs_bnode_put(node);
 		return;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 0aa40b36a9e7..b4ba1b319333 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -91,7 +91,8 @@ void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 		perms->dev = 0;
 }
 
-static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
+static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
+		u32 cnid, struct inode *inode)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 
@@ -128,20 +129,32 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
 		if (cnid == inode->i_ino) {
 			hfsplus_cat_set_perms(inode, &file->permissions);
 			if (S_ISLNK(inode->i_mode)) {
-				file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
-				file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
+				file->user_info.fdType =
+					cpu_to_be32(HFSP_SYMLINK_TYPE);
+				file->user_info.fdCreator =
+					cpu_to_be32(HFSP_SYMLINK_CREATOR);
 			} else {
-				file->user_info.fdType = cpu_to_be32(sbi->type);
-				file->user_info.fdCreator = cpu_to_be32(sbi->creator);
+				file->user_info.fdType =
+					cpu_to_be32(sbi->type);
+				file->user_info.fdCreator =
+					cpu_to_be32(sbi->creator);
 			}
-			if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
-				file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
+			if (HFSPLUS_FLG_IMMUTABLE &
+					(file->permissions.rootflags |
+					file->permissions.userflags))
+				file->flags |=
+					cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		} else {
-			file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
-			file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
-			file->user_info.fdFlags = cpu_to_be16(0x100);
-			file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
-			file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
+			file->user_info.fdType =
+				cpu_to_be32(HFSP_HARDLINK_TYPE);
+			file->user_info.fdCreator =
+				cpu_to_be32(HFSP_HFSPLUS_CREATOR);
+			file->user_info.fdFlags =
+				cpu_to_be16(0x100);
+			file->create_date =
+				HFSPLUS_I(sbi->hidden_dir)->create_date;
+			file->permissions.dev =
+				cpu_to_be32(HFSPLUS_I(inode)->linkid);
 		}
 		return sizeof(*file);
 	}
@@ -182,12 +195,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 		return -EIO;
 	}
 
-	hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
-				 &tmp.thread.nodeName);
+	hfsplus_cat_build_key_uni(fd->search_key,
+		be32_to_cpu(tmp.thread.parentID),
+		&tmp.thread.nodeName);
 	return hfs_brec_find(fd);
 }
 
-int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
+int hfsplus_create_cat(u32 cnid, struct inode *dir,
+		struct qstr *str, struct inode *inode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct hfs_find_data fd;
@@ -195,13 +210,15 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct ino
 	int entry_size;
 	int err;
 
-	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
+	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
+		str->name, cnid, inode->i_nlink);
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-	entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
+	entry_size = hfsplus_fill_cat_thread(sb, &entry,
+		S_ISDIR(inode->i_mode) ?
 			HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
-			dir->i_ino, str);
+		dir->i_ino, str);
 	err = hfs_brec_find(&fd);
 	if (err != -ENOENT) {
 		if (!err)
@@ -250,7 +267,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	int err, off;
 	u16 type;
 
-	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
+		str ? str->name : NULL, cnid);
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 
 	if (!str) {
@@ -261,11 +279,15 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		if (err)
 			goto out;
 
-		off = fd.entryoffset + offsetof(struct hfsplus_cat_thread, nodeName);
+		off = fd.entryoffset +
+			offsetof(struct hfsplus_cat_thread, nodeName);
 		fd.search_key->cat.parent = cpu_to_be32(dir->i_ino);
-		hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.length, off, 2);
+		hfs_bnode_read(fd.bnode,
+			&fd.search_key->cat.name.length, off, 2);
 		len = be16_to_cpu(fd.search_key->cat.name.length) * 2;
-		hfs_bnode_read(fd.bnode, &fd.search_key->cat.name.unicode, off + 2, len);
+		hfs_bnode_read(fd.bnode,
+			&fd.search_key->cat.name.unicode,
+			off + 2, len);
 		fd.search_key->key_len = cpu_to_be16(6 + len);
 	} else
 		hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
@@ -282,7 +304,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_DATA);
 #endif
 
-		off = fd.entryoffset + offsetof(struct hfsplus_cat_file, rsrc_fork);
+		off = fd.entryoffset +
+			offsetof(struct hfsplus_cat_file, rsrc_fork);
 		hfs_bnode_read(fd.bnode, &fork, off, sizeof(fork));
 		hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
 	}
@@ -326,7 +349,8 @@ int hfsplus_rename_cat(u32 cnid,
 	int entry_size, type;
 	int err = 0;
 
-	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
+	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+		cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
 	dst_fd = src_fd;
@@ -378,7 +402,8 @@ int hfsplus_rename_cat(u32 cnid,
 
 	/* create new thread entry */
 	hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
-	entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name);
+	entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
+		dst_dir->i_ino, dst_name);
 	err = hfs_brec_find(&dst_fd);
 	if (err != -ENOENT) {
 		if (!err)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index e44c78a837e8..c423942793ee 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -66,11 +66,17 @@ again:
 			goto fail;
 		}
 		cnid = be32_to_cpu(entry.file.id);
-		if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
-		    entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-		    (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
-		     entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
-		    HFSPLUS_SB(sb)->hidden_dir) {
+		if (entry.file.user_info.fdType ==
+				cpu_to_be32(HFSP_HARDLINK_TYPE) &&
+				entry.file.user_info.fdCreator ==
+				cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
+				(entry.file.create_date ==
+					HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
+						create_date ||
+				entry.file.create_date ==
+					HFSPLUS_I(sb->s_root->d_inode)->
+						create_date) &&
+				HFSPLUS_SB(sb)->hidden_dir) {
 			struct qstr str;
 			char name[32];
 
@@ -83,11 +89,13 @@ again:
 				linkid = 0;
 			} else {
 				dentry->d_fsdata = (void *)(unsigned long)cnid;
-				linkid = be32_to_cpu(entry.file.permissions.dev);
+				linkid =
+					be32_to_cpu(entry.file.permissions.dev);
 				str.len = sprintf(name, "iNode%d", linkid);
 				str.name = name;
 				hfsplus_cat_build_key(sb, fd.search_key,
-					HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
+					HFSPLUS_SB(sb)->hidden_dir->i_ino,
+					&str);
 				goto again;
 			}
 		} else if (!dentry->d_fsdata)
@@ -139,7 +147,8 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		filp->f_pos++;
 		/* fall through */
 	case 1:
-		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+			fd.entrylength);
 		if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
 			printk(KERN_ERR "hfs: bad catalog folder thread\n");
 			err = -EIO;
@@ -169,14 +178,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
+		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
+			fd.entrylength);
 		type = be16_to_cpu(entry.type);
 		len = HFSPLUS_MAX_STRLEN;
 		err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
 		if (err)
 			goto out;
 		if (type == HFSPLUS_FOLDER) {
-			if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
+			if (fd.entrylength <
+					sizeof(struct hfsplus_cat_folder)) {
 				printk(KERN_ERR "hfs: small dir entry\n");
 				err = -EIO;
 				goto out;
@@ -273,7 +284,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 		HFSPLUS_I(inode)->linkid = id;
 		cnid = sbi->next_cnid++;
 		src_dentry->d_fsdata = (void *)(unsigned long)cnid;
-		res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
+		res = hfsplus_create_cat(cnid, src_dir,
+			&src_dentry->d_name, inode);
 		if (res)
 			/* panic? */
 			goto out;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index b1127ef26750..1ae2fe5cfa74 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,8 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 	return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
 }
 
-static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+static void __hfsplus_ext_write_extent(struct inode *inode,
+		struct hfs_find_data *fd)
 {
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
@@ -152,11 +153,13 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 		return -ENOENT;
 	if (fd->entrylength != sizeof(hfsplus_extent_rec))
 		return -EIO;
-	hfs_bnode_read(fd->bnode, extent, fd->entryoffset, sizeof(hfsplus_extent_rec));
+	hfs_bnode_read(fd->bnode, extent, fd->entryoffset,
+		sizeof(hfsplus_extent_rec));
 	return 0;
 }
 
-static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
+static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
+		struct inode *inode, u32 block)
 {
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	int res;
@@ -172,7 +175,8 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct in
 						HFSPLUS_TYPE_DATA);
 	if (!res) {
 		hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
-		hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
+		hip->cached_blocks =
+			hfsplus_ext_block_count(hip->cached_extents);
 	} else {
 		hip->cached_start = hip->cached_blocks = 0;
 		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
@@ -249,9 +253,12 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	mutex_unlock(&hip->extents_lock);
 
 done:
-	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
+	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
+		inode->i_ino, (long long)iblock, dblock);
 	mask = (1 << sbi->fs_shift) - 1;
-	map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
+	map_bh(bh_result, sb,
+		(dblock << sbi->fs_shift) + sbi->blockoffset +
+			(iblock & mask));
 	if (create) {
 		set_buffer_new(bh_result);
 		hip->phys_size += sb->s_blocksize;
@@ -342,7 +349,8 @@ found:
 	}
 }
 
-int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw *fork, int type)
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+		struct hfsplus_fork_raw *fork, int type)
 {
 	struct hfs_find_data fd;
 	hfsplus_extent_rec ext_entry;
@@ -497,7 +505,8 @@ void hfsplus_file_truncate(struct inode *inode)
 						&page, &fsdata);
 		if (res)
 			return;
-		res = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+		res = pagecache_write_end(NULL, mapping, size,
+			0, 0, page, fsdata);
 		if (res < 0)
 			return;
 		mark_inode_dirty(inode);
@@ -543,7 +552,8 @@ void hfsplus_file_truncate(struct inode *inode)
 	hip->alloc_blocks = blk_cnt;
 out:
 	hip->phys_size = inode->i_size;
-	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
+		sb->s_blocksize_bits;
 	inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
 	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
 }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 17ac254e72cb..c4abec167826 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -37,7 +37,8 @@
 #define HFSPLUS_TYPE_DATA 0x00
 #define HFSPLUS_TYPE_RSRC 0xFF
 
-typedef int (*btree_keycmp)(const hfsplus_btree_key *, const hfsplus_btree_key *);
+typedef int (*btree_keycmp)(const hfsplus_btree_key *,
+		const hfsplus_btree_key *);
 
 #define NODE_HASH_SIZE	256
 
@@ -352,9 +353,12 @@ int hfs_brec_read(struct hfs_find_data *, void *, int);
 int hfs_brec_goto(struct hfs_find_data *, int);
 
 /* catalog.c */
-int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
-int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
-void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *, u32, struct qstr *);
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
+		const hfsplus_btree_key *);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
+		const hfsplus_btree_key *);
+void hfsplus_cat_build_key(struct super_block *sb,
+		hfsplus_btree_key *, u32, struct qstr *);
 int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
 int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
 int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
@@ -370,7 +374,8 @@ extern const struct file_operations hfsplus_dir_operations;
 int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
 void hfsplus_ext_write_extent(struct inode *);
 int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
-int hfsplus_free_fork(struct super_block *, u32, struct hfsplus_fork_raw *, int);
+int hfsplus_free_fork(struct super_block *, u32,
+		struct hfsplus_fork_raw *, int);
 int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
 
@@ -411,12 +416,17 @@ extern u16 hfsplus_decompose_table[];
 extern u16 hfsplus_compose_table[];
 
 /* unicode.c */
-int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
-int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
-int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
-int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
+int hfsplus_strcasecmp(const struct hfsplus_unistr *,
+		const struct hfsplus_unistr *);
+int hfsplus_strcmp(const struct hfsplus_unistr *,
+		const struct hfsplus_unistr *);
+int hfsplus_uni2asc(struct super_block *,
+		const struct hfsplus_unistr *, char *, int *);
+int hfsplus_asc2uni(struct super_block *,
+		struct hfsplus_unistr *, const char *, int);
 int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
+int hfsplus_compare_dentry(struct dentry *dentry,
+		struct qstr *s1, struct qstr *s2);
 
 /* wrapper.c */
 int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 6892899fd6fb..927cdd6d5bf5 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -36,7 +36,8 @@
 #define HFSP_WRAPOFF_EMBEDSIG     0x7C
 #define HFSP_WRAPOFF_EMBEDEXT     0x7E
 
-#define HFSP_HIDDENDIR_NAME	"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
+#define HFSP_HIDDENDIR_NAME \
+	"\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data"
 
 #define HFSP_HARDLINK_TYPE	0x686c6e6b	/* 'hlnk' */
 #define HFSP_HFSPLUS_CREATOR	0x6866732b	/* 'hfs+' */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index bda7464c205d..8d7350e5ce0d 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -78,7 +78,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 	if (!tree)
 		return 0;
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
-		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+		nidx = page->index >>
+			(tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
 		node = hfs_bnode_findhash(tree, nidx);
 		if (!node)
@@ -91,7 +92,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 		}
 		spin_unlock(&tree->hash_lock);
 	} else {
-		nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+		nidx = page->index <<
+			(PAGE_CACHE_SHIFT - tree->node_size_shift);
 		i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
 		spin_lock(&tree->hash_lock);
 		do {
@@ -167,8 +169,8 @@ const struct dentry_operations hfsplus_dentry_operations = {
 	.d_compare    = hfsplus_compare_dentry,
 };
 
-static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dentry,
-					  struct nameidata *nd)
+static struct dentry *hfsplus_file_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
 {
 	struct hfs_find_data fd;
 	struct super_block *sb = dir->i_sb;
@@ -222,7 +224,8 @@ out:
 	return NULL;
 }
 
-static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
+static void hfsplus_get_perms(struct inode *inode,
+		struct hfsplus_perm *perms, int dir)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
 	u16 mode;
@@ -324,7 +327,8 @@ int hfsplus_file_fsync(struct file *file, int datasync)
 		error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping);
 
 	if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) {
-		error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
+		error2 =
+			filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
 		if (!error)
 			error = error2;
 	}
@@ -472,7 +476,8 @@ void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
 	}
 }
 
-void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
+void hfsplus_inode_write_fork(struct inode *inode,
+		struct hfsplus_fork_raw *fork)
 {
 	memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
 	       sizeof(hfsplus_extent_rec));
@@ -520,7 +525,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_nlink = 1;
 		if (S_ISREG(inode->i_mode)) {
 			if (file->permissions.dev)
-				inode->i_nlink = be32_to_cpu(file->permissions.dev);
+				inode->i_nlink =
+					be32_to_cpu(file->permissions.dev);
 			inode->i_op = &hfsplus_file_inode_operations;
 			inode->i_fop = &hfsplus_file_operations;
 			inode->i_mapping->a_ops = &hfsplus_aops;
@@ -593,7 +599,9 @@ int hfsplus_cat_write_inode(struct inode *inode)
 					sizeof(struct hfsplus_cat_file));
 		hfsplus_inode_write_fork(inode, &file->data_fork);
 		hfsplus_cat_set_perms(inode, &file->permissions);
-		if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
+		if (HFSPLUS_FLG_IMMUTABLE &
+				(file->permissions.rootflags |
+					file->permissions.userflags))
 			file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
 		else
 			file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index dbd9d0c426cb..bb62a5882147 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -165,7 +165,9 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 			if (p)
 				sbi->nls = load_nls(p);
 			if (!sbi->nls) {
-				printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p);
+				printk(KERN_ERR "hfs: unable to load "
+						"nls mapping \"%s\"\n",
+					p);
 				kfree(p);
 				return 0;
 			}
@@ -212,7 +214,8 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
 	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
 		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
-	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid);
+	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
+		sbi->uid, sbi->gid);
 	if (sbi->part >= 0)
 		seq_printf(seq, ",part=%u", sbi->part);
 	if (sbi->session >= 0)
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 58918222a604..79eea8efad8e 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -2,7 +2,8 @@
  * linux/fs/hfsplus/part_tbl.c
  *
  * Copyright (C) 1996-1997  Paul H. Hargrove
- * This file may be distributed under the terms of the GNU General Public License.
+ * This file may be distributed under the terms of
+ * the GNU General Public License.
  *
  * Original code to handle the new style Mac partition table based on
  * a patch contributed by Holger Schemel (aeglos@valinor.owl.de).
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 154478c71f25..3c9f30e9cd36 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -290,18 +290,25 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			return -EINVAL;
 
 		if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-			printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
-			       "running fsck.hfsplus is recommended.  leaving read-only.\n");
+			printk(KERN_WARNING "hfs: filesystem was "
+					"not cleanly unmounted, "
+					"running fsck.hfsplus is recommended.  "
+					"leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		} else if (force) {
 			/* nothing */
-		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
-			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
+		} else if (vhdr->attributes &
+				cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
+			printk(KERN_WARNING "hfs: filesystem is marked locked, "
+					"leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
-		} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
-			printk(KERN_WARNING "hfs: filesystem is marked journaled, leaving read-only.\n");
+		} else if (vhdr->attributes &
+				cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+			printk(KERN_WARNING "hfs: filesystem is "
+					"marked journaled, "
+					"leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		}
@@ -391,17 +398,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-		printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
-		       "running fsck.hfsplus is recommended.  mounting read-only.\n");
+		printk(KERN_WARNING "hfs: Filesystem was "
+				"not cleanly unmounted, "
+				"running fsck.hfsplus is recommended.  "
+				"mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
-		printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
-		       "use the force option at your own risk, mounting read-only.\n");
+	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
+			!(sb->s_flags & MS_RDONLY)) {
+		printk(KERN_WARNING "hfs: write access to "
+				"a journaled filesystem is not supported, "
+				"use the force option at your own risk, "
+				"mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b66d67de882c..15703e81174e 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -118,7 +118,9 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
 	return NULL;
 }
 
-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
+int hfsplus_uni2asc(struct super_block *sb,
+		const struct hfsplus_unistr *ustr,
+		char *astr, int *len_p)
 {
 	const hfsplus_unichr *ip;
 	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
@@ -171,7 +173,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 				goto same;
 			c1 = be16_to_cpu(*ip);
 			if (likely(compose))
-				ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c1);
+				ce1 = hfsplus_compose_lookup(
+					hfsplus_compose_table, c1);
 			if (ce1)
 				break;
 			switch (c0) {
@@ -199,7 +202,8 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
 		if (ce2) {
 			i = 1;
 			while (i < ustrlen) {
-				ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i]));
+				ce1 = hfsplus_compose_lookup(ce2,
+					be16_to_cpu(ip[i]));
 				if (!ce1)
 					break;
 				i++;
@@ -363,7 +367,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfsplus_compare_dentry(struct dentry *dentry,
+		struct qstr *s1, struct qstr *s2)
 {
 	struct super_block *sb = dentry->d_sb;
 	int casefold, decompose, size;
@@ -388,7 +393,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 			astr1 += size;
 			len1 -= size;
 
-			if (!decompose || !(dstr1 = decompose_unichar(c, &dsize1))) {
+			if (decompose)
+				dstr1 = decompose_unichar(c, &dsize1);
+			if (!decompose || !dstr1) {
 				c1 = c;
 				dstr1 = &c1;
 				dsize1 = 1;
@@ -400,7 +407,9 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 			astr2 += size;
 			len2 -= size;
 
-			if (!decompose || !(dstr2 = decompose_unichar(c, &dsize2))) {
+			if (decompose)
+				dstr2 = decompose_unichar(c, &dsize2);
+			if (!decompose || !dstr2) {
 				c2 = c;
 				dstr2 = &c2;
 				dsize2 = 1;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 15e0eabb489e..196231794f64 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -74,12 +74,14 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
 	   !(attrib & HFSP_WRAP_ATTRIB_SPARED))
 		return 0;
 
-	wd->ablk_size = be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
+	wd->ablk_size =
+		be32_to_cpu(*(__be32 *)(bufptr + HFSP_WRAPOFF_ABLKSIZE));
 	if (wd->ablk_size < HFSPLUS_SECTOR_SIZE)
 		return 0;
 	if (wd->ablk_size % HFSPLUS_SECTOR_SIZE)
 		return 0;
-	wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
+	wd->ablk_start =
+		be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART));
 
 	extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT);
 	wd->embed_start = (extent >> 16) & 0xFFFF;
@@ -102,7 +104,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 	if (HFSPLUS_SB(sb)->session >= 0) {
 		te.cdte_track = HFSPLUS_SB(sb)->session;
 		te.cdte_format = CDROM_LBA;
-		res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
+		res = ioctl_by_bdev(sb->s_bdev,
+			CDROMREADTOCENTRY, (unsigned long)&te);
 		if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
 			*start = (sector_t)te.cdte_addr.lba << 2;
 			return 0;
@@ -111,7 +114,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
 		return -EINVAL;
 	}
 	ms_info.addr_format = CDROM_LBA;
-	res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
+	res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION,
+		(unsigned long)&ms_info);
 	if (!res && ms_info.xa_flag)
 		*start = (sector_t)ms_info.addr.lba << 2;
 	return 0;
@@ -212,7 +216,8 @@ reread:
 		blocksize >>= 1;
 
 	if (sb_set_blocksize(sb, blocksize) != blocksize) {
-		printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", blocksize);
+		printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
+			blocksize);
 		goto out_free_backup_vhdr;
 	}
 
-- 
cgit v1.2.2


From 21f2296a598c4089e0a9bdf54634269ac913a693 Mon Sep 17 00:00:00 2001
From: Anton Salikhmetov <alexo@tuxera.com>
Date: Thu, 16 Dec 2010 18:08:39 +0200
Subject: hfsplus: C99 comments clean-up

Match coding style restriction against C99 comments where
checkpatch.pl reported errors about their usage.

Signed-off-by: Anton Salikhmetov <alexo@tuxera.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bnode.c      | 18 ++++++++++--------
 fs/hfsplus/extents.c    |  2 +-
 fs/hfsplus/hfsplus_fs.h |  9 +++++----
 3 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index cf7dc8dac766..c8f4ea4bb5ed 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -42,7 +42,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 {
 	__be16 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 2);
 	return be16_to_cpu(data);
 }
@@ -50,7 +50,7 @@ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
 u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
 {
 	u8 data;
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_read(node, &data, off, 1);
 	return data;
 }
@@ -96,7 +96,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
 {
 	__be16 v = cpu_to_be16(data);
-	// optimize later...
+	/* TODO: optimize later... */
 	hfs_bnode_write(node, &v, off, 2);
 }
 
@@ -362,7 +362,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 	} else if (node->type == HFS_NODE_LEAF)
 		tree->leaf_tail = node->prev;
 
-	// move down?
+	/* move down? */
 	if (!node->prev && !node->next) {
 		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
 	}
@@ -569,11 +569,13 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-	//int i;
+#if 0
+	int i;
 
-	//for (i = 0; i < node->tree->pages_per_bnode; i++)
-	//	if (node->page[i])
-	//		page_cache_release(node->page[i]);
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
+#endif
 	kfree(node);
 }
 
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 1ae2fe5cfa74..4b693b5b6230 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -399,7 +399,7 @@ int hfsplus_file_extend(struct inode *inode)
 
 	if (sbi->alloc_file->i_size * 8 <
 	    sbi->total_blocks - sbi->free_blocks + 8) {
-		// extend alloc file
+		/* extend alloc file */
 		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
 				sbi->alloc_file->i_size * 8,
 				sbi->total_blocks, sbi->free_blocks);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c4abec167826..a15e1c11e820 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -23,9 +23,11 @@
 #define DBG_EXTENT	0x00000020
 #define DBG_BITMAP	0x00000040
 
-//#define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
-//#define DBG_MASK	(DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
-//#define DBG_MASK	(DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#if 0
+#define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
+#define DBG_MASK	(DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE)
+#define DBG_MASK	(DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
+#endif
 #define DBG_MASK	(0)
 
 #define dprint(flg, fmt, args...) \
@@ -62,7 +64,6 @@ struct hfs_btree {
 	unsigned int max_key_len;
 	unsigned int depth;
 
-	//unsigned int map1_size, map_size;
 	struct mutex tree_lock;
 
 	unsigned int pages_per_bnode;
-- 
cgit v1.2.2


From 20b7643d8ee44254fc972d42655bace81e7ab50a Mon Sep 17 00:00:00 2001
From: Anton Salikhmetov <alexo@tuxera.com>
Date: Thu, 16 Dec 2010 18:08:40 +0200
Subject: hfsplus: spaces/indentation clean-up

Fix incorrect spaces and indentation reported by checkpatch.pl.

Signed-off-by: Anton Salikhmetov <alexo@tuxera.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/dir.c      |  2 +-
 fs/hfsplus/inode.c    |  2 +-
 fs/hfsplus/part_tbl.c |  4 ++--
 fs/hfsplus/unicode.c  | 20 ++++++++++----------
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index c423942793ee..f611d55c9f5e 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -213,7 +213,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			err = -EIO;
 			goto out;
 		}
-	next:
+next:
 		filp->f_pos++;
 		if (filp->f_pos >= inode->i_size)
 			goto out;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8d7350e5ce0d..a8df651747f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -355,7 +355,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
 };
 
 static const struct file_operations hfsplus_file_operations = {
-	.llseek 	= generic_file_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.write		= do_sync_write,
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 79eea8efad8e..d66ad113b1cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -60,7 +60,7 @@ struct new_pmap {
  */
 struct old_pmap {
 	__be16		pdSig;	/* Signature bytes */
-	struct 	old_pmap_entry {
+	struct old_pmap_entry {
 		__be32	pdStart;
 		__be32	pdSize;
 		__be32	pdFSID;
@@ -97,7 +97,7 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm,
 	int i = 0;
 
 	do {
-		if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
+		if (!memcmp(pm->pmPartType, "Apple_HFS", 9) &&
 		    (sbi->part < 0 || sbi->part == i)) {
 			*part_start += be32_to_cpu(pm->pmPyPartStart);
 			*part_size = be32_to_cpu(pm->pmPartBlkCnt);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 15703e81174e..7dd90a540546 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -17,14 +17,14 @@
 /* Returns folded char, or 0 if ignorable */
 static inline u16 case_fold(u16 c)
 {
-        u16 tmp;
-
-        tmp = hfsplus_case_fold_table[c >> 8];
-        if (tmp)
-                tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
-        else
-                tmp = c;
-        return tmp;
+	u16 tmp;
+
+	tmp = hfsplus_case_fold_table[c >> 8];
+	if (tmp)
+		tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
+	else
+		tmp = c;
+	return tmp;
 }
 
 /* Compare unicode strings, return values like normal strcmp */
@@ -215,7 +215,7 @@ int hfsplus_uni2asc(struct super_block *sb,
 				goto done;
 			}
 		}
-	same:
+same:
 		switch (c0) {
 		case 0:
 			cc = 0x2400;
@@ -226,7 +226,7 @@ int hfsplus_uni2asc(struct super_block *sb,
 		default:
 			cc = c0;
 		}
-	done:
+done:
 		res = nls->uni2char(cc, op, len);
 		if (res < 0) {
 			if (res == -ENAMETOOLONG)
-- 
cgit v1.2.2


From b2837fcf4994e699a4def002e26f274d95b387c1 Mon Sep 17 00:00:00 2001
From: Anton Salikhmetov <alexo@tuxera.com>
Date: Thu, 16 Dec 2010 18:08:41 +0200
Subject: hfsplus: %L-to-%ll, macro correction, and remove unneeded braces

Clean-up based on checkpatch.pl report against unnecessary braces
(`{' and `}'), non-standard format option %Lu (%llu recommended)
as well as one trailing statement in a macro definition which
should have been on the next line.

Signed-off-by: Anton Salikhmetov <alexo@tuxera.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/bnode.c      |  9 +++------
 fs/hfsplus/extents.c    | 16 +++++++++-------
 fs/hfsplus/hfsplus_fs.h |  3 ++-
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index c8f4ea4bb5ed..1c42cc5b899f 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -363,9 +363,8 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 		tree->leaf_tail = node->prev;
 
 	/* move down? */
-	if (!node->prev && !node->next) {
+	if (!node->prev && !node->next)
 		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
-	}
 	if (!node->parent) {
 		tree->root = 0;
 		tree->depth = 0;
@@ -392,11 +391,9 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	}
 
 	for (node = tree->node_hash[hfs_bnode_hash(cnid)];
-	     node; node = node->next_hash) {
-		if (node->this == cnid) {
+			node; node = node->next_hash)
+		if (node->this == cnid)
 			return node;
-		}
-	}
 	return NULL;
 }
 
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 4b693b5b6230..52a0bcaa7b6d 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,12 +397,13 @@ int hfsplus_file_extend(struct inode *inode)
 	u32 start, len, goal;
 	int res;
 
-	if (sbi->alloc_file->i_size * 8 <
-	    sbi->total_blocks - sbi->free_blocks + 8) {
+	if (sbi->total_blocks - sbi->free_blocks + 8 >
+			sbi->alloc_file->i_size * 8) {
 		/* extend alloc file */
-		printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
-				sbi->alloc_file->i_size * 8,
-				sbi->total_blocks, sbi->free_blocks);
+		printk(KERN_ERR "hfs: extend alloc file! "
+				"(%llu,%u,%u)\n",
+			sbi->alloc_file->i_size * 8,
+			sbi->total_blocks, sbi->free_blocks);
 		return -ENOSPC;
 	}
 
@@ -490,8 +491,9 @@ void hfsplus_file_truncate(struct inode *inode)
 	u32 alloc_cnt, blk_cnt, start;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
-		inode->i_ino, (long long)hip->phys_size, inode->i_size);
+	dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
+		inode->i_ino, (long long)hip->phys_size,
+		inode->i_size);
 
 	if (inode->i_size > hip->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a15e1c11e820..f7cbdf89ac9b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -31,7 +31,8 @@
 #define DBG_MASK	(0)
 
 #define dprint(flg, fmt, args...) \
-	if (flg & DBG_MASK) printk(fmt , ## args)
+	if (flg & DBG_MASK) \
+		printk(fmt , ## args)
 
 /* Runtime config options */
 #define HFSPLUS_DEF_CR_TYPE    0x3F3F3F3F  /* '????' */
-- 
cgit v1.2.2


From b7b8de087384cc1954a8cd075af3f9e5977caa2e Mon Sep 17 00:00:00 2001
From: Werner Fink <werner@suse.de>
Date: Fri, 3 Dec 2010 12:48:23 +0100
Subject: TTY: Add tty ioctl to figure device node of the system console.

This has been in the SuSE kernels for a very long time.

Signed-off-by: Werner Fink <werner@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/compat_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa1..a1a8f0c6735f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -837,6 +837,7 @@ COMPATIBLE_IOCTL(TCSETSW)
 COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
+COMPATIBLE_IOCTL(TIOCGDEV)
 COMPATIBLE_IOCTL(TIOCCBRK)
 COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
-- 
cgit v1.2.2


From c7b92516a9c68fa5403879225a5a19974a801ef6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Mon, 6 Dec 2010 11:16:28 -0600
Subject: fs: Use this_cpu_xx operations in buffer.c

Optimize various per cpu area operations through these new percpu
operations.  These operations avoid address calculations through the
use of segment prefixes and multiple memory references through RMW
instructions etc.

Reduces code size:

Before:

christoph@linux-2.6$ size fs/buffer.o
   text	   data	    bss	    dec	    hex	filename
  19169	     80	     28	  19277	   4b4d	fs/buffer.o

After:

christoph@linux-2.6$ size fs/buffer.o
   text	   data	    bss	    dec	    hex	filename
  19138	     80	     28	  19246	   4b2e	fs/buffer.o

V3->V4:
	- Move the use of this_cpu_inc_return into a later patch so that
	  this one can go in without percpu infrastructure changes.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/buffer.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 5930e382959b..137d9de00e24 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
 static void bh_lru_install(struct buffer_head *bh)
 {
 	struct buffer_head *evictee = NULL;
-	struct bh_lru *lru;
 
 	check_irqs_on();
 	bh_lru_lock();
-	lru = &__get_cpu_var(bh_lrus);
-	if (lru->bhs[0] != bh) {
+	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
 		struct buffer_head *bhs[BH_LRU_SIZE];
 		int in;
 		int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
 		get_bh(bh);
 		bhs[out++] = bh;
 		for (in = 0; in < BH_LRU_SIZE; in++) {
-			struct buffer_head *bh2 = lru->bhs[in];
+			struct buffer_head *bh2 =
+				__this_cpu_read(bh_lrus.bhs[in]);
 
 			if (bh2 == bh) {
 				__brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
 		}
 		while (out < BH_LRU_SIZE)
 			bhs[out++] = NULL;
-		memcpy(lru->bhs, bhs, sizeof(bhs));
+		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
 	}
 	bh_lru_unlock();
 
@@ -1313,23 +1312,22 @@ static struct buffer_head *
 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 {
 	struct buffer_head *ret = NULL;
-	struct bh_lru *lru;
 	unsigned int i;
 
 	check_irqs_on();
 	bh_lru_lock();
-	lru = &__get_cpu_var(bh_lrus);
 	for (i = 0; i < BH_LRU_SIZE; i++) {
-		struct buffer_head *bh = lru->bhs[i];
+		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
 
 		if (bh && bh->b_bdev == bdev &&
 				bh->b_blocknr == block && bh->b_size == size) {
 			if (i) {
 				while (i) {
-					lru->bhs[i] = lru->bhs[i - 1];
+					__this_cpu_write(bh_lrus.bhs[i],
+						__this_cpu_read(bh_lrus.bhs[i - 1]));
 					i--;
 				}
-				lru->bhs[0] = bh;
+				__this_cpu_write(bh_lrus.bhs[0], bh);
 			}
 			get_bh(bh);
 			ret = bh;
@@ -3205,20 +3203,21 @@ static void recalc_bh_state(void)
 
 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
 		return;
-	__get_cpu_var(bh_accounting).ratelimit = 0;
+	__this_cpu_write(bh_accounting.ratelimit, 0);
 	for_each_online_cpu(i)
 		tot += per_cpu(bh_accounting, i).nr;
 	buffer_heads_over_limit = (tot > max_buffer_heads);
 }
-	
+
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
-		get_cpu_var(bh_accounting).nr++;
+		preempt_disable();
+		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
-		put_cpu_var(bh_accounting);
+		preempt_enable();
 	}
 	return ret;
 }
@@ -3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
 	kmem_cache_free(bh_cachep, bh);
-	get_cpu_var(bh_accounting).nr--;
+	preempt_disable();
+	__this_cpu_dec(bh_accounting.nr);
 	recalc_bh_state();
-	put_cpu_var(bh_accounting);
+	preempt_enable();
 }
 EXPORT_SYMBOL(free_buffer_head);
 
@@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
 		brelse(b->bhs[i]);
 		b->bhs[i] = NULL;
 	}
-	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
+	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
 	per_cpu(bh_accounting, cpu).nr = 0;
-	put_cpu_var(bh_accounting);
 }
 
 static int buffer_cpu_notify(struct notifier_block *self,
-- 
cgit v1.2.2


From ee1be8626355e6a1f3f8c44e2351ff2661c5998d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Mon, 6 Dec 2010 11:40:05 -0600
Subject: fs: Use this_cpu_inc_return in buffer.c

__this_cpu_inc can create a single instruction with the same effect
as the _get_cpu_var(..)++ construct in buffer.c.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 137d9de00e24..2219a76e2caf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3201,7 +3201,7 @@ static void recalc_bh_state(void)
 	int i;
 	int tot = 0;
 
-	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
+	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
 		return;
 	__this_cpu_write(bh_accounting.ratelimit, 0);
 	for_each_online_cpu(i)
-- 
cgit v1.2.2


From 9b00a8182987e8b7028d97c2bee3319ef383b57a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 Dec 2010 15:59:06 +0100
Subject: ocfs2: don't use flush_scheduled_work()

flush_scheduled_work() is deprecated and scheduled to be removed.

* cancel_delayed_work() + flush_schedule_work() ->
  cancel_delayed_work_sync().

* flush qs->qs_work directly on exit instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Joel Becker <joel.becker@oracle.com>
Cc: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/heartbeat.c | 3 +--
 fs/ocfs2/cluster/quorum.c    | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e25..892e2de49903 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -307,8 +307,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 
 static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
 {
-	cancel_delayed_work(&reg->hr_write_timeout_work);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&reg->hr_write_timeout_work);
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e16696216..a87366750f23 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
 
 void o2quo_exit(void)
 {
-	flush_scheduled_work();
+	struct o2quo_state *qs = &o2quo_state;
+
+	flush_work_sync(&qs->qs_work);
 }
-- 
cgit v1.2.2


From 5d8e4bddc635dd61ab8b3bcb75c59934e9c1e19f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 Dec 2010 15:59:06 +0100
Subject: ncpfs: don't use flush_scheduled_work()

flush_scheduled_work() is deprecated and scheduled to be removed.
Directly flush the used works on stop instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Petr Vandrovec <petr@vandrovec.name>
---
 fs/ncpfs/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8fb93b604e73..31532330d016 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -309,7 +309,12 @@ static void ncp_stop_tasks(struct ncp_server *server) {
 	sk->sk_write_space  = server->write_space;
 	release_sock(sk);
 	del_timer_sync(&server->timeout_tm);
-	flush_scheduled_work();
+
+	flush_work_sync(&server->rcv.tq);
+	if (sk->sk_socket->type == SOCK_STREAM)
+		flush_work_sync(&server->tx.tq);
+	else
+		flush_work_sync(&server->timeout_tq);
 }
 
 static int  ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
-- 
cgit v1.2.2


From 88ae7624a6fe890e5a8ca57b25420f66e1389f8b Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Mon, 27 Dec 2010 07:47:54 -0300
Subject: [media] V4L1 removal: Remove linux/videodev.h

There's no sense on keeping it on 2.6.38, as nobody is using it
anymore, at the kernel tree, and installing it at the userspace
API.

As two deprecated drivers still need it, move it to their internal
directories.

Reviewed-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 fs/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a60579b007b0..175736c282ab 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -42,7 +42,7 @@
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
-#include <linux/videodev.h>
+#include <linux/videodev2.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/blkdev.h>
-- 
cgit v1.2.2


From d3a23e1678a5827c38ed8a465ad91d65e59fa911 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 5 Jan 2011 20:01:21 +1100
Subject: Revert "fs: use RCU read side protection in d_validate"

This reverts commit 3825bdb7ed920845961f32f364454bee5f469abb.

You cannot dget() a dentry without having a reference, or holding
a lock that guarantees it remains valid.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9d4e6d..cc2b93802179 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1491,26 +1491,33 @@ out:
  * This is used by ncpfs in its readdir implementation.
  * Zero is returned in the dentry is invalid.
  */
-int d_validate(struct dentry *dentry, struct dentry *parent)
+ 
+int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-	struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
-	struct hlist_node *node;
-	struct dentry *d;
+	struct hlist_head *base;
+	struct hlist_node *lhp;
 
 	/* Check whether the ptr might be valid at all.. */
 	if (!kmem_ptr_validate(dentry_cache, dentry))
-		return 0;
-	if (dentry->d_parent != parent)
-		return 0;
+		goto out;
 
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, node, head, d_hash) {
-		if (d == dentry) {
-			dget(dentry);
+	if (dentry->d_parent != dparent)
+		goto out;
+
+	spin_lock(&dcache_lock);
+	base = d_hash(dparent, dentry->d_name.hash);
+	hlist_for_each(lhp,base) { 
+		/* hlist_for_each_entry_rcu() not required for d_hash list
+		 * as it is parsed under dcache_lock
+		 */
+		if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
+			__dget_locked(dentry);
+			spin_unlock(&dcache_lock);
 			return 1;
 		}
 	}
-	rcu_read_unlock();
+	spin_unlock(&dcache_lock);
+out:
 	return 0;
 }
 EXPORT_SYMBOL(d_validate);
-- 
cgit v1.2.2


From 786a5e15b613a9cee4fc9139fc3113a5ab0fde79 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:16 +1100
Subject: fs: d_validate fixes

d_validate has been broken for a long time.

kmem_ptr_validate does not guarantee that a pointer can be dereferenced
if it can go away at any time. Even rcu_read_lock doesn't help, because
the pointer might be queued in RCU callbacks but not executed yet.

So the parent cannot be checked, nor the name hashed. The dentry pointer
can not be touched until it can be verified under lock. Hashing simply
cannot be used.

Instead, verify the parent/child relationship by traversing parent's
d_child list. It's slow, but only ncpfs and the destaged smbfs care
about it, at this point.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index cc2b93802179..9d1a59dfda0b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1483,41 +1483,30 @@ out:
 }
 
 /**
- * d_validate - verify dentry provided from insecure source
+ * d_validate - verify dentry provided from insecure source (deprecated)
  * @dentry: The dentry alleged to be valid child of @dparent
  * @dparent: The parent dentry (known to be valid)
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
  * Zero is returned in the dentry is invalid.
+ *
+ * This function is slow for big directories, and deprecated, do not use it.
  */
- 
 int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
-	struct hlist_head *base;
-	struct hlist_node *lhp;
-
-	/* Check whether the ptr might be valid at all.. */
-	if (!kmem_ptr_validate(dentry_cache, dentry))
-		goto out;
-
-	if (dentry->d_parent != dparent)
-		goto out;
+	struct dentry *child;
 
 	spin_lock(&dcache_lock);
-	base = d_hash(dparent, dentry->d_name.hash);
-	hlist_for_each(lhp,base) { 
-		/* hlist_for_each_entry_rcu() not required for d_hash list
-		 * as it is parsed under dcache_lock
-		 */
-		if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
+	list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
+		if (dentry == child) {
 			__dget_locked(dentry);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
 	}
 	spin_unlock(&dcache_lock);
-out:
+
 	return 0;
 }
 EXPORT_SYMBOL(d_validate);
-- 
cgit v1.2.2


From 86c8749ede0c59e590de9267066932a26f1ce796 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:18 +1100
Subject: vfs: revert per-cpu nr_unused counters for dentry and inodes

The nr_unused counters count the number of objects on an LRU, and as such they
are synchronized with LRU object insertion and removal and scanning, and
protected under the LRU lock.

Making it per-cpu does not actually get any concurrency improvements because of
this lock, and summing the counter is much slower, and
incrementing/decrementing it costs more code size and is slower too.

These counters should stay per-LRU, which currently means global.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 16 +++++-----------
 fs/inode.c  | 17 +++++++----------
 2 files changed, 12 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9d1a59dfda0b..f62ba90bce91 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -68,14 +68,12 @@ struct dentry_stat_t dentry_stat = {
 };
 
 static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -140,7 +138,7 @@ static void dentry_lru_add(struct dentry *dentry)
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	}
 }
 
@@ -149,7 +147,7 @@ static void dentry_lru_del(struct dentry *dentry)
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		dentry_stat.nr_unused--;
 	}
 }
 
@@ -158,7 +156,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +544,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
 	int pruned;
 
@@ -908,16 +906,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
-	int nr_unused;
-
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
-	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
+	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -2424,7 +2419,6 @@ static void __init dcache_init(void)
 	int loop;
 
 	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
 
 	/* 
 	 * A constructor could be added for stable state like the lists,
diff --git a/fs/inode.c b/fs/inode.c
index ae2727ab0c3a..efc43979709f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -103,7 +103,6 @@ static DECLARE_RWSEM(iprune_sem);
 struct inodes_stat_t inodes_stat;
 
 static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
@@ -114,7 +113,7 @@ static inline int get_nr_inodes(void)
 
 static inline int get_nr_inodes_unused(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	return inodes_stat.nr_unused;
 }
 
 int get_nr_dirty_inodes(void)
@@ -132,7 +131,6 @@ int proc_nr_inodes(ctl_table *table, int write,
 		   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
-	inodes_stat.nr_unused = get_nr_inodes_unused();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -335,7 +333,7 @@ static void inode_lru_list_add(struct inode *inode)
 {
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
-		percpu_counter_inc(&nr_inodes_unused);
+		inodes_stat.nr_unused++;
 	}
 }
 
@@ -343,7 +341,7 @@ static void inode_lru_list_del(struct inode *inode)
 {
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
-		percpu_counter_dec(&nr_inodes_unused);
+		inodes_stat.nr_unused--;
 	}
 }
 
@@ -513,7 +511,7 @@ void evict_inodes(struct super_block *sb)
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -554,7 +552,7 @@ int invalidate_inodes(struct super_block *sb)
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -616,7 +614,7 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
-			percpu_counter_dec(&nr_inodes_unused);
+			inodes_stat.nr_unused--;
 			continue;
 		}
 
@@ -650,7 +648,7 @@ static void prune_icache(int nr_to_scan)
 		 */
 		list_move(&inode->i_lru, &freeable);
 		list_del_init(&inode->i_wb_list);
-		percpu_counter_dec(&nr_inodes_unused);
+		inodes_stat.nr_unused--;
 	}
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1649,7 +1647,6 @@ void __init inode_init(void)
 					 init_once);
 	register_shrinker(&icache_shrinker);
 	percpu_counter_init(&nr_inodes, 0);
-	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
-- 
cgit v1.2.2


From 3e880fb5e4bb6a012035e3edd0586ee2817c2e24 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:19 +1100
Subject: fs: use fast counters for vfs caches

percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or take fast approximate value, a
simple per cpu set of counters is much better.

The percpu_counter can never be made to work as well, because it has an
indirection from pointer to percpu memory, and it can't use direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
code will always be worse.

In the fastpath, it is the difference between this:

        incl %gs:nr_dentry      # nr_dentry

and this:

        movl    percpu_counter_batch(%rip), %edx        # percpu_counter_batch,
        movl    $1, %esi        #,
        movq    $nr_dentry, %rdi        #,
        call    __percpu_counter_add    # (plus I clobber registers)

__percpu_counter_add:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,
        subq    $32, %rsp       #,
        movq    %rbx, -24(%rbp) #,
        movq    %r12, -16(%rbp) #,
        movq    %r13, -8(%rbp)  #,
        movq    %rdi, %rbx      # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        incl    -8124(%rax)     # <variable>.preempt_count
        movq    32(%rdi), %r12  # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
        add %gs:this_cpu_off, %r12      # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
        movslq  (%r12),%r13     #* tcp_ptr__, tmp73
        movslq  %edx,%rax       # batch, batch
        addq    %rsi, %r13      # amount, count
        cmpq    %rax, %r13      # batch, count
        jge     .L27    #,
        negl    %edx    # tmp76
        movslq  %edx,%rdx       # tmp76, tmp77
        cmpq    %rdx, %r13      # tmp77, count
        jg      .L28    #,
.L27:
        movq    %rbx, %rdi      # fbc,
        call    _raw_spin_lock  #
        addq    %r13, 8(%rbx)   # count, <variable>.count
        movq    %rbx, %rdi      # fbc,
        movl    $0, (%r12)      #,* tcp_ptr__
        call    _raw_spin_unlock        #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        decl    -8124(%rax)     # <variable>.preempt_count
        movq    -8136(%rax), %rax       #, D.14625
        testb   $8, %al #, D.14625
        jne     .L32    #,
.L31:
        movq    -24(%rbp), %rbx #,
        movq    -16(%rbp), %r12 #,
        movq    -8(%rbp), %r13  #,
        leave
        ret
        .p2align 4,,10
        .p2align 3
.L28:
        movl    %r13d, (%r12)   # count,*
        jmp     .L29    #
.L32:
        call    preempt_schedule        #
        .p2align 4,,6
        jmp     .L31    #
        .size   __percpu_counter_add, .-__percpu_counter_add
        .p2align 4,,15

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 19 +++++++++++++------
 fs/inode.c  | 17 ++++++++++-------
 2 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index f62ba90bce91..b2cb2662ca00 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,13 +67,22 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+	dentry_stat.nr_dentry = get_nr_dentry();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -93,7 +102,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -981,7 +990,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
@@ -2418,8 +2427,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
diff --git a/fs/inode.c b/fs/inode.c
index efc43979709f..5a0a898f55d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,13 +102,17 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
 }
 
 static inline int get_nr_inodes_unused(void)
@@ -118,9 +122,9 @@ static inline int get_nr_inodes_unused(void)
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -222,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -264,7 +268,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -1646,7 +1650,6 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
-- 
cgit v1.2.2


From fbc8d4c04626e015b18cc61199f505920abb48f0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:21 +1100
Subject: config fs: avoid switching ->d_op on live dentry

Switching d_op on a live dentry is racy in general, so avoid it. In this case
it is a negative dentry, which is safer, but there are still concurrent ops
which may be called on d_op in that case (eg. d_revalidate). So in general
a filesystem may not do this. Fix configfs so as not to do this.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/configfs/dir.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c691..578706969415 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
 
 	sd->s_mode = mode;
 	sd->s_dentry = dentry;
-	if (dentry) {
+	if (dentry)
 		dentry->d_fsdata = configfs_get(sd);
-		dentry->d_op = &configfs_dentry_ops;
-	}
 
 	return 0;
 }
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
 		error = configfs_create(d, mode, init_dir);
 		if (!error) {
 			inc_nlink(p->d_inode);
-			(d)->d_op = &configfs_dentry_ops;
 		} else {
 			struct configfs_dirent *sd = d->d_fsdata;
 			if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
 				   CONFIGFS_ITEM_LINK);
 	if (!err) {
 		err = configfs_create(dentry, mode, init_symlink);
-		if (!err)
-			dentry->d_op = &configfs_dentry_ops;
-		else {
+		if (err) {
 			struct configfs_dirent *sd = dentry->d_fsdata;
 			if (sd) {
 				spin_lock(&configfs_dirent_lock);
@@ -493,7 +488,11 @@ static struct dentry * configfs_lookup(struct inode *dir,
 		 * If it doesn't exist and it isn't a NOT_PINNED item,
 		 * it must be negative.
 		 */
-		return simple_lookup(dir, dentry, nd);
+		if (dentry->d_name.len > NAME_MAX)
+			return ERR_PTR(-ENAMETOOLONG);
+		dentry->d_op = &configfs_dentry_ops;
+		d_add(dentry, NULL);
+		return NULL;
 	}
 
 out:
@@ -685,6 +684,7 @@ static int create_default_group(struct config_group *parent_group,
 	ret = -ENOMEM;
 	child = d_alloc(parent, &name);
 	if (child) {
+		child->d_op = &configfs_dentry_ops;
 		d_add(child, NULL);
 
 		ret = configfs_attach_group(&parent_group->cg_item,
@@ -1682,6 +1682,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	err = -ENOMEM;
 	dentry = d_alloc(configfs_sb->s_root, &name);
 	if (dentry) {
+		dentry->d_op = &configfs_dentry_ops;
 		d_add(dentry, NULL);
 
 		err = configfs_attach_group(sd->s_element, &group->cg_item,
-- 
cgit v1.2.2


From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:23 +1100
Subject: fs: change d_delete semantics

Change d_delete from a dentry deletion notification to a dentry caching
advise, more like ->drop_inode. Require it to be constant and idempotent,
and not take d_lock. This is how all existing filesystems use the callback
anyway.

This makes fine grained dentry locking of dput and dentry lru scanning
much simpler.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_dentry.c      | 4 ++--
 fs/afs/dir.c            | 4 ++--
 fs/btrfs/inode.c        | 2 +-
 fs/coda/dir.c           | 4 ++--
 fs/configfs/dir.c       | 2 +-
 fs/dcache.c             | 2 --
 fs/gfs2/dentry.c        | 2 +-
 fs/hostfs/hostfs_kern.c | 2 +-
 fs/libfs.c              | 2 +-
 fs/ncpfs/dir.c          | 4 ++--
 fs/nfs/dir.c            | 2 +-
 fs/proc/base.c          | 2 +-
 fs/proc/generic.c       | 2 +-
 fs/proc/proc_sysctl.c   | 2 +-
 fs/sysfs/dir.c          | 2 +-
 15 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f3933..466d2a4fc5cb 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
  *
  */
 
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
 									dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
  *
  */
 
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5439e1bc9a86..2c18cde27000 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -23,7 +23,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 static int afs_dir_open(struct inode *inode, struct file *file);
 static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
-static int afs_d_delete(struct dentry *dentry);
+static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
 static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 				  loff_t fpos, u64 ino, unsigned dtype);
@@ -730,7 +730,7 @@ out_bad:
  * - called from dput() when d_count is going to 0.
  * - return 1 to request dentry be unhashed, 0 otherwise
  */
-static int afs_d_delete(struct dentry *dentry)
+static int afs_d_delete(const struct dentry *dentry)
 {
 	_enter("%s", dentry->d_name.name);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 72f31ecb5c90..7ce9f8932789 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4127,7 +4127,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	return inode;
 }
 
-static int btrfs_dentry_delete(struct dentry *dentry)
+static int btrfs_dentry_delete(const struct dentry *dentry)
 {
 	struct btrfs_root *root;
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 5d8b35539601..4cce3b07d9d7 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -47,7 +47,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
 
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
-static int coda_dentry_delete(struct dentry *);
+static int coda_dentry_delete(const struct dentry *);
 
 /* support routines */
 static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -577,7 +577,7 @@ out:
  * This is the callback from dput() when d_count is going to 0.
  * We use this to unhash dentries with bad inodes.
  */
-static int coda_dentry_delete(struct dentry * dentry)
+static int coda_dentry_delete(const struct dentry * dentry)
 {
 	int flags;
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 578706969415..20024a9ef5a7 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,7 @@ static void configfs_d_iput(struct dentry * dentry,
  * We _must_ delete our dentries on last dput, as the chain-to-parent
  * behavior is required to clear the parents of default_groups.
  */
-static int configfs_d_delete(struct dentry *dentry)
+static int configfs_d_delete(const struct dentry *dentry)
 {
 	return 1;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index b2cb2662ca00..6ee6bc40cb63 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -453,8 +453,6 @@ static void prune_one_dentry(struct dentry * dentry)
 		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
 			return;
 
-		if (dentry->d_op && dentry->d_op->d_delete)
-			dentry->d_op->d_delete(dentry);
 		dentry_lru_del(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 6798755b3858..e80fea2f65ff 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -106,7 +106,7 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
 	return 0;
 }
 
-static int gfs2_dentry_delete(struct dentry *dentry)
+static int gfs2_dentry_delete(const struct dentry *dentry)
 {
 	struct gfs2_inode *ginode;
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2c0f148a49e6..cfe8bc7de511 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
 
-static int hostfs_d_delete(struct dentry *dentry)
+static int hostfs_d_delete(const struct dentry *dentry)
 {
 	return 1;
 }
diff --git a/fs/libfs.c b/fs/libfs.c
index a3accdf528ad..b9d25d83e228 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -37,7 +37,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
  * Retaining negative dentries for an in-memory filesystem just wastes
  * memory and lookup time: arrange for them to be deleted immediately.
  */
-static int simple_delete_dentry(struct dentry *dentry)
+static int simple_delete_dentry(const struct dentry *dentry)
 {
 	return 1;
 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f22b12e7d337..d6e6453881ce 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -76,7 +76,7 @@ const struct inode_operations ncp_dir_inode_operations =
 static int ncp_lookup_validate(struct dentry *, struct nameidata *);
 static int ncp_hash_dentry(struct dentry *, struct qstr *);
 static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
-static int ncp_delete_dentry(struct dentry *);
+static int ncp_delete_dentry(const struct dentry *);
 
 static const struct dentry_operations ncp_dentry_operations =
 {
@@ -162,7 +162,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
  * Closing files can be safely postponed until iput() - it's done there anyway.
  */
 static int
-ncp_delete_dentry(struct dentry * dentry)
+ncp_delete_dentry(const struct dentry * dentry)
 {
 	struct inode *inode = dentry->d_inode;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 996dd8989a91..9184c7c80f78 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1117,7 +1117,7 @@ out_error:
 /*
  * This is called from dput() when d_count is going to 0.
  */
-static int nfs_dentry_delete(struct dentry *dentry)
+static int nfs_dentry_delete(const struct dentry *dentry)
 {
 	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 182845147fe4..d932fdb6a245 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1744,7 +1744,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static int pid_delete_dentry(struct dentry * dentry)
+static int pid_delete_dentry(const struct dentry * dentry)
 {
 	/* Is the task we represent dead?
 	 * If so, then don't put the dentry on the lru list,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f0337661..1d607be36d95 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
  * smarter: we could keep a "volatile" flag in the 
  * inode to indicate which ones to keep.
  */
-static int proc_delete_dentry(struct dentry * dentry)
+static int proc_delete_dentry(const struct dentry * dentry)
 {
 	return 1;
 }
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b652cb00906b..a256d770ea18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -392,7 +392,7 @@ static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_delete(struct dentry *dentry)
+static int proc_sys_delete(const struct dentry *dentry)
 {
 	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b0..27e1102e303e 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 		goto repeat;
 }
 
-static int sysfs_dentry_delete(struct dentry *dentry)
+static int sysfs_dentry_delete(const struct dentry *dentry)
 {
 	struct sysfs_dirent *sd = dentry->d_fsdata;
 	return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
-- 
cgit v1.2.2


From 79eb4dde742fe2e9c9e301432b894a7410261ce7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:24 +1100
Subject: cifs: dont overwrite dentry name in d_revalidate

Use vfat's method for dealing with negative dentries to preserve case,
rather than overwrite dentry name in d_revalidate, which is a bit ugly
and also gets in the way of doing lock-free path walking.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/cifs/dir.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3840eddbfb7a..521d841b1fd1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -656,22 +656,34 @@ lookup_out:
 static int
 cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
-	int isValid = 1;
-
 	if (direntry->d_inode) {
 		if (cifs_revalidate_dentry(direntry))
 			return 0;
-	} else {
-		cFYI(1, "neg dentry 0x%p name = %s",
-			 direntry, direntry->d_name.name);
-		if (time_after(jiffies, direntry->d_time + HZ) ||
-			!lookupCacheEnabled) {
-			d_drop(direntry);
-			isValid = 0;
-		}
+		else
+			return 1;
 	}
 
-	return isValid;
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!nd)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+			return 0;
+	}
+
+	if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
+		return 0;
+
+	return 1;
 }
 
 /* static int cifs_d_delete(struct dentry *direntry)
@@ -709,15 +721,8 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
 	struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
 
 	if ((a->len == b->len) &&
-	    (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) {
-		/*
-		 * To preserve case, don't let an existing negative dentry's
-		 * case take precedence.  If a is not a negative dentry, this
-		 * should have no side effects
-		 */
-		memcpy((void *)a->name, b->name, a->len);
+	    (nls_strnicmp(codepage, a->name, b->name, a->len) == 0))
 		return 0;
-	}
 	return 1;
 }
 
-- 
cgit v1.2.2


From 2bc334dcc7c77be3700dd443d92a78603f76976b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:25 +1100
Subject: jfs: dont overwrite dentry name in d_revalidate

Use vfat's method for dealing with negative dentries to preserve case,
rather than overwrite dentry name in d_revalidate, which is a bit ugly
and also gets in the way of doing lock-free path walking.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/jfs/namei.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 231ca4af9bce..2da1546161fa 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/quotaops.h>
 #include <linux/exportfs.h>
@@ -1597,21 +1598,47 @@ static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
 			goto out;
 	}
 	result = 0;
+out:
+	return result;
+}
 
+static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
 	/*
-	 * We want creates to preserve case.  A negative dentry, a, that
-	 * has a different case than b may cause a new entry to be created
-	 * with the wrong case.  Since we can't tell if a comes from a negative
-	 * dentry, we blindly replace it with b.  This should be harmless if
-	 * a is not a negative dentry.
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
 	 */
-	memcpy((unsigned char *)a->name, b->name, a->len);
-out:
-	return result;
+	if (dentry->d_inode)
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!nd)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+			return 0;
+	}
+	return 1;
 }
 
 const struct dentry_operations jfs_ci_dentry_operations =
 {
 	.d_hash = jfs_ci_hash,
 	.d_compare = jfs_ci_compare,
+	.d_revalidate = jfs_ci_revalidate,
 };
-- 
cgit v1.2.2


From fb2d5b86aff355a27ebfc132d3c99f4a940cc3fe Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:26 +1100
Subject: fs: name case update method

smpfs and ncpfs want to update a live dentry name in-place. Rather than
have them open code the locking, provide a documented dcache API.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c    | 27 +++++++++++++++++++++++++++
 fs/ncpfs/dir.c | 35 ++++++-----------------------------
 2 files changed, 33 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 6ee6bc40cb63..814e5f491e9c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1589,6 +1589,33 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
 
+/**
+ * dentry_update_name_case - update case insensitive dentry with a new name
+ * @dentry: dentry to be updated
+ * @name: new name
+ *
+ * Update a case insensitive dentry with new case of name.
+ *
+ * dentry must have been returned by d_lookup with name @name. Old and new
+ * name lengths must match (ie. no d_compare which allows mismatched name
+ * lengths).
+ *
+ * Parent inode i_mutex must be held over d_lookup and into this call (to
+ * keep renames and concurrent inserts, and readdir(2) away).
+ */
+void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
+{
+	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
+
+	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
+	memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dcache_lock);
+}
+EXPORT_SYMBOL(dentry_update_name_case);
+
 /*
  * When switching names, the actual string doesn't strictly have to
  * be preserved in the target - because we're dropping the target
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index d6e6453881ce..e80ea4e37c48 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -611,35 +611,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 			shrink_dcache_parent(newdent);
 
 		/*
-		 * It is not as dangerous as it looks.  NetWare's OS2 namespace is
-		 * case preserving yet case insensitive.  So we update dentry's name
-		 * as received from server.  We found dentry via d_lookup with our
-		 * hash, so we know that hash does not change, and so replacing name
-		 * should be reasonably safe.
+		 * NetWare's OS2 namespace is case preserving yet case
+		 * insensitive.  So we update dentry's name as received from
+		 * server. Parent dir's i_mutex is locked because we're in
+		 * readdir.
 		 */
-		if (qname.len == newdent->d_name.len &&
-		    memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
-			struct inode *inode = newdent->d_inode;
-
-			/*
-			 * Inside ncpfs all uses of d_name are either for debugging,
-			 * or on functions which acquire inode mutex (mknod, creat,
-			 * lookup).  So grab i_mutex here, to be sure.  d_path
-			 * uses dcache_lock when generating path, so we should too.
-			 * And finally d_compare is protected by dentry's d_lock, so
-			 * here we go.
-			 */
-			if (inode)
-				mutex_lock(&inode->i_mutex);
-			spin_lock(&dcache_lock);
-			spin_lock(&newdent->d_lock);
-			memcpy((char *) newdent->d_name.name, qname.name,
-								newdent->d_name.len);
-			spin_unlock(&newdent->d_lock);
-			spin_unlock(&dcache_lock);
-			if (inode)
-				mutex_unlock(&inode->i_mutex);
-		}
+		dentry_update_name_case(newdent, &qname);
 	}
 
 	if (!newdent->d_inode) {
@@ -657,7 +634,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	} else {
 		struct inode *inode = newdent->d_inode;
 
-		mutex_lock(&inode->i_mutex);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 		ncp_update_inode2(inode, entry);
 		mutex_unlock(&inode->i_mutex);
 	}
-- 
cgit v1.2.2


From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:27 +1100
Subject: fs: change d_compare for rcu-walk

Change d_compare so it may be called from lock-free RCU lookups. This
does put significant restrictions on what may be done from the callback,
however there don't seem to have been any problems with in-tree fses.
If some strange use case pops up that _really_ cannot cope with the
rcu-walk rules, we can just add new rcu-unaware callbacks, which would
cause name lookup to drop out of rcu-walk mode.

For in-tree filesystems, this is just a mechanical change.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/adfs/dir.c            |  8 +++--
 fs/affs/namei.c          | 46 ++++++++++++++----------
 fs/cifs/dir.c            | 12 ++++---
 fs/dcache.c              |  4 ++-
 fs/fat/namei_msdos.c     | 14 ++++----
 fs/fat/namei_vfat.c      | 33 ++++++++++-------
 fs/hfs/hfs_fs.h          |  5 ++-
 fs/hfs/string.c          | 14 ++++----
 fs/hfsplus/hfsplus_fs.h  |  5 ++-
 fs/hfsplus/unicode.c     | 15 ++++----
 fs/hpfs/dentry.c         | 22 +++++++-----
 fs/isofs/inode.c         | 92 +++++++++++++++++++++++++-----------------------
 fs/isofs/namei.c         |  3 +-
 fs/jfs/namei.c           | 11 +++---
 fs/ncpfs/dir.c           | 25 +++++++------
 fs/ncpfs/ncplib_kernel.h |  8 ++---
 fs/proc/proc_sysctl.c    | 13 +++----
 17 files changed, 191 insertions(+), 139 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de744..c8ed66162bd4 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -237,17 +237,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
+adfs_compare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
 
-	if (entry->len != name->len)
+	if (len != name->len)
 		return 1;
 
 	for (i = 0; i < name->len; i++) {
 		char a, b;
 
-		a = entry->name[i];
+		a = str[i];
 		b = name->name[i];
 
 		if (a >= 'A' && a <= 'Z')
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07a..547d5deb0d42 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -14,10 +14,16 @@ typedef int (*toupper_t)(int);
 
 static int	 affs_toupper(int ch);
 static int	 affs_hash_dentry(struct dentry *, struct qstr *);
-static int       affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 static int	 affs_intl_toupper(int ch);
 static int	 affs_intl_hash_dentry(struct dentry *, struct qstr *);
-static int       affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 const struct dentry_operations affs_dentry_operations = {
 	.d_hash		= affs_hash_dentry,
@@ -88,29 +94,29 @@ affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr)
 	return __affs_hash_dentry(dentry, qstr, affs_intl_toupper);
 }
 
-static inline int
-__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper)
+static inline int __affs_compare_dentry(unsigned int len,
+		const char *str, const struct qstr *name, toupper_t toupper)
 {
-	const u8 *aname = a->name;
-	const u8 *bname = b->name;
-	int len;
+	const u8 *aname = str;
+	const u8 *bname = name->name;
 
-	/* 'a' is the qstr of an already existing dentry, so the name
-	 * must be valid. 'b' must be validated first.
+	/*
+	 * 'str' is the name of an already existing dentry, so the name
+	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (affs_check_name(b->name,b->len))
+	if (affs_check_name(name->name, name->len))
 		return 1;
 
-	/* If the names are longer than the allowed 30 chars,
+	/*
+	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	len = a->len;
 	if (len >= 30) {
-		if (b->len < 30)
+		if (name->len < 30)
 			return 1;
 		len = 30;
-	} else if (len != b->len)
+	} else if (len != name->len)
 		return 1;
 
 	for (; len > 0; len--)
@@ -121,14 +127,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
 }
 
 static int
-affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(dentry, a, b, affs_toupper);
+	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(dentry, a, b, affs_intl_toupper);
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
 }
 
 /*
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 521d841b1fd1..c60133f0d8e4 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -715,13 +715,15 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
 	return 0;
 }
 
-static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
-			   struct qstr *b)
+static int cifs_ci_compare(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
 
-	if ((a->len == b->len) &&
-	    (nls_strnicmp(codepage, a->name, b->name, a->len) == 0))
+	if ((name->len == len) &&
+	    (nls_strnicmp(codepage, name->name, str, len) == 0))
 		return 0;
 	return 1;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 814e5f491e9c..7075555fbb04 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1437,7 +1437,9 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		 */
 		qstr = &dentry->d_name;
 		if (parent->d_op && parent->d_op->d_compare) {
-			if (parent->d_op->d_compare(parent, qstr, name))
+			if (parent->d_op->d_compare(parent, parent->d_inode,
+						dentry, dentry->d_inode,
+						qstr->len, qstr->name, name))
 				goto next;
 		} else {
 			if (qstr->len != len)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3345aabd1dd7..99d3c7ac973c 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -164,16 +164,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
  * Compare two msdos names. If either of the names are invalid,
  * we fall back to doing the standard name comparison.
  */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
 	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
 	int error;
 
-	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+	error = msdos_format_name(name->name, name->len, a_msdos_name, options);
 	if (error)
 		goto old_compare;
-	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+	error = msdos_format_name(str, len, b_msdos_name, options);
 	if (error)
 		goto old_compare;
 	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +184,8 @@ out:
 
 old_compare:
 	error = 1;
-	if (a->len == b->len)
-		error = memcmp(a->name, b->name, a->len);
+	if (name->len == len)
+		error = memcmp(name->name, str, len);
 	goto out;
 }
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b936703b8924..95e00ab84c3f 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -85,15 +85,18 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 }
 
 /* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
+static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
 {
-	unsigned int len = qstr->len;
-
-	while (len && qstr->name[len - 1] == '.')
+	while (len && name[len - 1] == '.')
 		len--;
 	return len;
 }
 
+static unsigned int vfat_striptail_len(const struct qstr *qstr)
+{
+	return __vfat_striptail_len(qstr->len, qstr->name);
+}
+
 /*
  * Compute the hash for the vfat name corresponding to the dentry.
  * Note: if the name is invalid, we leave the hash code unchanged so
@@ -133,16 +136,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
 /*
  * Case insensitive compare of two vfat names.
  */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
 	unsigned int alen, blen;
 
 	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
+	alen = vfat_striptail_len(name);
+	blen = __vfat_striptail_len(len, str);
 	if (alen == blen) {
-		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+		if (nls_strnicmp(t, name->name, str, alen) == 0)
 			return 0;
 	}
 	return 1;
@@ -151,15 +156,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
 /*
  * Case sensitive compare of two vfat names.
  */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned int alen, blen;
 
 	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
+	alen = vfat_striptail_len(name);
+	blen = __vfat_striptail_len(len, str);
 	if (alen == blen) {
-		if (strncmp(a->name, b->name, alen) == 0)
+		if (strncmp(name->name, str, alen) == 0)
 			return 0;
 	}
 	return 1;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index c8cffb81e849..8cd876f0e961 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -216,7 +216,10 @@ extern const struct dentry_operations hfs_dentry_operations;
 extern int hfs_hash_dentry(struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+extern int hfs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
 extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af79428..aaf90d0d6940 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -92,21 +92,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
-	int len;
 
-	len = s1->len;
 	if (len >= HFS_NAMELEN) {
-		if (s2->len < HFS_NAMELEN)
+		if (name->len < HFS_NAMELEN)
 			return 1;
 		len = HFS_NAMELEN;
-	} else if (len != s2->len)
+	} else if (len != name->len)
 		return 1;
 
-	n1 = s1->name;
-	n2 = s2->name;
+	n1 = str;
+	n2 = name->name;
 	while (len--) {
 		if (caseorder[*n1++] != caseorder[*n2++])
 			return 1;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index cb3653efb57a..7aa96eefe483 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -380,7 +380,10 @@ int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *)
 int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
 int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2);
+int hfsplus_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 /* wrapper.c */
 int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b66d67de882c..b178c997efa8 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -363,9 +363,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2)
+int hfsplus_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct super_block *sb = dentry->d_sb;
+	struct super_block *sb = parent->d_sb;
 	int casefold, decompose, size;
 	int dsize1, dsize2, len1, len2;
 	const u16 *dstr1, *dstr2;
@@ -375,10 +378,10 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
 
 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
-	astr1 = s1->name;
-	len1 = s1->len;
-	astr2 = s2->name;
-	len2 = s2->len;
+	astr1 = str;
+	len1 = len;
+	astr2 = name->name;
+	len2 = name->len;
 	dsize1 = dsize2 = 0;
 	dstr1 = dstr2 = NULL;
 
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5f..dd9b1e74a734 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -34,19 +34,25 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
 	return 0;
 }
 
-static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+static int hpfs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	unsigned al=a->len;
-	unsigned bl=b->len;
-	hpfs_adjust_length(a->name, &al);
+	unsigned al = len;
+	unsigned bl = name->len;
+
+	hpfs_adjust_length(str, &al);
 	/*hpfs_adjust_length(b->name, &bl);*/
-	/* 'a' is the qstr of an already existing dentry, so the name
-	 * must be valid. 'b' must be validated first.
+
+	/*
+	 * 'str' is the nane of an already existing dentry, so the name
+	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (hpfs_chk_name(b->name, &bl))
+	if (hpfs_chk_name(name->name, &bl))
 		return 1;
-	if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
+	if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
 		return 1;
 	return 0;
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bfdeb82a53be..7b0fbc61af81 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,14 +28,26 @@
 
 static int isofs_hashi(struct dentry *parent, struct qstr *qstr);
 static int isofs_hash(struct dentry *parent, struct qstr *qstr);
-static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b);
-static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_dentry_cmpi(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
 static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr);
 static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr);
-static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
-static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b);
+static int isofs_dentry_cmpi_ms(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
+static int isofs_dentry_cmp_ms(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 #endif
 
 static void isofs_put_super(struct super_block *sb)
@@ -206,49 +218,31 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
 }
 
 /*
- * Case insensitive compare of two isofs names.
+ * Compare of two isofs names.
  */
-static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
-				struct qstr *b, int ms)
+static int isofs_dentry_cmp_common(
+		unsigned int len, const char *str,
+		const struct qstr *name, int ms, int ci)
 {
 	int alen, blen;
 
 	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = a->len;
-	blen = b->len;
+	alen = name->len;
+	blen = len;
 	if (ms) {
-		while (alen && a->name[alen-1] == '.')
+		while (alen && name->name[alen-1] == '.')
 			alen--;
-		while (blen && b->name[blen-1] == '.')
+		while (blen && str[blen-1] == '.')
 			blen--;
 	}
 	if (alen == blen) {
-		if (strnicmp(a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-/*
- * Case sensitive compare of two isofs names.
- */
-static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a,
-					struct qstr *b, int ms)
-{
-	int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = a->len;
-	blen = b->len;
-	if (ms) {
-		while (alen && a->name[alen-1] == '.')
-			alen--;
-		while (blen && b->name[blen-1] == '.')
-			blen--;
-	}
-	if (alen == blen) {
-		if (strncmp(a->name, b->name, alen) == 0)
-			return 0;
+		if (ci) {
+			if (strnicmp(name->name, str, alen) == 0)
+				return 0;
+		} else {
+			if (strncmp(name->name, str, alen) == 0)
+				return 0;
+		}
 	}
 	return 1;
 }
@@ -266,15 +260,19 @@ isofs_hashi(struct dentry *dentry, struct qstr *qstr)
 }
 
 static int
-isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return isofs_dentry_cmp_common(dentry, a, b, 0);
+	return isofs_dentry_cmp_common(len, str, name, 0, 0);
 }
 
 static int
-isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return isofs_dentry_cmpi_common(dentry, a, b, 0);
+	return isofs_dentry_cmp_common(len, str, name, 0, 1);
 }
 
 #ifdef CONFIG_JOLIET
@@ -291,15 +289,19 @@ isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr)
 }
 
 static int
-isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return isofs_dentry_cmp_common(dentry, a, b, 1);
+	return isofs_dentry_cmp_common(len, str, name, 1, 0);
 }
 
 static int
-isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b)
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return isofs_dentry_cmpi_common(dentry, a, b, 1);
+	return isofs_dentry_cmp_common(len, str, name, 1, 1);
 }
 #endif
 
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 0d23abfd4280..715f7d318046 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 
 	qstr.name = compare;
 	qstr.len = dlen;
-	return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr);
+	return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
+			dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 
 /*
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 2da1546161fa..92129016cd79 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1587,14 +1587,17 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
 	return 0;
 }
 
-static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
+static int jfs_ci_compare(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
 
-	if (a->len != b->len)
+	if (len != name->len)
 		goto out;
-	for (i=0; i < a->len; i++) {
-		if (tolower(a->name[i]) != tolower(b->name[i]))
+	for (i=0; i < len; i++) {
+		if (tolower(str[i]) != tolower(name->name[i]))
 			goto out;
 	}
 	result = 0;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e80ea4e37c48..3bcc68aed416 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -75,7 +75,9 @@ const struct inode_operations ncp_dir_inode_operations =
  */
 static int ncp_lookup_validate(struct dentry *, struct nameidata *);
 static int ncp_hash_dentry(struct dentry *, struct qstr *);
-static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
+static int ncp_compare_dentry(const struct dentry *, const struct inode *,
+		const struct dentry *, const struct inode *,
+		unsigned int, const char *, const struct qstr *);
 static int ncp_delete_dentry(const struct dentry *);
 
 static const struct dentry_operations ncp_dentry_operations =
@@ -113,10 +115,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
 
 #define ncp_preserve_case(i)	(ncp_namespace(i) != NW_NS_DOS)
 
-static inline int ncp_case_sensitive(struct dentry *dentry)
+static inline int ncp_case_sensitive(const struct inode *i)
 {
 #ifdef CONFIG_NCPFS_NFS_NS
-	return ncp_namespace(dentry->d_inode) == NW_NS_NFS;
+	return ncp_namespace(i) == NW_NS_NFS;
 #else
 	return 0;
 #endif /* CONFIG_NCPFS_NFS_NS */
@@ -129,12 +131,13 @@ static inline int ncp_case_sensitive(struct dentry *dentry)
 static int 
 ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 {
-	if (!ncp_case_sensitive(dentry)) {
+	if (!ncp_case_sensitive(dentry->d_inode)) {
+		struct super_block *sb = dentry->d_sb;
 		struct nls_table *t;
 		unsigned long hash;
 		int i;
 
-		t = NCP_IO_TABLE(dentry);
+		t = NCP_IO_TABLE(sb);
 		hash = init_name_hash();
 		for (i=0; i<this->len ; i++)
 			hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -145,15 +148,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
 }
 
 static int
-ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	if (a->len != b->len)
+	if (len != name->len)
 		return 1;
 
-	if (ncp_case_sensitive(dentry))
-		return strncmp(a->name, b->name, a->len);
+	if (ncp_case_sensitive(pinode))
+		return strncmp(str, name->name, len);
 
-	return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len);
+	return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
 }
 
 /*
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 3c57eca634ce..244d1b73fda7 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -135,7 +135,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
 				const unsigned char *, unsigned int, int);
 
 #define NCP_ESC			':'
-#define NCP_IO_TABLE(dentry)	(NCP_SERVER((dentry)->d_inode)->nls_io)
+#define NCP_IO_TABLE(sb)	(NCP_SBP(sb)->nls_io)
 #define ncp_tolower(t, c)	nls_tolower(t, c)
 #define ncp_toupper(t, c)	nls_toupper(t, c)
 #define ncp_strnicmp(t, s1, s2, len) \
@@ -150,15 +150,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
 int ncp__vol2io(unsigned char *, unsigned int *,
 				const unsigned char *, unsigned int, int);
 
-#define NCP_IO_TABLE(dentry)	NULL
+#define NCP_IO_TABLE(sb)	NULL
 #define ncp_tolower(t, c)	tolower(c)
 #define ncp_toupper(t, c)	toupper(c)
 #define ncp_io2vol(S,m,i,n,k,U)	ncp__io2vol(m,i,n,k,U)
 #define ncp_vol2io(S,m,i,n,k,U)	ncp__vol2io(m,i,n,k,U)
 
 
-static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1,
-		const unsigned char *s2, int len)
+static inline int ncp_strnicmp(const struct nls_table *t,
+		const unsigned char *s1, const unsigned char *s2, int len)
 {
 	while (len--) {
 		if (tolower(*s1++) != tolower(*s2++))
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a256d770ea18..ae4b0fd9033f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -397,15 +397,16 @@ static int proc_sys_delete(const struct dentry *dentry)
 	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
-static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
-			    struct qstr *name)
+static int proc_sys_compare(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct dentry *dentry = container_of(qstr, struct dentry, d_name);
-	if (qstr->len != name->len)
+	if (name->len != len)
 		return 1;
-	if (memcmp(qstr->name, name->name, name->len))
+	if (memcmp(name->name, str, len))
 		return 1;
-	return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
+	return !sysctl_is_seen(PROC_I(inode)->sysctl);
 }
 
 static const struct dentry_operations proc_sys_dentry_operations = {
-- 
cgit v1.2.2


From b1e6a015a580ad145689ad1d6b4aa0e03e6c868b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:28 +1100
Subject: fs: change d_hash for rcu-walk

Change d_hash so it may be called from lock-free RCU lookups. See similar
patch for d_compare for details.

For in-tree filesystems, this is just a mechanical change.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/adfs/dir.c           |  3 ++-
 fs/affs/namei.c         | 20 ++++++++++++--------
 fs/cifs/dir.c           |  5 +++--
 fs/cifs/readdir.c       |  2 +-
 fs/dcache.c             |  2 +-
 fs/ecryptfs/inode.c     |  4 ++--
 fs/fat/namei_msdos.c    |  3 ++-
 fs/fat/namei_vfat.c     |  8 +++++---
 fs/gfs2/dentry.c        |  3 ++-
 fs/hfs/hfs_fs.h         |  3 ++-
 fs/hfs/string.c         |  3 ++-
 fs/hfsplus/hfsplus_fs.h |  3 ++-
 fs/hfsplus/unicode.c    |  3 ++-
 fs/hpfs/dentry.c        |  3 ++-
 fs/isofs/inode.c        | 28 ++++++++++++++++++----------
 fs/jfs/namei.c          |  3 ++-
 fs/namei.c              |  5 +++--
 fs/ncpfs/dir.c          | 10 ++++++----
 fs/sysv/namei.c         |  3 ++-
 19 files changed, 71 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index c8ed66162bd4..a11e5e102716 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
 };
 
 static int
-adfs_hash(struct dentry *parent, struct qstr *qstr)
+adfs_hash(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 547d5deb0d42..5aca08c21100 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,13 +13,15 @@
 typedef int (*toupper_t)(int);
 
 static int	 affs_toupper(int ch);
-static int	 affs_hash_dentry(struct dentry *, struct qstr *);
+static int	 affs_hash_dentry(const struct dentry *,
+		const struct inode *, struct qstr *);
 static int       affs_compare_dentry(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
 		unsigned int len, const char *str, const struct qstr *name);
 static int	 affs_intl_toupper(int ch);
-static int	 affs_intl_hash_dentry(struct dentry *, struct qstr *);
+static int	 affs_intl_hash_dentry(const struct dentry *,
+		const struct inode *, struct qstr *);
 static int       affs_intl_compare_dentry(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
@@ -64,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
 	int i;
 
-	i = affs_check_name(qstr->name,qstr->len);
+	i = affs_check_name(qstr->name, qstr->len);
 	if (i)
 		return i;
 
@@ -84,14 +86,16 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
 }
 
 static int
-affs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
-	return __affs_hash_dentry(dentry, qstr, affs_toupper);
+	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
-	return __affs_hash_dentry(dentry, qstr, affs_intl_toupper);
+	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
 
 static inline int __affs_compare_dentry(unsigned int len,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index c60133f0d8e4..88bfe686ac00 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -700,9 +700,10 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
-static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *q)
 {
-	struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
 	int i;
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a73eb9f4bdaf..ee463aeca0b0 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 	cFYI(1, "For %s", name->name);
 
 	if (parent->d_op && parent->d_op->d_hash)
-		parent->d_op->d_hash(parent, name);
+		parent->d_op->d_hash(parent, parent->d_inode, name);
 	else
 		name->hash = full_name_hash(name->name, name->len);
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 7075555fbb04..6f59f375ed9d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1478,7 +1478,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 	 */
 	name->hash = full_name_hash(name->name, name->len);
 	if (dir->d_op && dir->d_op->d_hash) {
-		if (dir->d_op->d_hash(dir, name) < 0)
+		if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
 			goto out;
 	}
 	dentry = d_lookup(dir, name);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d1a22d62765..a1ed7a7cb173 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -454,7 +454,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	lower_name.hash = ecryptfs_dentry->d_name.hash;
 	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
 		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-						    &lower_name);
+				lower_dir_dentry->d_inode, &lower_name);
 		if (rc < 0)
 			goto out_d_drop;
 	}
@@ -489,7 +489,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
 	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
 		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-						    &lower_name);
+				lower_dir_dentry->d_inode, &lower_name);
 		if (rc < 0)
 			goto out_d_drop;
 	}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 99d3c7ac973c..3b3e072d8982 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
  * that the existing dentry can be used. The msdos fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
+	       struct qstr *qstr)
 {
 	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
 	unsigned char msdos_name[MSDOS_NAME];
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 95e00ab84c3f..4fc06278db48 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -103,7 +103,8 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
 	return 0;
@@ -115,9 +116,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
 	const unsigned char *name;
 	unsigned int len;
 	unsigned long hash;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index e80fea2f65ff..50497f65763b 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -100,7 +100,8 @@ fail:
 	return 0;
 }
 
-static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *str)
 {
 	str->hash = gfs2_disk_hash(str->name, str->len);
 	return 0;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 8cd876f0e961..ad97c2d58287 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,7 +213,8 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(struct dentry *, struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
+		struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
 extern int hfs_compare_dentry(const struct dentry *parent,
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index aaf90d0d6940..495a976a3cc9 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(struct dentry *dentry, struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7aa96eefe483..a5308f491e3e 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -379,7 +379,8 @@ int hfsplus_strcasecmp(const struct hfsplus_unistr *, const struct hfsplus_unist
 int hfsplus_strcmp(const struct hfsplus_unistr *, const struct hfsplus_unistr *);
 int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, const char *, int);
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str);
+int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *str);
 int hfsplus_compare_dentry(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index b178c997efa8..d800aa0f2c80 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -320,7 +320,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *str)
 {
 	struct super_block *sb = dentry->d_sb;
 	const char *astr;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index dd9b1e74a734..35526df1fd32 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
  * Note: the dentry argument is the parent dentry.
  */
 
-static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	unsigned long	 hash;
 	int		 i;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 7b0fbc61af81..d204ee4235fd 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -26,8 +26,10 @@
 
 #define BEQUIET
 
-static int isofs_hashi(struct dentry *parent, struct qstr *qstr);
-static int isofs_hash(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr);
+static int isofs_hash(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr);
 static int isofs_dentry_cmpi(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
@@ -38,8 +40,10 @@ static int isofs_dentry_cmp(const struct dentry *parent,
 		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr);
-static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr);
+static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr);
 static int isofs_dentry_cmpi_ms(const struct dentry *parent,
 		const struct inode *pinode,
 		const struct dentry *dentry, const struct inode *inode,
@@ -172,7 +176,7 @@ struct iso9660_options{
  * Compute the hash for the isofs name corresponding to the dentry.
  */
 static int
-isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
 	const char *name;
 	int len;
@@ -193,7 +197,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
  * Compute the hash for the isofs name corresponding to the dentry.
  */
 static int
-isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
 {
 	const char *name;
 	int len;
@@ -248,13 +252,15 @@ static int isofs_dentry_cmp_common(
 }
 
 static int
-isofs_hash(struct dentry *dentry, struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 0);
 }
 
 static int
-isofs_hashi(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 0);
 }
@@ -277,13 +283,15 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
 
 #ifdef CONFIG_JOLIET
 static int
-isofs_hash_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 1);
 }
 
 static int
-isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 1);
 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 92129016cd79..57f90dad8919 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1574,7 +1574,8 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
+		struct qstr *this)
 {
 	unsigned long hash;
 	int i;
diff --git a/fs/namei.c b/fs/namei.c
index 4ff7ca530533..f3b5ca404659 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -731,7 +731,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 * to use its own hash..
 	 */
 	if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
-		int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
+		int err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
+				nd->path.dentry->d_inode, name);
 		if (err < 0)
 			return err;
 	}
@@ -1134,7 +1135,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	 * to use its own hash..
 	 */
 	if (base->d_op && base->d_op->d_hash) {
-		err = base->d_op->d_hash(base, name);
+		err = base->d_op->d_hash(base, inode, name);
 		dentry = ERR_PTR(err);
 		if (err < 0)
 			goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 3bcc68aed416..bbbf7922f422 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -74,7 +74,8 @@ const struct inode_operations ncp_dir_inode_operations =
  * Dentry operations routines
  */
 static int ncp_lookup_validate(struct dentry *, struct nameidata *);
-static int ncp_hash_dentry(struct dentry *, struct qstr *);
+static int ncp_hash_dentry(const struct dentry *, const struct inode *,
+		struct qstr *);
 static int ncp_compare_dentry(const struct dentry *, const struct inode *,
 		const struct dentry *, const struct inode *,
 		unsigned int, const char *, const struct qstr *);
@@ -129,9 +130,10 @@ static inline int ncp_case_sensitive(const struct inode *i)
  * is case-sensitive.
  */
 static int 
-ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *this)
 {
-	if (!ncp_case_sensitive(dentry->d_inode)) {
+	if (!ncp_case_sensitive(inode)) {
 		struct super_block *sb = dentry->d_sb;
 		struct nls_table *t;
 		unsigned long hash;
@@ -597,7 +599,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	qname.hash = full_name_hash(qname.name, qname.len);
 
 	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, &qname) != 0)
+		if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
 			goto end_advance;
 
 	newdent = d_lookup(dentry, &qname);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e7f7d11cd0..7507aeb4c90e 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
 	/* Truncate the name in place, avoids having to define a compare
 	   function. */
-- 
cgit v1.2.2


From ec2447c278ee973d35f38e53ca16ba7f965ae33d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:29 +1100
Subject: hostfs: simplify locking

Remove dcache_lock locking from hostfs filesystem, and move it into dcache
helpers. All that is required is a coherent path name. Protection from
concurrent modification of the namespace after path name generation is not
provided in current code, because dcache_lock is dropped before the path is
used.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c             | 15 +++++++++++++--
 fs/hostfs/hostfs_kern.c | 24 ++++++++++--------------
 2 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 6f59f375ed9d..61beb40dd6bf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2171,7 +2171,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
-char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
+static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
 	char *end = buf + buflen;
 	char *retval;
@@ -2198,7 +2198,18 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
-EXPORT_SYMBOL(__dentry_path);
+
+char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+{
+	char *retval;
+
+	spin_lock(&dcache_lock);
+	retval = __dentry_path(dentry, buf, buflen);
+	spin_unlock(&dcache_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(dentry_path_raw);
 
 char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index cfe8bc7de511..39dc505ed273 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
 
 static char *__dentry_name(struct dentry *dentry, char *name)
 {
-	char *p = __dentry_path(dentry, name, PATH_MAX);
+	char *p = dentry_path_raw(dentry, name, PATH_MAX);
 	char *root;
 	size_t len;
 
-	spin_unlock(&dcache_lock);
-
 	root = dentry->d_sb->s_fs_info;
 	len = strlen(root);
 	if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
 	if (!name)
 		return NULL;
 
-	spin_lock(&dcache_lock);
 	return __dentry_name(dentry, name); /* will unlock */
 }
 
 static char *inode_name(struct inode *ino)
 {
 	struct dentry *dentry;
-	char *name = __getname();
-	if (!name)
-		return NULL;
+	char *name;
 
-	spin_lock(&dcache_lock);
-	if (list_empty(&ino->i_dentry)) {
-		spin_unlock(&dcache_lock);
-		__putname(name);
+	dentry = d_find_alias(ino);
+	if (!dentry)
 		return NULL;
-	}
-	dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
-	return __dentry_name(dentry, name); /* will unlock */
+
+	name = dentry_name(dentry);
+
+	dput(dentry);
+
+	return name;
 }
 
 static char *follow_link(char *link)
-- 
cgit v1.2.2


From 789680d1ee9311cdf095241dc02bd9784d799cd1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:30 +1100
Subject: fs: dcache scale hash

Add a new lock, dcache_hash_lock, to protect the dcache hash table from
concurrent modification. d_hash is also protected by d_lock.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 61beb40dd6bf..1e124d4673c6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,10 +35,24 @@
 #include <linux/hardirq.h>
 #include "internal.h"
 
+/*
+ * Usage:
+ * dcache_hash_lock protects dcache hash table, s_anon lists
+ *
+ * Ordering:
+ * dcache_lock
+ *   dentry->d_lock
+ *     dcache_hash_lock
+ *
+ * if (dentry1 < dentry2)
+ *   dentry1->d_lock
+ *     dentry2->d_lock
+ */
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(dcache_lock);
@@ -196,6 +210,42 @@ static struct dentry *d_kill(struct dentry *dentry)
 	return parent;
 }
 
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+ * be found through a VFS lookup any more. Note that this is different from
+ * deleting the dentry - d_delete will try to mark the dentry negative if
+ * possible, giving a successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+ * reason (NFS timeouts or autofs deletes).
+ *
+ * __d_drop requires dentry->d_lock.
+ */
+void __d_drop(struct dentry *dentry)
+{
+	if (!(dentry->d_flags & DCACHE_UNHASHED)) {
+		dentry->d_flags |= DCACHE_UNHASHED;
+		spin_lock(&dcache_hash_lock);
+		hlist_del_rcu(&dentry->d_hash);
+		spin_unlock(&dcache_hash_lock);
+	}
+}
+EXPORT_SYMBOL(__d_drop);
+
+void d_drop(struct dentry *dentry)
+{
+	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
+	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dcache_lock);
+}
+EXPORT_SYMBOL(d_drop);
+
 /* 
  * This is dput
  *
@@ -1199,7 +1249,9 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	tmp->d_flags |= DCACHE_DISCONNECTED;
 	tmp->d_flags &= ~DCACHE_UNHASHED;
 	list_add(&tmp->d_alias, &inode->i_dentry);
+	spin_lock(&dcache_hash_lock);
 	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&tmp->d_lock);
 
 	spin_unlock(&dcache_lock);
@@ -1585,7 +1637,9 @@ void d_rehash(struct dentry * entry)
 {
 	spin_lock(&dcache_lock);
 	spin_lock(&entry->d_lock);
+	spin_lock(&dcache_hash_lock);
 	_d_rehash(entry);
+	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
@@ -1692,8 +1746,6 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
  */
 static void d_move_locked(struct dentry * dentry, struct dentry * target)
 {
-	struct hlist_head *list;
-
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
@@ -1710,14 +1762,11 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target)
 	}
 
 	/* Move the dentry to the target hash queue, if on different bucket */
-	if (d_unhashed(dentry))
-		goto already_unhashed;
-
-	hlist_del_rcu(&dentry->d_hash);
-
-already_unhashed:
-	list = d_hash(target->d_parent, target->d_name.hash);
-	__d_rehash(dentry, list);
+	spin_lock(&dcache_hash_lock);
+	if (!d_unhashed(dentry))
+		hlist_del_rcu(&dentry->d_hash);
+	__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
+	spin_unlock(&dcache_hash_lock);
 
 	/* Unhash the target: dput() will then get rid of it */
 	__d_drop(target);
@@ -1914,7 +1963,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 found_lock:
 	spin_lock(&actual->d_lock);
 found:
+	spin_lock(&dcache_hash_lock);
 	_d_rehash(actual);
+	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&actual->d_lock);
 	spin_unlock(&dcache_lock);
 out_nolock:
-- 
cgit v1.2.2


From 2304450783dfde7b0b94ae234edd0dbffa865073 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:31 +1100
Subject: fs: dcache scale lru

Add a new lock, dcache_lru_lock, to protect the dcache LRU list from concurrent
modification. d_lru is also protected by d_lock, which allows LRU lists to be
accessed without the lru lock, using RCU in future patches.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 112 +++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 84 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 1e124d4673c6..3d3c843c36ed 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,11 +37,19 @@
 
 /*
  * Usage:
- * dcache_hash_lock protects dcache hash table, s_anon lists
+ * dcache_hash_lock protects:
+ *   - the dcache hash table, s_anon lists
+ * dcache_lru_lock protects:
+ *   - the dcache lru lists and counters
+ * d_lock protects:
+ *   - d_flags
+ *   - d_name
+ *   - d_lru
  *
  * Ordering:
  * dcache_lock
  *   dentry->d_lock
+ *     dcache_lru_lock
  *     dcache_hash_lock
  *
  * if (dentry1 < dentry2)
@@ -52,6 +60,7 @@ int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
@@ -154,28 +163,38 @@ static void dentry_iput(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with d_lock held.
  */
 static void dentry_lru_add(struct dentry *dentry)
 {
 	if (list_empty(&dentry->d_lru)) {
+		spin_lock(&dcache_lru_lock);
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
 		dentry_stat.nr_unused++;
+		spin_unlock(&dcache_lru_lock);
 	}
 }
 
+static void __dentry_lru_del(struct dentry *dentry)
+{
+	list_del_init(&dentry->d_lru);
+	dentry->d_sb->s_nr_dentry_unused--;
+	dentry_stat.nr_unused--;
+}
+
 static void dentry_lru_del(struct dentry *dentry)
 {
 	if (!list_empty(&dentry->d_lru)) {
-		list_del_init(&dentry->d_lru);
-		dentry->d_sb->s_nr_dentry_unused--;
-		dentry_stat.nr_unused--;
+		spin_lock(&dcache_lru_lock);
+		__dentry_lru_del(dentry);
+		spin_unlock(&dcache_lru_lock);
 	}
 }
 
 static void dentry_lru_move_tail(struct dentry *dentry)
 {
+	spin_lock(&dcache_lru_lock);
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
@@ -183,6 +202,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
+	spin_unlock(&dcache_lru_lock);
 }
 
 /**
@@ -192,6 +212,8 @@ static void dentry_lru_move_tail(struct dentry *dentry)
  * The dentry must already be unhashed and removed from the LRU.
  *
  * If this is the root of the dentry tree, return NULL.
+ *
+ * dcache_lock and d_lock must be held by caller, are dropped by d_kill.
  */
 static struct dentry *d_kill(struct dentry *dentry)
 	__releases(dentry->d_lock)
@@ -383,10 +405,19 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 
 /* This should be called _only_ with dcache_lock held */
+static inline struct dentry * __dget_locked_dlock(struct dentry *dentry)
+{
+	atomic_inc(&dentry->d_count);
+	dentry_lru_del(dentry);
+	return dentry;
+}
+
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
+	spin_lock(&dentry->d_lock);
 	dentry_lru_del(dentry);
+	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
@@ -465,7 +496,7 @@ restart:
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!atomic_read(&dentry->d_count)) {
-			__dget_locked(dentry);
+			__dget_locked_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
@@ -489,7 +520,6 @@ EXPORT_SYMBOL(d_prune_aliases);
 static void prune_one_dentry(struct dentry * dentry)
 	__releases(dentry->d_lock)
 	__releases(dcache_lock)
-	__acquires(dcache_lock)
 {
 	__d_drop(dentry);
 	dentry = d_kill(dentry);
@@ -498,15 +528,16 @@ static void prune_one_dentry(struct dentry * dentry)
 	 * Prune ancestors.  Locking is simpler than in dput(),
 	 * because dcache_lock needs to be taken anyway.
 	 */
-	spin_lock(&dcache_lock);
 	while (dentry) {
-		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+		spin_lock(&dcache_lock);
+		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) {
+			spin_unlock(&dcache_lock);
 			return;
+		}
 
 		dentry_lru_del(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
-		spin_lock(&dcache_lock);
 	}
 }
 
@@ -516,21 +547,31 @@ static void shrink_dentry_list(struct list_head *list)
 
 	while (!list_empty(list)) {
 		dentry = list_entry(list->prev, struct dentry, d_lru);
-		dentry_lru_del(dentry);
+
+		if (!spin_trylock(&dentry->d_lock)) {
+			spin_unlock(&dcache_lru_lock);
+			cpu_relax();
+			spin_lock(&dcache_lru_lock);
+			continue;
+		}
+
+		__dentry_lru_del(dentry);
 
 		/*
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup.  Do not free
 		 * it - just keep it off the LRU list.
 		 */
-		spin_lock(&dentry->d_lock);
 		if (atomic_read(&dentry->d_count)) {
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+		spin_unlock(&dcache_lru_lock);
+
 		prune_one_dentry(dentry);
-		/* dentry->d_lock was dropped in prune_one_dentry() */
-		cond_resched_lock(&dcache_lock);
+		/* dcache_lock and dentry->d_lock dropped */
+		spin_lock(&dcache_lock);
+		spin_lock(&dcache_lru_lock);
 	}
 }
 
@@ -551,32 +592,36 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 	int cnt = *count;
 
 	spin_lock(&dcache_lock);
+relock:
+	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		dentry = list_entry(sb->s_dentry_lru.prev,
 				struct dentry, d_lru);
 		BUG_ON(dentry->d_sb != sb);
 
+		if (!spin_trylock(&dentry->d_lock)) {
+			spin_unlock(&dcache_lru_lock);
+			cpu_relax();
+			goto relock;
+		}
+
 		/*
 		 * If we are honouring the DCACHE_REFERENCED flag and the
 		 * dentry has this flag set, don't free it.  Clear the flag
 		 * and put it back on the LRU.
 		 */
-		if (flags & DCACHE_REFERENCED) {
-			spin_lock(&dentry->d_lock);
-			if (dentry->d_flags & DCACHE_REFERENCED) {
-				dentry->d_flags &= ~DCACHE_REFERENCED;
-				list_move(&dentry->d_lru, &referenced);
-				spin_unlock(&dentry->d_lock);
-				cond_resched_lock(&dcache_lock);
-				continue;
-			}
+		if (flags & DCACHE_REFERENCED &&
+				dentry->d_flags & DCACHE_REFERENCED) {
+			dentry->d_flags &= ~DCACHE_REFERENCED;
+			list_move(&dentry->d_lru, &referenced);
+			spin_unlock(&dentry->d_lock);
+		} else {
+			list_move_tail(&dentry->d_lru, &tmp);
 			spin_unlock(&dentry->d_lock);
+			if (!--cnt)
+				break;
 		}
-
-		list_move_tail(&dentry->d_lru, &tmp);
-		if (!--cnt)
-			break;
-		cond_resched_lock(&dcache_lock);
+		/* XXX: re-add cond_resched_lock when dcache_lock goes away */
 	}
 
 	*count = cnt;
@@ -584,6 +629,7 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
+	spin_unlock(&dcache_lru_lock);
 	spin_unlock(&dcache_lock);
 
 }
@@ -679,10 +725,12 @@ void shrink_dcache_sb(struct super_block *sb)
 	LIST_HEAD(tmp);
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		list_splice_init(&sb->s_dentry_lru, &tmp);
 		shrink_dentry_list(&tmp);
 	}
+	spin_unlock(&dcache_lru_lock);
 	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -701,7 +749,9 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 	/* detach this root from the system */
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	dentry_lru_del(dentry);
+	spin_unlock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
@@ -715,7 +765,9 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			spin_lock(&dcache_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
+				spin_lock(&loop->d_lock);
 				dentry_lru_del(loop);
+				spin_unlock(&loop->d_lock);
 				__d_drop(loop);
 				cond_resched_lock(&dcache_lock);
 			}
@@ -892,6 +944,8 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
+		spin_lock(&dentry->d_lock);
+
 		/* 
 		 * move only zero ref count dentries to the end 
 		 * of the unused list for prune_dcache
@@ -903,6 +957,8 @@ resume:
 			dentry_lru_del(dentry);
 		}
 
+		spin_unlock(&dentry->d_lock);
+
 		/*
 		 * We can return to the caller if we have found some (this
 		 * ensures forward progress). We'll be coming back to find
-- 
cgit v1.2.2


From b7ab39f631f505edc2bbdb86620d5493f995c9da Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:32 +1100
Subject: fs: dcache scale dentry refcount

Make d_count non-atomic and protect it with d_lock. This allows us to ensure a
0 refcount dentry remains 0 without dcache_lock. It is also fairly natural when
we start protecting many other dentry members with d_lock.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/autofs4/expire.c  |   8 ++--
 fs/autofs4/root.c    |   6 +--
 fs/ceph/dir.c        |   4 +-
 fs/ceph/inode.c      |   4 +-
 fs/ceph/mds_client.c |   2 +-
 fs/coda/dir.c        |   2 +-
 fs/configfs/dir.c    |   3 +-
 fs/configfs/inode.c  |   2 +-
 fs/dcache.c          | 106 +++++++++++++++++++++++++++++++++++++++------------
 fs/ecryptfs/inode.c  |   2 +-
 fs/locks.c           |   2 +-
 fs/namei.c           |   2 +-
 fs/nfs/dir.c         |   6 +--
 fs/nfs/unlink.c      |   2 +-
 fs/nfsd/vfs.c        |   5 +--
 fs/nilfs2/super.c    |   2 +-
 16 files changed, 108 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb1..413b5642e6cf 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -198,7 +198,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			else
 				ino_count++;
 
-			if (atomic_read(&p->d_count) > ino_count) {
+			if (p->d_count > ino_count) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
@@ -347,7 +347,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 2;
-			if (atomic_read(&dentry->d_count) > ino_count)
+			if (dentry->d_count > ino_count)
 				goto next;
 
 			/* Can we umount this guy */
@@ -369,7 +369,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		if (!exp_leaves) {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (atomic_read(&dentry->d_count) > ino_count)
+			if (dentry->d_count > ino_count)
 				goto next;
 
 			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +383,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		} else {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (atomic_read(&dentry->d_count) > ino_count)
+			if (dentry->d_count > ino_count)
 				goto next;
 
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d34896cfb19f..7922509b5b2b 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -436,7 +436,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (atomic_read(&active->d_count) == 0)
+		if (active->d_count == 0)
 			goto next;
 
 		qstr = &active->d_name;
@@ -452,7 +452,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			goto next;
 
 		if (d_unhashed(active)) {
-			dget(active);
+			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
 			spin_unlock(&dcache_lock);
@@ -507,7 +507,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			goto next;
 
 		if (d_unhashed(expiring)) {
-			dget(expiring);
+			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
 			spin_unlock(&dcache_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d902948a90d8..3ecf915a4550 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -150,7 +150,9 @@ more:
 		di = ceph_dentry(dentry);
 	}
 
-	atomic_inc(&dentry->d_count);
+	spin_lock(&dentry->d_lock);
+	dentry->d_count++;
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bf1286588f26..bb68c799074d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -879,8 +879,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 	} else if (realdn) {
 		dout("dn %p (%d) spliced with %p (%d) "
 		     "inode %p ino %llx.%llx\n",
-		     dn, atomic_read(&dn->d_count),
-		     realdn, atomic_read(&realdn->d_count),
+		     dn, dn->d_count,
+		     realdn, realdn->d_count,
 		     realdn->d_inode, ceph_vinop(realdn->d_inode));
 		dput(dn);
 		dn = realdn;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 38800eaa81d0..a50fca1e03be 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1486,7 +1486,7 @@ retry:
 	*base = ceph_ino(temp->d_inode);
 	*plen = len;
 	dout("build_path on %p %d built %llx '%.*s'\n",
-	     dentry, atomic_read(&dentry->d_count), *base, len, path);
+	     dentry, dentry->d_count, *base, len, path);
 	return path;
 }
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4cce3b07d9d7..9e37e8bc9b89 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -559,7 +559,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 	if (cii->c_flags & C_FLUSH) 
 		coda_flag_inode_children(inode, C_FLUSH);
 
-	if (atomic_read(&de->d_count) > 1)
+	if (de->d_count > 1)
 		/* pretend it's valid, but don't change the flags */
 		goto out;
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 20024a9ef5a7..e9acea440ffc 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -394,8 +394,7 @@ static void remove_dir(struct dentry * d)
 	if (d->d_inode)
 		simple_rmdir(parent->d_inode,d);
 
-	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
-		 atomic_read(&d->d_count));
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
 
 	dput(parent);
 }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 253476d78ed8..79b37765d8ff 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -253,7 +253,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 		spin_lock(&dcache_lock);
 		spin_lock(&dentry->d_lock);
 		if (!(d_unhashed(dentry) && dentry->d_inode)) {
-			dget_locked(dentry);
+			dget_locked_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
diff --git a/fs/dcache.c b/fs/dcache.c
index 3d3c843c36ed..81e91502b294 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -45,6 +45,7 @@
  *   - d_flags
  *   - d_name
  *   - d_lru
+ *   - d_count
  *
  * Ordering:
  * dcache_lock
@@ -125,6 +126,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
+	BUG_ON(dentry->d_count);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -222,8 +224,11 @@ static struct dentry *d_kill(struct dentry *dentry)
 	struct dentry *parent;
 
 	list_del(&dentry->d_u.d_child);
-	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
+	/*
+	 * dentry_iput drops the locks, at which point nobody (except
+	 * transient RCU lookups) can reach this dentry.
+	 */
 	if (IS_ROOT(dentry))
 		parent = NULL;
 	else
@@ -303,13 +308,23 @@ void dput(struct dentry *dentry)
 		return;
 
 repeat:
-	if (atomic_read(&dentry->d_count) == 1)
+	if (dentry->d_count == 1)
 		might_sleep();
-	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
-		return;
-
 	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count)) {
+	if (dentry->d_count == 1) {
+		if (!spin_trylock(&dcache_lock)) {
+			/*
+			 * Something of a livelock possibility we could avoid
+			 * by taking dcache_lock and trying again, but we
+			 * want to reduce dcache_lock anyway so this will
+			 * get improved.
+			 */
+			spin_unlock(&dentry->d_lock);
+			goto repeat;
+		}
+	}
+	dentry->d_count--;
+	if (dentry->d_count) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		return;
@@ -389,7 +404,7 @@ int d_invalidate(struct dentry * dentry)
 	 * working directory or similar).
 	 */
 	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count) > 1) {
+	if (dentry->d_count > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
@@ -404,29 +419,61 @@ int d_invalidate(struct dentry * dentry)
 }
 EXPORT_SYMBOL(d_invalidate);
 
-/* This should be called _only_ with dcache_lock held */
+/* This must be called with dcache_lock and d_lock held */
 static inline struct dentry * __dget_locked_dlock(struct dentry *dentry)
 {
-	atomic_inc(&dentry->d_count);
+	dentry->d_count++;
 	dentry_lru_del(dentry);
 	return dentry;
 }
 
+/* This should be called _only_ with dcache_lock held */
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
-	atomic_inc(&dentry->d_count);
 	spin_lock(&dentry->d_lock);
-	dentry_lru_del(dentry);
+	__dget_locked_dlock(dentry);
 	spin_unlock(&dentry->d_lock);
 	return dentry;
 }
 
+struct dentry * dget_locked_dlock(struct dentry *dentry)
+{
+	return __dget_locked_dlock(dentry);
+}
+
 struct dentry * dget_locked(struct dentry *dentry)
 {
 	return __dget_locked(dentry);
 }
 EXPORT_SYMBOL(dget_locked);
 
+struct dentry *dget_parent(struct dentry *dentry)
+{
+	struct dentry *ret;
+
+repeat:
+	spin_lock(&dentry->d_lock);
+	ret = dentry->d_parent;
+	if (!ret)
+		goto out;
+	if (dentry == ret) {
+		ret->d_count++;
+		goto out;
+	}
+	if (!spin_trylock(&ret->d_lock)) {
+		spin_unlock(&dentry->d_lock);
+		cpu_relax();
+		goto repeat;
+	}
+	BUG_ON(!ret->d_count);
+	ret->d_count++;
+	spin_unlock(&ret->d_lock);
+out:
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
+EXPORT_SYMBOL(dget_parent);
+
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
@@ -495,7 +542,7 @@ restart:
 	spin_lock(&dcache_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
-		if (!atomic_read(&dentry->d_count)) {
+		if (!dentry->d_count) {
 			__dget_locked_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
@@ -530,7 +577,10 @@ static void prune_one_dentry(struct dentry * dentry)
 	 */
 	while (dentry) {
 		spin_lock(&dcache_lock);
-		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) {
+		spin_lock(&dentry->d_lock);
+		dentry->d_count--;
+		if (dentry->d_count) {
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			return;
 		}
@@ -562,7 +612,7 @@ static void shrink_dentry_list(struct list_head *list)
 		 * the LRU because of laziness during lookup.  Do not free
 		 * it - just keep it off the LRU list.
 		 */
-		if (atomic_read(&dentry->d_count)) {
+		if (dentry->d_count) {
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
@@ -783,7 +833,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 		do {
 			struct inode *inode;
 
-			if (atomic_read(&dentry->d_count) != 0) {
+			if (dentry->d_count != 0) {
 				printk(KERN_ERR
 				       "BUG: Dentry %p{i=%lx,n=%s}"
 				       " still in use (%d)"
@@ -792,7 +842,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				       dentry->d_inode ?
 				       dentry->d_inode->i_ino : 0UL,
 				       dentry->d_name.name,
-				       atomic_read(&dentry->d_count),
+				       dentry->d_count,
 				       dentry->d_sb->s_type->name,
 				       dentry->d_sb->s_id);
 				BUG();
@@ -802,7 +852,9 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				parent = NULL;
 			else {
 				parent = dentry->d_parent;
-				atomic_dec(&parent->d_count);
+				spin_lock(&parent->d_lock);
+				parent->d_count--;
+				spin_unlock(&parent->d_lock);
 			}
 
 			list_del(&dentry->d_u.d_child);
@@ -853,7 +905,9 @@ void shrink_dcache_for_umount(struct super_block *sb)
 
 	dentry = sb->s_root;
 	sb->s_root = NULL;
-	atomic_dec(&dentry->d_count);
+	spin_lock(&dentry->d_lock);
+	dentry->d_count--;
+	spin_unlock(&dentry->d_lock);
 	shrink_dcache_for_umount_subtree(dentry);
 
 	while (!hlist_empty(&sb->s_anon)) {
@@ -950,7 +1004,7 @@ resume:
 		 * move only zero ref count dentries to the end 
 		 * of the unused list for prune_dcache
 		 */
-		if (!atomic_read(&dentry->d_count)) {
+		if (!dentry->d_count) {
 			dentry_lru_move_tail(dentry);
 			found++;
 		} else {
@@ -1068,7 +1122,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	memcpy(dname, name->name, name->len);
 	dname[name->len] = 0;
 
-	atomic_set(&dentry->d_count, 1);
+	dentry->d_count = 1;
 	dentry->d_flags = DCACHE_UNHASHED;
 	spin_lock_init(&dentry->d_lock);
 	dentry->d_inode = NULL;
@@ -1556,7 +1610,7 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 				goto next;
 		}
 
-		atomic_inc(&dentry->d_count);
+		dentry->d_count++;
 		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
@@ -1653,7 +1707,7 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
-	if (atomic_read(&dentry->d_count) == 1) {
+	if (dentry->d_count == 1) {
 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
@@ -2494,11 +2548,15 @@ resume:
 			this_parent = dentry;
 			goto repeat;
 		}
-		atomic_dec(&dentry->d_count);
+		spin_lock(&dentry->d_lock);
+		dentry->d_count--;
+		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
 		next = this_parent->d_u.d_child.next;
-		atomic_dec(&this_parent->d_count);
+		spin_lock(&this_parent->d_lock);
+		this_parent->d_count--;
+		spin_unlock(&this_parent->d_lock);
 		this_parent = this_parent->d_parent;
 		goto resume;
 	}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a1ed7a7cb173..5e5c7ec1fc98 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -260,7 +260,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 				   ecryptfs_dentry->d_parent));
 	lower_inode = lower_dentry->d_inode;
 	fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
-	BUG_ON(!atomic_read(&lower_dentry->d_count));
+	BUG_ON(!lower_dentry->d_count);
 	ecryptfs_set_dentry_private(ecryptfs_dentry,
 				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
 						     GFP_KERNEL));
diff --git a/fs/locks.c b/fs/locks.c
index 8729347bcd1a..08415b2a6d36 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1389,7 +1389,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 			goto out;
 		if ((arg == F_WRLCK)
-		    && ((atomic_read(&dentry->d_count) > 1)
+		    && ((dentry->d_count > 1)
 			|| (atomic_read(&inode->i_count) > 1)))
 			goto out;
 	}
diff --git a/fs/namei.c b/fs/namei.c
index f3b5ca404659..cbfa5fb31072 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2133,7 +2133,7 @@ void dentry_unhash(struct dentry *dentry)
 	shrink_dcache_parent(dentry);
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count) == 2)
+	if (dentry->d_count == 2)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9184c7c80f78..12de824edb5c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1720,7 +1720,7 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
-	if (atomic_read(&dentry->d_count) > 1) {
+	if (dentry->d_count > 1) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		/* Start asynchronous writeout of the inode */
@@ -1868,7 +1868,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
 		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-		 atomic_read(&new_dentry->d_count));
+		 new_dentry->d_count);
 
 	/*
 	 * For non-directories, check whether the target is busy and if so,
@@ -1886,7 +1886,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			rehash = new_dentry;
 		}
 
-		if (atomic_read(&new_dentry->d_count) > 2) {
+		if (new_dentry->d_count > 2) {
 			int err;
 
 			/* copy the target dentry's name */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7bdec8531400..8fe9eb47a97f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		atomic_read(&dentry->d_count));
+		dentry->d_count);
 	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
 	/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff04..3a359023c9f7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1756,8 +1756,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		goto out_dput_new;
 
 	if (svc_msnfs(ffhp) &&
-		((atomic_read(&odentry->d_count) > 1)
-		 || (atomic_read(&ndentry->d_count) > 1))) {
+		((odentry->d_count > 1) || (ndentry->d_count > 1))) {
 			host_err = -EPERM;
 			goto out_dput_new;
 	}
@@ -1843,7 +1842,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (type != S_IFDIR) { /* It's UNLINK */
 #ifdef MSNFS
 		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-			(atomic_read(&rdentry->d_count) > 1)) {
+			(rdentry->d_count > 1)) {
 			host_err = -EPERM;
 		} else
 #endif
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d3..d36fc7ee615f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -838,7 +838,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-	return atomic_read(&root_dentry->d_count) > 1;
+	return root_dentry->d_count > 1;
 }
 
 /**
-- 
cgit v1.2.2


From da5029563a0a026c64821b09e8e7b4fd81d3fe1b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:33 +1100
Subject: fs: dcache scale d_unhashed

Protect d_unhashed(dentry) condition with d_lock. This means keeping
DCACHE_UNHASHED bit in synch with hash manipulations.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/autofs4/autofs_i.h           | 13 --------
 fs/autofs4/expire.c             | 21 +++++++++---
 fs/ceph/dir.c                   |  5 +--
 fs/configfs/configfs_internal.h |  2 ++
 fs/dcache.c                     | 74 ++++++++++++++++++++++++++++-------------
 fs/libfs.c                      | 29 +++++++++++-----
 fs/ocfs2/dcache.c               |  5 ++-
 7 files changed, 95 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d7..3912dcf047e5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -254,19 +254,6 @@ static inline int simple_positive(struct dentry *dentry)
 	return dentry->d_inode && !d_unhashed(dentry);
 }
 
-static inline int __simple_empty(struct dentry *dentry)
-{
-	struct dentry *child;
-	int ret = 0;
-
-	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
-		if (simple_positive(child))
-			goto out;
-	ret = 1;
-out:
-	return ret;
-}
-
 static inline void autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 413b5642e6cf..ee6402050f13 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -160,14 +160,18 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 
 	spin_lock(&dcache_lock);
 	for (p = top; p; p = next_dentry(p, top)) {
+		spin_lock(&p->d_lock);
 		/* Negative dentry - give up */
-		if (!simple_positive(p))
+		if (!simple_positive(p)) {
+			spin_unlock(&p->d_lock);
 			continue;
+		}
 
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
+		p = dget_dlock(p);
+		spin_unlock(&p->d_lock);
 		spin_unlock(&dcache_lock);
 
 		/*
@@ -228,14 +232,18 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 
 	spin_lock(&dcache_lock);
 	for (p = parent; p; p = next_dentry(p, parent)) {
+		spin_lock(&p->d_lock);
 		/* Negative dentry - give up */
-		if (!simple_positive(p))
+		if (!simple_positive(p)) {
+			spin_unlock(&p->d_lock);
 			continue;
+		}
 
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
+		p = dget_dlock(p);
+		spin_unlock(&p->d_lock);
 		spin_unlock(&dcache_lock);
 
 		if (d_mountpoint(p)) {
@@ -324,12 +332,15 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
 
 		/* Negative dentry - give up */
+		spin_lock(&dentry->d_lock);
 		if (!simple_positive(dentry)) {
 			next = next->next;
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
 
-		dentry = dget(dentry);
+		dentry = dget_dlock(dentry);
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 
 		spin_lock(&sbi->fs_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ecf915a4550..571f270dca0f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -136,6 +136,7 @@ more:
 			fi->at_end = 1;
 			goto out_unlock;
 		}
+		spin_lock(&dentry->d_lock);
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -145,13 +146,13 @@ more:
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
 		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
+		spin_unlock(&dentry->d_lock);
 		p = p->prev;
 		dentry = list_entry(p, struct dentry, d_u.d_child);
 		di = ceph_dentry(dentry);
 	}
 
-	spin_lock(&dentry->d_lock);
-	dentry->d_count++;
+	dget_dlock(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df40..e58b4c30e216 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -121,6 +121,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 	struct config_item * item = NULL;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (!d_unhashed(dentry)) {
 		struct configfs_dirent * sd = dentry->d_fsdata;
 		if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,6 +130,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 		} else
 			item = config_item_get(sd->s_element);
 	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
 	return item;
diff --git a/fs/dcache.c b/fs/dcache.c
index 81e91502b294..ee127f9ab274 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -46,6 +46,7 @@
  *   - d_name
  *   - d_lru
  *   - d_count
+ *   - d_unhashed()
  *
  * Ordering:
  * dcache_lock
@@ -53,6 +54,13 @@
  *     dcache_lru_lock
  *     dcache_hash_lock
  *
+ * If there is an ancestor relationship:
+ * dentry->d_parent->...->d_parent->d_lock
+ *   ...
+ *     dentry->d_parent->d_lock
+ *       dentry->d_lock
+ *
+ * If no ancestor relationship:
  * if (dentry1 < dentry2)
  *   dentry1->d_lock
  *     dentry2->d_lock
@@ -379,7 +387,9 @@ int d_invalidate(struct dentry * dentry)
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (d_unhashed(dentry)) {
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		return 0;
 	}
@@ -388,9 +398,11 @@ int d_invalidate(struct dentry * dentry)
 	 * to get rid of unused child entries.
 	 */
 	if (!list_empty(&dentry->d_subdirs)) {
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		shrink_dcache_parent(dentry);
 		spin_lock(&dcache_lock);
+		spin_lock(&dentry->d_lock);
 	}
 
 	/*
@@ -403,7 +415,6 @@ int d_invalidate(struct dentry * dentry)
 	 * we might still populate it if it was a
 	 * working directory or similar).
 	 */
-	spin_lock(&dentry->d_lock);
 	if (dentry->d_count > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
 			spin_unlock(&dentry->d_lock);
@@ -490,35 +501,44 @@ EXPORT_SYMBOL(dget_parent);
  * any other hashed alias over that one unless @want_discon is set,
  * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
  */
-
-static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
 {
-	struct list_head *head, *next, *tmp;
-	struct dentry *alias, *discon_alias=NULL;
+	struct dentry *alias, *discon_alias;
 
-	head = &inode->i_dentry;
-	next = inode->i_dentry.next;
-	while (next != head) {
-		tmp = next;
-		next = tmp->next;
-		prefetch(next);
-		alias = list_entry(tmp, struct dentry, d_alias);
+again:
+	discon_alias = NULL;
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		spin_lock(&alias->d_lock);
  		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
 			if (IS_ROOT(alias) &&
-			    (alias->d_flags & DCACHE_DISCONNECTED))
+			    (alias->d_flags & DCACHE_DISCONNECTED)) {
 				discon_alias = alias;
-			else if (!want_discon) {
-				__dget_locked(alias);
+			} else if (!want_discon) {
+				__dget_locked_dlock(alias);
+				spin_unlock(&alias->d_lock);
+				return alias;
+			}
+		}
+		spin_unlock(&alias->d_lock);
+	}
+	if (discon_alias) {
+		alias = discon_alias;
+		spin_lock(&alias->d_lock);
+		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
+			if (IS_ROOT(alias) &&
+			    (alias->d_flags & DCACHE_DISCONNECTED)) {
+				__dget_locked_dlock(alias);
+				spin_unlock(&alias->d_lock);
 				return alias;
 			}
 		}
+		spin_unlock(&alias->d_lock);
+		goto again;
 	}
-	if (discon_alias)
-		__dget_locked(discon_alias);
-	return discon_alias;
+	return NULL;
 }
 
-struct dentry * d_find_alias(struct inode *inode)
+struct dentry *d_find_alias(struct inode *inode)
 {
 	struct dentry *de = NULL;
 
@@ -801,8 +821,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	dentry_lru_del(dentry);
-	spin_unlock(&dentry->d_lock);
 	__d_drop(dentry);
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
 	for (;;) {
@@ -817,8 +837,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 					    d_u.d_child) {
 				spin_lock(&loop->d_lock);
 				dentry_lru_del(loop);
-				spin_unlock(&loop->d_lock);
 				__d_drop(loop);
+				spin_unlock(&loop->d_lock);
 				cond_resched_lock(&dcache_lock);
 			}
 			spin_unlock(&dcache_lock);
@@ -1863,7 +1883,10 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target)
 	/*
 	 * XXXX: do we really need to take target->d_lock?
 	 */
-	if (target < dentry) {
+	if (d_ancestor(dentry, target)) {
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
+	} else if (d_ancestor(target, dentry) || target < dentry) {
 		spin_lock(&target->d_lock);
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	} else {
@@ -2542,13 +2565,16 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
-		if (d_unhashed(dentry)||!dentry->d_inode)
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		if (d_unhashed(dentry) || !dentry->d_inode) {
+			spin_unlock(&dentry->d_lock);
 			continue;
+		}
 		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&dentry->d_lock);
 			this_parent = dentry;
 			goto repeat;
 		}
-		spin_lock(&dentry->d_lock);
 		dentry->d_count--;
 		spin_unlock(&dentry->d_lock);
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index b9d25d83e228..433e7139c23a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
 
 #include <asm/uaccess.h>
 
+static inline int simple_positive(struct dentry *dentry)
+{
+	return dentry->d_inode && !d_unhashed(dentry);
+}
+
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		   struct kstat *stat)
 {
@@ -100,8 +105,10 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			while (n && p != &file->f_path.dentry->d_subdirs) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
-				if (!d_unhashed(next) && next->d_inode)
+				spin_lock(&next->d_lock);
+				if (simple_positive(next))
 					n--;
+				spin_unlock(&next->d_lock);
 				p = p->next;
 			}
 			list_add_tail(&cursor->d_u.d_child, p);
@@ -155,9 +162,13 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
-				if (d_unhashed(next) || !next->d_inode)
+				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+				if (!simple_positive(next)) {
+					spin_unlock(&next->d_lock);
 					continue;
+				}
 
+				spin_unlock(&next->d_lock);
 				spin_unlock(&dcache_lock);
 				if (filldir(dirent, next->d_name.name, 
 					    next->d_name.len, filp->f_pos, 
@@ -259,20 +270,20 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 	return 0;
 }
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return dentry->d_inode && !d_unhashed(dentry);
-}
-
 int simple_empty(struct dentry *dentry)
 {
 	struct dentry *child;
 	int ret = 0;
 
 	spin_lock(&dcache_lock);
-	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
-		if (simple_positive(child))
+	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		if (simple_positive(child)) {
+			spin_unlock(&child->d_lock);
 			goto out;
+		}
+		spin_unlock(&child->d_lock);
+	}
 	ret = 1;
 out:
 	spin_unlock(&dcache_lock);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 895532ac4d98..35e5f5a9ef59 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -174,13 +174,16 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 	list_for_each(p, &inode->i_dentry) {
 		dentry = list_entry(p, struct dentry, d_alias);
 
+		spin_lock(&dentry->d_lock);
 		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
 			mlog(0, "dentry found: %.*s\n",
 			     dentry->d_name.len, dentry->d_name.name);
 
-			dget_locked(dentry);
+			dget_locked_dlock(dentry);
+			spin_unlock(&dentry->d_lock);
 			break;
 		}
+		spin_unlock(&dentry->d_lock);
 
 		dentry = NULL;
 	}
-- 
cgit v1.2.2


From 2fd6b7f50797f2e993eea59e0a0b8c6399c811dc Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:34 +1100
Subject: fs: dcache scale subdirs

Protect d_subdirs and d_child with d_lock, except in filesystems that aren't
using dcache_lock for these anyway (eg. using i_mutex).

Note: if we change the locking rule in future so that ->d_child protection is
provided only with ->d_parent->d_lock, it may allow us to reduce some locking.
But it would be an exception to an otherwise regular locking scheme, so we'd
have to see some good results. Probably not worthwhile.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/autofs4/autofs_i.h    |  11 +++
 fs/autofs4/expire.c      | 127 ++++++++++++------------
 fs/autofs4/root.c        |  18 +++-
 fs/ceph/dir.c            |   6 +-
 fs/ceph/inode.c          |   8 +-
 fs/coda/cache.c          |   2 +
 fs/dcache.c              | 248 +++++++++++++++++++++++++++++++++--------------
 fs/libfs.c               |  24 +++--
 fs/ncpfs/dir.c           |   3 +
 fs/ncpfs/ncplib_kernel.h |   4 +
 fs/notify/fsnotify.c     |   4 +-
 11 files changed, 302 insertions(+), 153 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3912dcf047e5..9d2ae9b30d9f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -254,6 +254,17 @@ static inline int simple_positive(struct dentry *dentry)
 	return dentry->d_inode && !d_unhashed(dentry);
 }
 
+static inline void __autofs4_add_expiring(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+	}
+	return;
+}
+
 static inline void autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index ee6402050f13..968c1434af62 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -91,24 +91,64 @@ done:
 }
 
 /*
- * Calculate next entry in top down tree traversal.
- * From next_mnt in namespace.c - elegant.
+ * Calculate and dget next entry in top down tree traversal.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+						struct dentry *root)
 {
-	struct list_head *next = p->d_subdirs.next;
+	struct list_head *next;
+	struct dentry *p, *ret;
+
+	if (prev == NULL)
+		return dget(prev);
 
+	spin_lock(&dcache_lock);
+relock:
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_subdirs.next;
 	if (next == &p->d_subdirs) {
 		while (1) {
-			if (p == root)
+			struct dentry *parent;
+
+			if (p == root) {
+				spin_unlock(&p->d_lock);
+				spin_unlock(&dcache_lock);
+				dput(prev);
 				return NULL;
+			}
+
+			parent = p->d_parent;
+			if (!spin_trylock(&parent->d_lock)) {
+				spin_unlock(&p->d_lock);
+				cpu_relax();
+				goto relock;
+			}
+			spin_unlock(&p->d_lock);
 			next = p->d_u.d_child.next;
-			if (next != &p->d_parent->d_subdirs)
+			p = parent;
+			if (next != &parent->d_subdirs)
 				break;
-			p = p->d_parent;
 		}
 	}
-	return list_entry(next, struct dentry, d_u.d_child);
+	ret = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(ret)) {
+		spin_unlock(&ret->d_lock);
+		p = ret;
+		goto again;
+	}
+	dget_dlock(ret);
+	spin_unlock(&ret->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&dcache_lock);
+
+	dput(prev);
+
+	return ret;
 }
 
 /*
@@ -158,22 +198,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	if (!simple_positive(top))
 		return 1;
 
-	spin_lock(&dcache_lock);
-	for (p = top; p; p = next_dentry(p, top)) {
-		spin_lock(&p->d_lock);
-		/* Negative dentry - give up */
-		if (!simple_positive(p)) {
-			spin_unlock(&p->d_lock);
-			continue;
-		}
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, top))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget_dlock(p);
-		spin_unlock(&p->d_lock);
-		spin_unlock(&dcache_lock);
-
 		/*
 		 * Is someone visiting anywhere in the subtree ?
 		 * If there's no mount we need to check the usage
@@ -208,10 +237,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 				return 1;
 			}
 		}
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs4_can_expire(top, timeout, do_now))
@@ -230,36 +256,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 	DPRINTK("parent %p %.*s",
 		parent, (int)parent->d_name.len, parent->d_name.name);
 
-	spin_lock(&dcache_lock);
-	for (p = parent; p; p = next_dentry(p, parent)) {
-		spin_lock(&p->d_lock);
-		/* Negative dentry - give up */
-		if (!simple_positive(p)) {
-			spin_unlock(&p->d_lock);
-			continue;
-		}
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, parent))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget_dlock(p);
-		spin_unlock(&p->d_lock);
-		spin_unlock(&dcache_lock);
-
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
 			if (autofs4_mount_busy(mnt, p))
-				goto cont;
+				continue;
 
 			/* Can we expire this guy */
 			if (autofs4_can_expire(p, timeout, do_now))
 				return p;
 		}
-cont:
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 }
 
@@ -310,8 +321,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
 	struct dentry *expired = NULL;
-	struct list_head *next;
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 	struct autofs_info *ino;
@@ -323,26 +334,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	spin_lock(&dcache_lock);
-	next = root->d_subdirs.next;
-
-	/* On exit from the loop expire is set to a dgot dentry
-	 * to expire or it's NULL */
-	while ( next != &root->d_subdirs ) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - give up */
-		spin_lock(&dentry->d_lock);
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-
-		dentry = dget_dlock(dentry);
-		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
-
+	dentry = NULL;
+	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 
@@ -405,11 +398,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		}
 next:
 		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		next = next->next;
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 
 found:
@@ -420,7 +409,11 @@ found:
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
+	spin_lock(&expired->d_parent->d_lock);
+	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+	spin_unlock(&expired->d_lock);
+	spin_unlock(&expired->d_parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return expired;
 }
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 7922509b5b2b..7a9ed6b88291 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -143,10 +143,13 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		return -ENOENT;
 	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
 out:
@@ -253,7 +256,9 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	lookup_type = autofs4_need_mount(nd->flags);
 	spin_lock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		spin_unlock(&sbi->fs_lock);
 		goto follow;
@@ -266,6 +271,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 */
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 		spin_unlock(&sbi->fs_lock);
 
@@ -275,6 +281,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 		goto follow;
 	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sbi->fs_lock);
 follow:
@@ -347,10 +354,12 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	/* Check for a non-mountpoint directory with no contents */
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
+		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
 
 		/* The daemon never causes a mount to trigger */
@@ -367,6 +376,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 		return status;
 	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
 	return 1;
@@ -776,12 +786,16 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EACCES;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&sbi->lookup_lock);
+	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&sbi->lookup_lock);
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	autofs4_add_expiring(dentry);
-	spin_lock(&dentry->d_lock);
+	__autofs4_add_expiring(dentry);
+	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 571f270dca0f..2c924e8d85fe 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -113,6 +113,7 @@ static int __dcache_readdir(struct file *filp,
 	     last);
 
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
 	if (filp->f_pos == 2 || last == NULL ||
@@ -136,7 +137,7 @@ more:
 			fi->at_end = 1;
 			goto out_unlock;
 		}
-		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -154,6 +155,7 @@ more:
 
 	dget_dlock(dentry);
 	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
@@ -188,10 +190,12 @@ more:
 	}
 
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 	p = p->prev;	/* advance to next dentry */
 	goto more;
 
 out_unlock:
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 out:
 	if (last)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bb68c799074d..2c6944473366 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -842,11 +842,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	spin_unlock(&inode->i_lock);
 
 	spin_lock(&dcache_lock);
-	spin_lock(&dn->d_lock);
+	spin_lock(&dir->d_lock);
+	spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&dn->d_u.d_child, &dir->d_subdirs);
 	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
 	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
 	spin_unlock(&dn->d_lock);
+	spin_unlock(&dir->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
@@ -1232,9 +1234,11 @@ retry_lookup:
 		} else {
 			/* reorder parent's d_subdirs */
 			spin_lock(&dcache_lock);
-			spin_lock(&dn->d_lock);
+			spin_lock(&parent->d_lock);
+			spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
 			list_move(&dn->d_u.d_child, &parent->d_subdirs);
 			spin_unlock(&dn->d_lock);
+			spin_unlock(&parent->d_lock);
 			spin_unlock(&dcache_lock);
 		}
 
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 9060f08e70cf..859393fca2b7 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -94,6 +94,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 	struct dentry *de;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 	list_for_each(child, &parent->d_subdirs)
 	{
 		de = list_entry(child, struct dentry, d_u.d_child);
@@ -102,6 +103,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
 			continue;
 		coda_flag_inode(de->d_inode, flag);
 	}
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return; 
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index ee127f9ab274..a661247a20d5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -47,6 +47,8 @@
  *   - d_lru
  *   - d_count
  *   - d_unhashed()
+ *   - d_parent and d_subdirs
+ *   - childrens' d_child and d_parent
  *
  * Ordering:
  * dcache_lock
@@ -223,24 +225,22 @@ static void dentry_lru_move_tail(struct dentry *dentry)
  *
  * If this is the root of the dentry tree, return NULL.
  *
- * dcache_lock and d_lock must be held by caller, are dropped by d_kill.
+ * dcache_lock and d_lock and d_parent->d_lock must be held by caller, and
+ * are dropped by d_kill.
  */
-static struct dentry *d_kill(struct dentry *dentry)
+static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
+	__releases(parent->d_lock)
 	__releases(dcache_lock)
 {
-	struct dentry *parent;
-
 	list_del(&dentry->d_u.d_child);
+	if (parent)
+		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
 	/*
 	 * dentry_iput drops the locks, at which point nobody (except
 	 * transient RCU lookups) can reach this dentry.
 	 */
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
-		parent = dentry->d_parent;
 	d_free(dentry);
 	return parent;
 }
@@ -312,6 +312,7 @@ EXPORT_SYMBOL(d_drop);
 
 void dput(struct dentry *dentry)
 {
+	struct dentry *parent;
 	if (!dentry)
 		return;
 
@@ -319,6 +320,10 @@ repeat:
 	if (dentry->d_count == 1)
 		might_sleep();
 	spin_lock(&dentry->d_lock);
+	if (IS_ROOT(dentry))
+		parent = NULL;
+	else
+		parent = dentry->d_parent;
 	if (dentry->d_count == 1) {
 		if (!spin_trylock(&dcache_lock)) {
 			/*
@@ -330,10 +335,17 @@ repeat:
 			spin_unlock(&dentry->d_lock);
 			goto repeat;
 		}
+		if (parent && !spin_trylock(&parent->d_lock)) {
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_lock);
+			goto repeat;
+		}
 	}
 	dentry->d_count--;
 	if (dentry->d_count) {
 		spin_unlock(&dentry->d_lock);
+		if (parent)
+			spin_unlock(&parent->d_lock);
 		spin_unlock(&dcache_lock);
 		return;
 	}
@@ -355,6 +367,8 @@ repeat:
 	dentry_lru_add(dentry);
 
  	spin_unlock(&dentry->d_lock);
+	if (parent)
+		spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
@@ -363,7 +377,7 @@ unhash_it:
 kill_it:
 	/* if dentry was on the d_lru list delete it from there */
 	dentry_lru_del(dentry);
-	dentry = d_kill(dentry);
+	dentry = d_kill(dentry, parent);
 	if (dentry)
 		goto repeat;
 }
@@ -584,12 +598,13 @@ EXPORT_SYMBOL(d_prune_aliases);
  * quadratic behavior of shrink_dcache_parent(), but is also expected
  * to be beneficial in reducing dentry cache fragmentation.
  */
-static void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
+	__releases(parent->d_lock)
 	__releases(dcache_lock)
 {
 	__d_drop(dentry);
-	dentry = d_kill(dentry);
+	dentry = d_kill(dentry, parent);
 
 	/*
 	 * Prune ancestors.  Locking is simpler than in dput(),
@@ -597,9 +612,20 @@ static void prune_one_dentry(struct dentry * dentry)
 	 */
 	while (dentry) {
 		spin_lock(&dcache_lock);
+again:
 		spin_lock(&dentry->d_lock);
+		if (IS_ROOT(dentry))
+			parent = NULL;
+		else
+			parent = dentry->d_parent;
+		if (parent && !spin_trylock(&parent->d_lock)) {
+			spin_unlock(&dentry->d_lock);
+			goto again;
+		}
 		dentry->d_count--;
 		if (dentry->d_count) {
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			return;
@@ -607,7 +633,7 @@ static void prune_one_dentry(struct dentry * dentry)
 
 		dentry_lru_del(dentry);
 		__d_drop(dentry);
-		dentry = d_kill(dentry);
+		dentry = d_kill(dentry, parent);
 	}
 }
 
@@ -616,29 +642,40 @@ static void shrink_dentry_list(struct list_head *list)
 	struct dentry *dentry;
 
 	while (!list_empty(list)) {
+		struct dentry *parent;
+
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 
 		if (!spin_trylock(&dentry->d_lock)) {
+relock:
 			spin_unlock(&dcache_lru_lock);
 			cpu_relax();
 			spin_lock(&dcache_lru_lock);
 			continue;
 		}
 
-		__dentry_lru_del(dentry);
-
 		/*
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup.  Do not free
 		 * it - just keep it off the LRU list.
 		 */
 		if (dentry->d_count) {
+			__dentry_lru_del(dentry);
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+		if (IS_ROOT(dentry))
+			parent = NULL;
+		else
+			parent = dentry->d_parent;
+		if (parent && !spin_trylock(&parent->d_lock)) {
+			spin_unlock(&dentry->d_lock);
+			goto relock;
+		}
+		__dentry_lru_del(dentry);
 		spin_unlock(&dcache_lru_lock);
 
-		prune_one_dentry(dentry);
+		prune_one_dentry(dentry, parent);
 		/* dcache_lock and dentry->d_lock dropped */
 		spin_lock(&dcache_lock);
 		spin_lock(&dcache_lru_lock);
@@ -833,14 +870,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			/* this is a branch with children - detach all of them
 			 * from the system in one go */
 			spin_lock(&dcache_lock);
+			spin_lock(&dentry->d_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
-				spin_lock(&loop->d_lock);
+				spin_lock_nested(&loop->d_lock,
+						DENTRY_D_LOCK_NESTED);
 				dentry_lru_del(loop);
 				__d_drop(loop);
 				spin_unlock(&loop->d_lock);
-				cond_resched_lock(&dcache_lock);
 			}
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 
 			/* move to the first child */
@@ -868,16 +907,17 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				BUG();
 			}
 
-			if (IS_ROOT(dentry))
+			if (IS_ROOT(dentry)) {
 				parent = NULL;
-			else {
+				list_del(&dentry->d_u.d_child);
+			} else {
 				parent = dentry->d_parent;
 				spin_lock(&parent->d_lock);
 				parent->d_count--;
+				list_del(&dentry->d_u.d_child);
 				spin_unlock(&parent->d_lock);
 			}
 
-			list_del(&dentry->d_u.d_child);
 			detached++;
 
 			inode = dentry->d_inode;
@@ -958,6 +998,7 @@ int have_submounts(struct dentry *parent)
 	spin_lock(&dcache_lock);
 	if (d_mountpoint(parent))
 		goto positive;
+	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
 resume:
@@ -965,22 +1006,34 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
+
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		/* Have we found a mount point ? */
-		if (d_mountpoint(dentry))
+		if (d_mountpoint(dentry)) {
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&this_parent->d_lock);
 			goto positive;
+		}
 		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
 			this_parent = dentry;
+			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
 		}
+		spin_unlock(&dentry->d_lock);
 	}
 	/*
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
 		next = this_parent->d_u.d_child.next;
+		spin_unlock(&this_parent->d_lock);
 		this_parent = this_parent->d_parent;
+		spin_lock(&this_parent->d_lock);
 		goto resume;
 	}
+	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return 0; /* No mount points found in tree */
 positive:
@@ -1010,6 +1063,7 @@ static int select_parent(struct dentry * parent)
 	int found = 0;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
 resume:
@@ -1017,8 +1071,9 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
+		BUG_ON(this_parent == dentry);
 
-		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		/* 
 		 * move only zero ref count dentries to the end 
@@ -1031,33 +1086,44 @@ resume:
 			dentry_lru_del(dentry);
 		}
 
-		spin_unlock(&dentry->d_lock);
-
 		/*
 		 * We can return to the caller if we have found some (this
 		 * ensures forward progress). We'll be coming back to find
 		 * the rest.
 		 */
-		if (found && need_resched())
+		if (found && need_resched()) {
+			spin_unlock(&dentry->d_lock);
 			goto out;
+		}
 
 		/*
 		 * Descend a level if the d_subdirs list is non-empty.
 		 */
 		if (!list_empty(&dentry->d_subdirs)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
 			this_parent = dentry;
+			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
 		}
+
+		spin_unlock(&dentry->d_lock);
 	}
 	/*
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
+		struct dentry *tmp;
 		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
+		tmp = this_parent->d_parent;
+		spin_unlock(&this_parent->d_lock);
+		BUG_ON(tmp == this_parent);
+		this_parent = tmp;
+		spin_lock(&this_parent->d_lock);
 		goto resume;
 	}
 out:
+	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return found;
 }
@@ -1155,18 +1221,19 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
 	INIT_LIST_HEAD(&dentry->d_alias);
+	INIT_LIST_HEAD(&dentry->d_u.d_child);
 
 	if (parent) {
-		dentry->d_parent = dget(parent);
+		spin_lock(&dcache_lock);
+		spin_lock(&parent->d_lock);
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		dentry->d_parent = dget_dlock(parent);
 		dentry->d_sb = parent->d_sb;
-	} else {
-		INIT_LIST_HEAD(&dentry->d_u.d_child);
-	}
-
-	spin_lock(&dcache_lock);
-	if (parent)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-	spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&parent->d_lock);
+		spin_unlock(&dcache_lock);
+	}
 
 	this_cpu_inc(nr_dentry);
 
@@ -1684,13 +1751,18 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 	struct dentry *child;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dparent->d_lock);
 	list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
 		if (dentry == child) {
-			__dget_locked(dentry);
+			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+			__dget_locked_dlock(dentry);
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dparent->d_lock);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
 	}
+	spin_unlock(&dparent->d_lock);
 	spin_unlock(&dcache_lock);
 
 	return 0;
@@ -1802,17 +1874,6 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 }
 EXPORT_SYMBOL(dentry_update_name_case);
 
-/*
- * When switching names, the actual string doesn't strictly have to
- * be preserved in the target - because we're dropping the target
- * anyway. As such, we can just do a simple memcpy() to copy over
- * the new name before we switch.
- *
- * Note that we have to be a lot more careful about getting the hash
- * switched - we have to switch the hash value properly even if it
- * then no longer matches the actual (corrupted) string of the target.
- * The hash value has to match the hash queue that the dentry is on..
- */
 static void switch_names(struct dentry *dentry, struct dentry *target)
 {
 	if (dname_external(target)) {
@@ -1854,18 +1915,53 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 	swap(dentry->d_name.len, target->d_name.len);
 }
 
+static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
+{
+	/*
+	 * XXXX: do we really need to take target->d_lock?
+	 */
+	if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
+		spin_lock(&target->d_parent->d_lock);
+	else {
+		if (d_ancestor(dentry->d_parent, target->d_parent)) {
+			spin_lock(&dentry->d_parent->d_lock);
+			spin_lock_nested(&target->d_parent->d_lock,
+						DENTRY_D_LOCK_NESTED);
+		} else {
+			spin_lock(&target->d_parent->d_lock);
+			spin_lock_nested(&dentry->d_parent->d_lock,
+						DENTRY_D_LOCK_NESTED);
+		}
+	}
+	if (target < dentry) {
+		spin_lock_nested(&target->d_lock, 2);
+		spin_lock_nested(&dentry->d_lock, 3);
+	} else {
+		spin_lock_nested(&dentry->d_lock, 2);
+		spin_lock_nested(&target->d_lock, 3);
+	}
+}
+
+static void dentry_unlock_parents_for_move(struct dentry *dentry,
+					struct dentry *target)
+{
+	if (target->d_parent != dentry->d_parent)
+		spin_unlock(&dentry->d_parent->d_lock);
+	if (target->d_parent != target)
+		spin_unlock(&target->d_parent->d_lock);
+}
+
 /*
- * We cannibalize "target" when moving dentry on top of it,
- * because it's going to be thrown away anyway. We could be more
- * polite about it, though.
- *
- * This forceful removal will result in ugly /proc output if
- * somebody holds a file open that got deleted due to a rename.
- * We could be nicer about the deleted file, and let it show
- * up under the name it had before it was deleted rather than
- * under the original name of the file that was moved on top of it.
+ * When switching names, the actual string doesn't strictly have to
+ * be preserved in the target - because we're dropping the target
+ * anyway. As such, we can just do a simple memcpy() to copy over
+ * the new name before we switch.
+ *
+ * Note that we have to be a lot more careful about getting the hash
+ * switched - we have to switch the hash value properly even if it
+ * then no longer matches the actual (corrupted) string of the target.
+ * The hash value has to match the hash queue that the dentry is on..
  */
- 
 /*
  * d_move_locked - move a dentry
  * @dentry: entry to move
@@ -1879,20 +1975,12 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target)
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
+	BUG_ON(d_ancestor(dentry, target));
+	BUG_ON(d_ancestor(target, dentry));
+
 	write_seqlock(&rename_lock);
-	/*
-	 * XXXX: do we really need to take target->d_lock?
-	 */
-	if (d_ancestor(dentry, target)) {
-		spin_lock(&dentry->d_lock);
-		spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
-	} else if (d_ancestor(target, dentry) || target < dentry) {
-		spin_lock(&target->d_lock);
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	} else {
-		spin_lock(&dentry->d_lock);
-		spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
-	}
+
+	dentry_lock_for_move(dentry, target);
 
 	/* Move the dentry to the target hash queue, if on different bucket */
 	spin_lock(&dcache_hash_lock);
@@ -1924,6 +2012,8 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target)
 	}
 
 	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+
+	dentry_unlock_parents_for_move(dentry, target);
 	spin_unlock(&target->d_lock);
 	fsnotify_d_move(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -2013,17 +2103,20 @@ out_err:
 /*
  * Prepare an anonymous dentry for life in the superblock's dentry tree as a
  * named dentry in place of the dentry to be replaced.
+ * returns with anon->d_lock held!
  */
 static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 {
 	struct dentry *dparent, *aparent;
 
-	switch_names(dentry, anon);
-	swap(dentry->d_name.hash, anon->d_name.hash);
+	dentry_lock_for_move(anon, dentry);
 
 	dparent = dentry->d_parent;
 	aparent = anon->d_parent;
 
+	switch_names(dentry, anon);
+	swap(dentry->d_name.hash, anon->d_name.hash);
+
 	dentry->d_parent = (aparent == anon) ? dentry : aparent;
 	list_del(&dentry->d_u.d_child);
 	if (!IS_ROOT(dentry))
@@ -2038,6 +2131,10 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	else
 		INIT_LIST_HEAD(&anon->d_u.d_child);
 
+	dentry_unlock_parents_for_move(anon, dentry);
+	spin_unlock(&dentry->d_lock);
+
+	/* anon->d_lock still locked, returns locked */
 	anon->d_flags &= ~DCACHE_DISCONNECTED;
 }
 
@@ -2073,7 +2170,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 			/* Is this an anonymous mountpoint that we could splice
 			 * into our tree? */
 			if (IS_ROOT(alias)) {
-				spin_lock(&alias->d_lock);
 				__d_materialise_dentry(dentry, alias);
 				__d_drop(alias);
 				goto found;
@@ -2558,6 +2654,7 @@ void d_genocide(struct dentry *root)
 	struct list_head *next;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
 resume:
@@ -2571,8 +2668,10 @@ resume:
 			continue;
 		}
 		if (!list_empty(&dentry->d_subdirs)) {
-			spin_unlock(&dentry->d_lock);
+			spin_unlock(&this_parent->d_lock);
+			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
 			this_parent = dentry;
+			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
 		}
 		dentry->d_count--;
@@ -2580,12 +2679,13 @@ resume:
 	}
 	if (this_parent != root) {
 		next = this_parent->d_u.d_child.next;
-		spin_lock(&this_parent->d_lock);
 		this_parent->d_count--;
 		spin_unlock(&this_parent->d_lock);
 		this_parent = this_parent->d_parent;
+		spin_lock(&this_parent->d_lock);
 		goto resume;
 	}
+	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 433e7139c23a..cc4794914b52 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -81,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
 
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 {
-	mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+	struct dentry *dentry = file->f_path.dentry;
+	mutex_lock(&dentry->d_inode->i_mutex);
 	switch (origin) {
 		case 1:
 			offset += file->f_pos;
@@ -89,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			if (offset >= 0)
 				break;
 		default:
-			mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+			mutex_unlock(&dentry->d_inode->i_mutex);
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -100,22 +101,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			loff_t n = file->f_pos - 2;
 
 			spin_lock(&dcache_lock);
+			spin_lock(&dentry->d_lock);
+			/* d_lock not required for cursor */
 			list_del(&cursor->d_u.d_child);
-			p = file->f_path.dentry->d_subdirs.next;
-			while (n && p != &file->f_path.dentry->d_subdirs) {
+			p = dentry->d_subdirs.next;
+			while (n && p != &dentry->d_subdirs) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock(&next->d_lock);
+				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
 				if (simple_positive(next))
 					n--;
 				spin_unlock(&next->d_lock);
 				p = p->next;
 			}
 			list_add_tail(&cursor->d_u.d_child, p);
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 		}
 	}
-	mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 	return offset;
 }
 
@@ -156,6 +160,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			/* fallthrough */
 		default:
 			spin_lock(&dcache_lock);
+			spin_lock(&dentry->d_lock);
 			if (filp->f_pos == 2)
 				list_move(q, &dentry->d_subdirs);
 
@@ -169,6 +174,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				}
 
 				spin_unlock(&next->d_lock);
+				spin_unlock(&dentry->d_lock);
 				spin_unlock(&dcache_lock);
 				if (filldir(dirent, next->d_name.name, 
 					    next->d_name.len, filp->f_pos, 
@@ -176,11 +182,15 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 					    dt_type(next->d_inode)) < 0)
 					return 0;
 				spin_lock(&dcache_lock);
+				spin_lock(&dentry->d_lock);
+				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
 				/* next is still alive */
 				list_move(q, p);
+				spin_unlock(&next->d_lock);
 				p = q;
 				filp->f_pos++;
 			}
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 	}
 	return 0;
@@ -276,6 +286,7 @@ int simple_empty(struct dentry *dentry)
 	int ret = 0;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 		if (simple_positive(child)) {
@@ -286,6 +297,7 @@ int simple_empty(struct dentry *dentry)
 	}
 	ret = 1;
 out:
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return ret;
 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index bbbf7922f422..102278ed38bd 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -392,6 +392,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 
 	/* If a pointer is invalid, we search the dentry. */
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
 		dent = list_entry(next, struct dentry, d_u.d_child);
@@ -400,11 +401,13 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 				dget_locked(dent);
 			else
 				dent = NULL;
+			spin_unlock(&parent->d_lock);
 			spin_unlock(&dcache_lock);
 			goto out;
 		}
 		next = next->next;
 	}
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 	return NULL;
 
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 244d1b73fda7..c4b718ff9a6b 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -194,6 +194,7 @@ ncp_renew_dentries(struct dentry *parent)
 	struct dentry *dentry;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
 		dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -205,6 +206,7 @@ ncp_renew_dentries(struct dentry *parent)
 
 		next = next->next;
 	}
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
@@ -216,6 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 	struct dentry *dentry;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
 		dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -223,6 +226,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 		ncp_age_dentry(server, dentry);
 		next = next->next;
 	}
+	spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_lock);
 }
 
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 20dc218707ca..aa4f25e803f6 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -68,17 +68,19 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		/* run all of the children of the original inode and fix their
 		 * d_flags to indicate parental interest (their parent is the
 		 * original inode) */
+		spin_lock(&alias->d_lock);
 		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
 			if (!child->d_inode)
 				continue;
 
-			spin_lock(&child->d_lock);
+			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
 			if (watched)
 				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
 			else
 				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
 			spin_unlock(&child->d_lock);
 		}
+		spin_unlock(&alias->d_lock);
 	}
 	spin_unlock(&dcache_lock);
 }
-- 
cgit v1.2.2


From b23fb0a60379a95e10c671f646b259ea2558421e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:35 +1100
Subject: fs: scale inode alias list

Add a new lock, dcache_inode_lock, to protect the inode's i_dentry list
from concurrent modification. d_alias is also protected by d_lock.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_inode.c    |  2 ++
 fs/affs/amigaffs.c   |  2 ++
 fs/cifs/inode.c      |  3 +++
 fs/dcache.c          | 66 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/exportfs/expfs.c  |  4 ++++
 fs/nfs/getroot.c     |  4 ++++
 fs/notify/fsnotify.c |  2 ++
 fs/ocfs2/dcache.c    |  3 ++-
 8 files changed, 77 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 34bf71b56542..47dfd5d29a6b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -271,9 +271,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 	struct dentry *dentry;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	/* Directory should have only one entry. */
 	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
 	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	return dentry;
 }
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a3..2321cc92d44f 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -129,6 +129,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 	struct list_head *head, *next;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	head = &inode->i_dentry;
 	next = head->next;
 	while (next != head) {
@@ -139,6 +140,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 		}
 		next = next->next;
 	}
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 589f3e3f6e00..003698365ece 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -810,12 +810,15 @@ inode_has_hashed_dentries(struct inode *inode)
 	struct dentry *dentry;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+			spin_unlock(&dcache_inode_lock);
 			spin_unlock(&dcache_lock);
 			return true;
 		}
 	}
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	return false;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index a661247a20d5..de38680ee0ed 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,8 @@
 
 /*
  * Usage:
+ * dcache_inode_lock protects:
+ *   - i_dentry, d_alias, d_inode
  * dcache_hash_lock protects:
  *   - the dcache hash table, s_anon lists
  * dcache_lru_lock protects:
@@ -49,12 +51,14 @@
  *   - d_unhashed()
  *   - d_parent and d_subdirs
  *   - childrens' d_child and d_parent
+ *   - d_alias, d_inode
  *
  * Ordering:
  * dcache_lock
- *   dentry->d_lock
- *     dcache_lru_lock
- *     dcache_hash_lock
+ *   dcache_inode_lock
+ *     dentry->d_lock
+ *       dcache_lru_lock
+ *       dcache_hash_lock
  *
  * If there is an ancestor relationship:
  * dentry->d_parent->...->d_parent->d_lock
@@ -70,11 +74,13 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
+EXPORT_SYMBOL(dcache_inode_lock);
 EXPORT_SYMBOL(dcache_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
@@ -154,6 +160,7 @@ static void d_free(struct dentry *dentry)
  */
 static void dentry_iput(struct dentry * dentry)
 	__releases(dentry->d_lock)
+	__releases(dcache_inode_lock)
 	__releases(dcache_lock)
 {
 	struct inode *inode = dentry->d_inode;
@@ -161,6 +168,7 @@ static void dentry_iput(struct dentry * dentry)
 		dentry->d_inode = NULL;
 		list_del_init(&dentry->d_alias);
 		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 		if (!inode->i_nlink)
 			fsnotify_inoderemove(inode);
@@ -170,6 +178,7 @@ static void dentry_iput(struct dentry * dentry)
 			iput(inode);
 	} else {
 		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 	}
 }
@@ -231,6 +240,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
 	__releases(parent->d_lock)
+	__releases(dcache_inode_lock)
 	__releases(dcache_lock)
 {
 	list_del(&dentry->d_u.d_child);
@@ -332,13 +342,18 @@ repeat:
 			 * want to reduce dcache_lock anyway so this will
 			 * get improved.
 			 */
+drop1:
 			spin_unlock(&dentry->d_lock);
 			goto repeat;
 		}
-		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dentry->d_lock);
+		if (!spin_trylock(&dcache_inode_lock)) {
+drop2:
 			spin_unlock(&dcache_lock);
-			goto repeat;
+			goto drop1;
+		}
+		if (parent && !spin_trylock(&parent->d_lock)) {
+			spin_unlock(&dcache_inode_lock);
+			goto drop2;
 		}
 	}
 	dentry->d_count--;
@@ -369,6 +384,7 @@ repeat:
  	spin_unlock(&dentry->d_lock);
 	if (parent)
 		spin_unlock(&parent->d_lock);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
@@ -558,7 +574,9 @@ struct dentry *d_find_alias(struct inode *inode)
 
 	if (!list_empty(&inode->i_dentry)) {
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
 		de = __d_find_alias(inode, 0);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 	}
 	return de;
@@ -574,18 +592,21 @@ void d_prune_aliases(struct inode *inode)
 	struct dentry *dentry;
 restart:
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_count) {
 			__dget_locked_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_inode_lock);
 			spin_unlock(&dcache_lock);
 			dput(dentry);
 			goto restart;
 		}
 		spin_unlock(&dentry->d_lock);
 	}
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_prune_aliases);
@@ -601,6 +622,7 @@ EXPORT_SYMBOL(d_prune_aliases);
 static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
 	__releases(parent->d_lock)
+	__releases(dcache_inode_lock)
 	__releases(dcache_lock)
 {
 	__d_drop(dentry);
@@ -612,6 +634,7 @@ static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
 	 */
 	while (dentry) {
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
 again:
 		spin_lock(&dentry->d_lock);
 		if (IS_ROOT(dentry))
@@ -627,6 +650,7 @@ again:
 			if (parent)
 				spin_unlock(&parent->d_lock);
 			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_inode_lock);
 			spin_unlock(&dcache_lock);
 			return;
 		}
@@ -676,8 +700,9 @@ relock:
 		spin_unlock(&dcache_lru_lock);
 
 		prune_one_dentry(dentry, parent);
-		/* dcache_lock and dentry->d_lock dropped */
+		/* dcache_lock, dcache_inode_lock and dentry->d_lock dropped */
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
 		spin_lock(&dcache_lru_lock);
 	}
 }
@@ -699,6 +724,7 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 	int cnt = *count;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 relock:
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
@@ -737,8 +763,8 @@ relock:
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lru_lock);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
-
 }
 
 /**
@@ -832,12 +858,14 @@ void shrink_dcache_sb(struct super_block *sb)
 	LIST_HEAD(tmp);
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		list_splice_init(&sb->s_dentry_lru, &tmp);
 		shrink_dentry_list(&tmp);
 	}
 	spin_unlock(&dcache_lru_lock);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1255,9 +1283,11 @@ EXPORT_SYMBOL(d_alloc_name);
 /* the caller must hold dcache_lock */
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
+	spin_lock(&dentry->d_lock);
 	if (inode)
 		list_add(&dentry->d_alias, &inode->i_dentry);
 	dentry->d_inode = inode;
+	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
 
@@ -1280,7 +1310,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 {
 	BUG_ON(!list_empty(&entry->d_alias));
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	__d_instantiate(entry, inode);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
@@ -1341,7 +1373,9 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 	BUG_ON(!list_empty(&entry->d_alias));
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	result = __d_instantiate_unique(entry, inode);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 
 	if (!result) {
@@ -1432,8 +1466,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	tmp->d_parent = tmp; /* make sure dput doesn't croak */
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	res = __d_find_alias(inode, 0);
 	if (res) {
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 		dput(tmp);
 		goto out_iput;
@@ -1450,6 +1486,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
 	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&tmp->d_lock);
+	spin_unlock(&dcache_inode_lock);
 
 	spin_unlock(&dcache_lock);
 	return tmp;
@@ -1482,9 +1519,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 
 	if (inode && S_ISDIR(inode->i_mode)) {
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			spin_unlock(&dcache_inode_lock);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_move(new, dentry);
@@ -1492,6 +1531,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		} else {
 			/* already taking dcache_lock, so d_add() by hand */
 			__d_instantiate(dentry, inode);
+			spin_unlock(&dcache_inode_lock);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
@@ -1566,8 +1606,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 * already has a dentry.
 	 */
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
 		__d_instantiate(found, inode);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 		security_d_instantiate(found, inode);
 		return found;
@@ -1579,6 +1621,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 */
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	dget_locked(new);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(found, inode);
 	d_move(new, found);
@@ -1797,6 +1840,7 @@ void d_delete(struct dentry * dentry)
 	 * Are we the only user?
 	 */
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (dentry->d_count == 1) {
@@ -1810,6 +1854,7 @@ void d_delete(struct dentry * dentry)
 		__d_drop(dentry);
 
 	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 
 	fsnotify_nameremove(dentry, isdir);
@@ -2067,6 +2112,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
  */
 static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
 	__releases(dcache_lock)
+	__releases(dcache_inode_lock)
 {
 	struct mutex *m1 = NULL, *m2 = NULL;
 	struct dentry *ret;
@@ -2092,6 +2138,7 @@ out_unalias:
 	d_move_locked(alias, dentry);
 	ret = alias;
 out_err:
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	if (m2)
 		mutex_unlock(m2);
@@ -2153,6 +2200,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 	BUG_ON(!d_unhashed(dentry));
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 
 	if (!inode) {
 		actual = dentry;
@@ -2196,6 +2244,7 @@ found:
 	_d_rehash(actual);
 	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&actual->d_lock);
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 out_nolock:
 	if (actual == dentry) {
@@ -2207,6 +2256,7 @@ out_nolock:
 	return actual;
 
 shouldnt_be_hashed:
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 	BUG();
 }
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51b304056f10..84b8c460a781 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -48,8 +48,10 @@ find_acceptable_alias(struct dentry *result,
 		return result;
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
 		dget_locked(dentry);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 		if (toput)
 			dput(toput);
@@ -58,8 +60,10 @@ find_acceptable_alias(struct dentry *result,
 			return dentry;
 		}
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
 		toput = dentry;
 	}
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 
 	if (toput)
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index ac7b814ce162..850f67d5f0ac 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -64,7 +64,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 		 * Oops, since the test for IS_ROOT() will fail.
 		 */
 		spin_lock(&dcache_lock);
+		spin_lock(&dcache_inode_lock);
+		spin_lock(&sb->s_root->d_lock);
 		list_del_init(&sb->s_root->d_alias);
+		spin_unlock(&sb->s_root->d_lock);
+		spin_unlock(&dcache_inode_lock);
 		spin_unlock(&dcache_lock);
 	}
 	return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index aa4f25e803f6..ae769fc9b66c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -60,6 +60,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	watched = fsnotify_inode_watches_children(inode);
 
 	spin_lock(&dcache_lock);
+	spin_lock(&dcache_inode_lock);
 	/* run all of the dentries associated with this inode.  Since this is a
 	 * directory, there damn well better only be one item on this list */
 	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -82,6 +83,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		}
 		spin_unlock(&alias->d_lock);
 	}
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 }
 
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 35e5f5a9ef59..c31b5c647ac7 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -170,7 +170,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 	struct dentry *dentry = NULL;
 
 	spin_lock(&dcache_lock);
-
+	spin_lock(&dcache_inode_lock);
 	list_for_each(p, &inode->i_dentry) {
 		dentry = list_entry(p, struct dentry, d_alias);
 
@@ -188,6 +188,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 		dentry = NULL;
 	}
 
+	spin_unlock(&dcache_inode_lock);
 	spin_unlock(&dcache_lock);
 
 	return dentry;
-- 
cgit v1.2.2


From 9abca36087288fe28de4749c71ca003d4b9e3ed0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:36 +1100
Subject: fs: increase d_name lock coverage

Cover d_name with d_lock in more cases, where there may be concurrent
modification to it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index de38680ee0ed..a09f0771fd27 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1350,6 +1350,11 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
 		struct qstr *qstr = &alias->d_name;
 
+		/*
+		 * Don't need alias->d_lock here, because aliases with
+		 * d_parent == entry->d_parent are not subject to name or
+		 * parent changes, because the parent inode i_mutex is held.
+		 */
 		if (qstr->hash != hash)
 			continue;
 		if (alias->d_parent != entry->d_parent)
@@ -2313,7 +2318,9 @@ static int prepend_path(const struct path *path, struct path *root,
 		}
 		parent = dentry->d_parent;
 		prefetch(parent);
+		spin_lock(&dentry->d_lock);
 		error = prepend_name(buffer, buflen, &dentry->d_name);
+		spin_unlock(&dentry->d_lock);
 		if (!error)
 			error = prepend(buffer, buflen, "/", 1);
 		if (error)
@@ -2515,10 +2522,13 @@ static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
+		int error;
 
 		prefetch(parent);
-		if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
-		    (prepend(&end, &buflen, "/", 1) != 0))
+		spin_lock(&dentry->d_lock);
+		error = prepend_name(&end, &buflen, &dentry->d_name);
+		spin_unlock(&dentry->d_lock);
+		if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
 			goto Elong;
 
 		retval = end;
-- 
cgit v1.2.2


From 949854d02455080d20cd3e1db28a3a18daf7599d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:37 +1100
Subject: fs: Use rename lock and RCU for multi-step operations

The remaining usages for dcache_lock is to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree.
Also, to walk in the leaf->root direction in the tree where we don't have
a natural d_lock ordering.

This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.

Solve this instead by using the rename seqlock for multi-step read-side
operations, retry in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against.  Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
when dropping locks so also need to check and retry for that.

We can also use the rename lock in cases where livelock is a worry (and it
is introduced in subsequent patch).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/autofs4/waitq.c |  18 +++++--
 fs/dcache.c        | 134 ++++++++++++++++++++++++++++++++++++++++++++---------
 fs/nfs/namespace.c |  14 +++++-
 3 files changed, 139 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f8..4be8f778a418 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,25 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 {
 	struct dentry *root = sbi->sb->s_root;
 	struct dentry *tmp;
-	char *buf = *name;
+	char *buf;
 	char *p;
-	int len = 0;
-
+	int len;
+	unsigned seq;
+
+rename_retry:
+	buf = *name;
+	len = 0;
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
 	spin_lock(&dcache_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
 		spin_unlock(&dcache_lock);
+		rcu_read_unlock();
+		if (read_seqretry(&rename_lock, seq))
+			goto rename_retry;
 		return 0;
 	}
 
@@ -209,6 +218,9 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
 	spin_unlock(&dcache_lock);
+	rcu_read_unlock();
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 
 	return len;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index a09f0771fd27..a9bc4ecc21e1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -80,6 +80,7 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
+EXPORT_SYMBOL(rename_lock);
 EXPORT_SYMBOL(dcache_inode_lock);
 EXPORT_SYMBOL(dcache_lock);
 
@@ -243,6 +244,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(dcache_inode_lock)
 	__releases(dcache_lock)
 {
+	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
 	if (parent)
 		spin_unlock(&parent->d_lock);
@@ -1017,11 +1019,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
  * Return true if the parent or its subdirectories contain
  * a mount point
  */
- 
 int have_submounts(struct dentry *parent)
 {
-	struct dentry *this_parent = parent;
+	struct dentry *this_parent;
 	struct list_head *next;
+	unsigned seq;
+
+rename_retry:
+	this_parent = parent;
+	seq = read_seqbegin(&rename_lock);
 
 	spin_lock(&dcache_lock);
 	if (d_mountpoint(parent))
@@ -1055,17 +1061,37 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		next = this_parent->d_u.d_child.next;
+		struct dentry *tmp;
+		struct dentry *child;
+
+		tmp = this_parent->d_parent;
+		rcu_read_lock();
 		spin_unlock(&this_parent->d_lock);
-		this_parent = this_parent->d_parent;
+		child = this_parent;
+		this_parent = tmp;
 		spin_lock(&this_parent->d_lock);
+		/* might go back up the wrong parent if we have had a rename
+		 * or deletion */
+		if (this_parent != child->d_parent ||
+				read_seqretry(&rename_lock, seq)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_unlock(&dcache_lock);
+			rcu_read_unlock();
+			goto rename_retry;
+		}
+		rcu_read_unlock();
+		next = child->d_u.d_child.next;
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 	return 0; /* No mount points found in tree */
 positive:
 	spin_unlock(&dcache_lock);
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 	return 1;
 }
 EXPORT_SYMBOL(have_submounts);
@@ -1086,10 +1112,15 @@ EXPORT_SYMBOL(have_submounts);
  */
 static int select_parent(struct dentry * parent)
 {
-	struct dentry *this_parent = parent;
+	struct dentry *this_parent;
 	struct list_head *next;
+	unsigned seq;
 	int found = 0;
 
+rename_retry:
+	this_parent = parent;
+	seq = read_seqbegin(&rename_lock);
+
 	spin_lock(&dcache_lock);
 	spin_lock(&this_parent->d_lock);
 repeat:
@@ -1099,7 +1130,6 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
-		BUG_ON(this_parent == dentry);
 
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
@@ -1142,17 +1172,32 @@ resume:
 	 */
 	if (this_parent != parent) {
 		struct dentry *tmp;
-		next = this_parent->d_u.d_child.next;
+		struct dentry *child;
+
 		tmp = this_parent->d_parent;
+		rcu_read_lock();
 		spin_unlock(&this_parent->d_lock);
-		BUG_ON(tmp == this_parent);
+		child = this_parent;
 		this_parent = tmp;
 		spin_lock(&this_parent->d_lock);
+		/* might go back up the wrong parent if we have had a rename
+		 * or deletion */
+		if (this_parent != child->d_parent ||
+				read_seqretry(&rename_lock, seq)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_unlock(&dcache_lock);
+			rcu_read_unlock();
+			goto rename_retry;
+		}
+		rcu_read_unlock();
+		next = child->d_u.d_child.next;
 		goto resume;
 	}
 out:
 	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 	return found;
 }
 
@@ -1654,7 +1699,7 @@ EXPORT_SYMBOL(d_add_ci);
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
 	struct dentry * dentry = NULL;
-	unsigned long seq;
+	unsigned seq;
 
         do {
                 seq = read_seqbegin(&rename_lock);
@@ -2290,7 +2335,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * Caller holds the dcache_lock.
+ * Caller holds the rename_lock.
  *
  * If path is not reachable from the supplied root, then the value of
  * root is changed (without modifying refcounts).
@@ -2377,7 +2422,9 @@ char *__d_path(const struct path *path, struct path *root,
 
 	prepend(&res, &buflen, "\0", 1);
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	error = prepend_path(path, root, &res, &buflen);
+	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
 
 	if (error)
@@ -2441,10 +2488,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
 
 	get_fs_root(current->fs, &root);
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	tmp = root;
 	error = path_with_deleted(path, &tmp, &res, &buflen);
 	if (error)
 		res = ERR_PTR(error);
+	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
 	path_put(&root);
 	return res;
@@ -2472,10 +2521,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
 
 	get_fs_root(current->fs, &root);
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	tmp = root;
 	error = path_with_deleted(path, &tmp, &res, &buflen);
 	if (!error && !path_equal(&tmp, &root))
 		error = prepend_unreachable(&res, &buflen);
+	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
 	path_put(&root);
 	if (error)
@@ -2544,7 +2595,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
 	char *retval;
 
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	retval = __dentry_path(dentry, buf, buflen);
+	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
 
 	return retval;
@@ -2557,6 +2610,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 	char *retval;
 
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	if (d_unlinked(dentry)) {
 		p = buf + buflen;
 		if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2564,6 +2618,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 		buflen++;
 	}
 	retval = __dentry_path(dentry, buf, buflen);
+	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
 	if (!IS_ERR(retval) && p)
 		*p = '/';	/* restore '/' overriden with '\0' */
@@ -2604,6 +2659,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 	error = -ENOENT;
 	spin_lock(&dcache_lock);
+	write_seqlock(&rename_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		struct path tmp = root;
@@ -2612,6 +2668,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &tmp, &cwd, &buflen);
+		write_sequnlock(&rename_lock);
 		spin_unlock(&dcache_lock);
 
 		if (error)
@@ -2631,8 +2688,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 			if (copy_to_user(buf, cwd, len))
 				error = -EFAULT;
 		}
-	} else
+	} else {
+		write_sequnlock(&rename_lock);
 		spin_unlock(&dcache_lock);
+	}
 
 out:
 	path_put(&pwd);
@@ -2660,25 +2719,25 @@ out:
 int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
 	int result;
-	unsigned long seq;
+	unsigned seq;
 
 	if (new_dentry == old_dentry)
 		return 1;
 
-	/*
-	 * Need rcu_readlock to protect against the d_parent trashing
-	 * due to d_move
-	 */
-	rcu_read_lock();
 	do {
 		/* for restarting inner loop in case of seq retry */
 		seq = read_seqbegin(&rename_lock);
+		/*
+		 * Need rcu_readlock to protect against the d_parent trashing
+		 * due to d_move
+		 */
+		rcu_read_lock();
 		if (d_ancestor(old_dentry, new_dentry))
 			result = 1;
 		else
 			result = 0;
+		rcu_read_unlock();
 	} while (read_seqretry(&rename_lock, seq));
-	rcu_read_unlock();
 
 	return result;
 }
@@ -2710,9 +2769,13 @@ EXPORT_SYMBOL(path_is_under);
 
 void d_genocide(struct dentry *root)
 {
-	struct dentry *this_parent = root;
+	struct dentry *this_parent;
 	struct list_head *next;
+	unsigned seq;
 
+rename_retry:
+	this_parent = root;
+	seq = read_seqbegin(&rename_lock);
 	spin_lock(&dcache_lock);
 	spin_lock(&this_parent->d_lock);
 repeat:
@@ -2722,6 +2785,7 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
+
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (d_unhashed(dentry) || !dentry->d_inode) {
 			spin_unlock(&dentry->d_lock);
@@ -2734,19 +2798,43 @@ resume:
 			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
 			goto repeat;
 		}
-		dentry->d_count--;
+		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+			dentry->d_flags |= DCACHE_GENOCIDE;
+			dentry->d_count--;
+		}
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
-		next = this_parent->d_u.d_child.next;
-		this_parent->d_count--;
+		struct dentry *tmp;
+		struct dentry *child;
+
+		tmp = this_parent->d_parent;
+		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
+			this_parent->d_flags |= DCACHE_GENOCIDE;
+			this_parent->d_count--;
+		}
+		rcu_read_lock();
 		spin_unlock(&this_parent->d_lock);
-		this_parent = this_parent->d_parent;
+		child = this_parent;
+		this_parent = tmp;
 		spin_lock(&this_parent->d_lock);
+		/* might go back up the wrong parent if we have had a rename
+		 * or deletion */
+		if (this_parent != child->d_parent ||
+				read_seqretry(&rename_lock, seq)) {
+			spin_unlock(&this_parent->d_lock);
+			spin_unlock(&dcache_lock);
+			rcu_read_unlock();
+			goto rename_retry;
+		}
+		rcu_read_unlock();
+		next = child->d_u.d_child.next;
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
 	spin_unlock(&dcache_lock);
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 }
 
 /**
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf3..78c0ebb0b07c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,11 +49,17 @@ char *nfs_path(const char *base,
 	       const struct dentry *dentry,
 	       char *buffer, ssize_t buflen)
 {
-	char *end = buffer+buflen;
+	char *end;
 	int namelen;
+	unsigned seq;
 
+rename_retry:
+	end = buffer+buflen;
 	*--end = '\0';
 	buflen--;
+
+	seq = read_seqbegin(&rename_lock);
+	rcu_read_lock();
 	spin_lock(&dcache_lock);
 	while (!IS_ROOT(dentry) && dentry != droot) {
 		namelen = dentry->d_name.len;
@@ -66,6 +72,9 @@ char *nfs_path(const char *base,
 		dentry = dentry->d_parent;
 	}
 	spin_unlock(&dcache_lock);
+	rcu_read_unlock();
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 	if (*end != '/') {
 		if (--buflen < 0)
 			goto Elong;
@@ -83,6 +92,9 @@ char *nfs_path(const char *base,
 	return end;
 Elong_unlock:
 	spin_unlock(&dcache_lock);
+	rcu_read_unlock();
+	if (read_seqretry(&rename_lock, seq))
+		goto rename_retry;
 Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
-- 
cgit v1.2.2


From b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:38 +1100
Subject: fs: dcache remove dcache_lock

dcache_lock no longer protects anything. remove it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_inode.c               |   2 -
 fs/affs/amigaffs.c              |   2 -
 fs/autofs4/autofs_i.h           |   3 +
 fs/autofs4/expire.c             |  10 +--
 fs/autofs4/root.c               |  44 +++++------
 fs/autofs4/waitq.c              |   7 +-
 fs/ceph/dir.c                   |   6 +-
 fs/ceph/inode.c                 |   4 -
 fs/cifs/inode.c                 |   3 -
 fs/coda/cache.c                 |   2 -
 fs/configfs/configfs_internal.h |   2 -
 fs/configfs/inode.c             |   6 +-
 fs/dcache.c                     | 160 +++++-----------------------------------
 fs/exportfs/expfs.c             |   4 -
 fs/libfs.c                      |   8 --
 fs/namei.c                      |   9 +--
 fs/ncpfs/dir.c                  |   3 -
 fs/ncpfs/ncplib_kernel.h        |   4 -
 fs/nfs/dir.c                    |   3 -
 fs/nfs/getroot.c                |   2 -
 fs/nfs/namespace.c              |   3 -
 fs/notify/fsnotify.c            |   2 -
 fs/ocfs2/dcache.c               |   2 -
 23 files changed, 59 insertions(+), 232 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 47dfd5d29a6b..1073bca8488c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -270,13 +270,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	/* Directory should have only one entry. */
 	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
 	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	return dentry;
 }
 
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 2321cc92d44f..600101a21ba2 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 	void *data = dentry->d_fsdata;
 	struct list_head *head, *next;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	head = &inode->i_dentry;
 	next = head->next;
@@ -141,7 +140,6 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 		next = next->next;
 	}
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 }
 
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 9d2ae9b30d9f..0fffe1c24cec 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
+extern spinlock_t autofs4_lock;
+
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 968c1434af62..2f7951d67d16 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -102,7 +102,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
 	if (prev == NULL)
 		return dget(prev);
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 relock:
 	p = prev;
 	spin_lock(&p->d_lock);
@@ -114,7 +114,7 @@ again:
 
 			if (p == root) {
 				spin_unlock(&p->d_lock);
-				spin_unlock(&dcache_lock);
+				spin_unlock(&autofs4_lock);
 				dput(prev);
 				return NULL;
 			}
@@ -144,7 +144,7 @@ again:
 	dget_dlock(ret);
 	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	dput(prev);
 
@@ -408,13 +408,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&expired->d_parent->d_lock);
 	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&expired->d_lock);
 	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 	return expired;
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 7a9ed6b88291..10ca68a96dc7 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,6 +23,8 @@
 
 #include "autofs_i.h"
 
+DEFINE_SPINLOCK(autofs4_lock);
+
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -142,15 +144,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 		return -ENOENT;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -255,11 +257,11 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	/* We trigger a mount for almost all flags */
 	lookup_type = autofs4_need_mount(nd->flags);
 	spin_lock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&dentry->d_lock);
 	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 		spin_unlock(&sbi->fs_lock);
 		goto follow;
 	}
@@ -272,7 +274,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 		spin_unlock(&sbi->fs_lock);
 
 		status = try_to_fill_dentry(dentry, nd->flags);
@@ -282,7 +284,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		goto follow;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 	spin_unlock(&sbi->fs_lock);
 follow:
 	/*
@@ -353,14 +355,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&dentry->d_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 
 		/* The daemon never causes a mount to trigger */
 		if (oz_mode)
@@ -377,7 +379,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		return status;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return 1;
 }
@@ -432,7 +434,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -465,14 +467,14 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -487,7 +489,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -520,14 +522,14 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -763,12 +765,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return 0;
 }
@@ -785,20 +787,20 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
 	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 4be8f778a418..c5f8459c905e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -194,14 +194,15 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 rename_retry:
 	buf = *name;
 	len = 0;
+
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
-		spin_unlock(&dcache_lock);
+		spin_unlock(&autofs4_lock);
 		rcu_read_unlock();
 		if (read_seqretry(&rename_lock, seq))
 			goto rename_retry;
@@ -217,7 +218,7 @@ rename_retry:
 		p -= tmp->d_name.len;
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2c924e8d85fe..58abc3da6111 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -112,7 +112,6 @@ static int __dcache_readdir(struct file *filp,
 	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
 	     last);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
@@ -156,7 +155,6 @@ more:
 	dget_dlock(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -182,21 +180,19 @@ more:
 
 	filp->f_pos++;
 
-	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
 	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
 		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
 		err = -EAGAIN;
 		goto out;
 	}
 
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 	p = p->prev;	/* advance to next dentry */
 	goto more;
 
 out_unlock:
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 out:
 	if (last)
 		dput(last);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2c6944473366..2a48cafc174b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -841,7 +841,6 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	di->offset = ceph_inode(inode)->i_max_offset++;
 	spin_unlock(&inode->i_lock);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dir->d_lock);
 	spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&dn->d_u.d_child, &dir->d_subdirs);
@@ -849,7 +848,6 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
 	spin_unlock(&dn->d_lock);
 	spin_unlock(&dir->d_lock);
-	spin_unlock(&dcache_lock);
 }
 
 /*
@@ -1233,13 +1231,11 @@ retry_lookup:
 			goto retry_lookup;
 		} else {
 			/* reorder parent's d_subdirs */
-			spin_lock(&dcache_lock);
 			spin_lock(&parent->d_lock);
 			spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
 			list_move(&dn->d_u.d_child, &parent->d_subdirs);
 			spin_unlock(&dn->d_lock);
 			spin_unlock(&parent->d_lock);
-			spin_unlock(&dcache_lock);
 		}
 
 		di = dn->d_fsdata;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 003698365ece..99b9a2cc14b7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -809,17 +809,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
 			spin_unlock(&dcache_inode_lock);
-			spin_unlock(&dcache_lock);
 			return true;
 		}
 	}
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	return false;
 }
 
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 859393fca2b7..5525e1c660fd 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,6 @@ static void coda_flag_children(struct dentry *parent, int flag)
 	struct list_head *child;
 	struct dentry *de;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 	list_for_each(child, &parent->d_subdirs)
 	{
@@ -104,7 +103,6 @@ static void coda_flag_children(struct dentry *parent, int flag)
 		coda_flag_inode(de->d_inode, flag);
 	}
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 	return; 
 }
 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index e58b4c30e216..026cf68553a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -120,7 +120,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 {
 	struct config_item * item = NULL;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_unhashed(dentry)) {
 		struct configfs_dirent * sd = dentry->d_fsdata;
@@ -131,7 +130,6 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 			item = config_item_get(sd->s_element);
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 
 	return item;
 }
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 79b37765d8ff..fb3a55fff82a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 	struct dentry * dentry = sd->s_dentry;
 
 	if (dentry) {
-		spin_lock(&dcache_lock);
 		spin_lock(&dentry->d_lock);
 		if (!(d_unhashed(dentry) && dentry->d_inode)) {
 			dget_locked_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
 			simple_unlink(parent->d_inode, dentry);
-		} else {
+		} else
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
-		}
 	}
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index a9bc4ecc21e1..0dbae053b664 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -54,11 +54,10 @@
  *   - d_alias, d_inode
  *
  * Ordering:
- * dcache_lock
- *   dcache_inode_lock
- *     dentry->d_lock
- *       dcache_lru_lock
- *       dcache_hash_lock
+ * dcache_inode_lock
+ *   dentry->d_lock
+ *     dcache_lru_lock
+ *     dcache_hash_lock
  *
  * If there is an ancestor relationship:
  * dentry->d_parent->...->d_parent->d_lock
@@ -77,12 +76,10 @@ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 EXPORT_SYMBOL(dcache_inode_lock);
-EXPORT_SYMBOL(dcache_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
@@ -139,7 +136,7 @@ static void __d_free(struct rcu_head *head)
 }
 
 /*
- * no dcache_lock, please.
+ * no locks, please.
  */
 static void d_free(struct dentry *dentry)
 {
@@ -162,7 +159,6 @@ static void d_free(struct dentry *dentry)
 static void dentry_iput(struct dentry * dentry)
 	__releases(dentry->d_lock)
 	__releases(dcache_inode_lock)
-	__releases(dcache_lock)
 {
 	struct inode *inode = dentry->d_inode;
 	if (inode) {
@@ -170,7 +166,6 @@ static void dentry_iput(struct dentry * dentry)
 		list_del_init(&dentry->d_alias);
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 		if (!inode->i_nlink)
 			fsnotify_inoderemove(inode);
 		if (dentry->d_op && dentry->d_op->d_iput)
@@ -180,7 +175,6 @@ static void dentry_iput(struct dentry * dentry)
 	} else {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 	}
 }
 
@@ -235,14 +229,13 @@ static void dentry_lru_move_tail(struct dentry *dentry)
  *
  * If this is the root of the dentry tree, return NULL.
  *
- * dcache_lock and d_lock and d_parent->d_lock must be held by caller, and
- * are dropped by d_kill.
+ * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
+ * d_kill.
  */
 static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
 	__releases(parent->d_lock)
 	__releases(dcache_inode_lock)
-	__releases(dcache_lock)
 {
 	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
@@ -285,11 +278,9 @@ EXPORT_SYMBOL(__d_drop);
 
 void d_drop(struct dentry *dentry)
 {
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_drop);
 
@@ -337,21 +328,10 @@ repeat:
 	else
 		parent = dentry->d_parent;
 	if (dentry->d_count == 1) {
-		if (!spin_trylock(&dcache_lock)) {
-			/*
-			 * Something of a livelock possibility we could avoid
-			 * by taking dcache_lock and trying again, but we
-			 * want to reduce dcache_lock anyway so this will
-			 * get improved.
-			 */
-drop1:
-			spin_unlock(&dentry->d_lock);
-			goto repeat;
-		}
 		if (!spin_trylock(&dcache_inode_lock)) {
 drop2:
-			spin_unlock(&dcache_lock);
-			goto drop1;
+			spin_unlock(&dentry->d_lock);
+			goto repeat;
 		}
 		if (parent && !spin_trylock(&parent->d_lock)) {
 			spin_unlock(&dcache_inode_lock);
@@ -363,7 +343,6 @@ drop2:
 		spin_unlock(&dentry->d_lock);
 		if (parent)
 			spin_unlock(&parent->d_lock);
-		spin_unlock(&dcache_lock);
 		return;
 	}
 
@@ -387,7 +366,6 @@ drop2:
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
@@ -418,11 +396,9 @@ int d_invalidate(struct dentry * dentry)
 	/*
 	 * If it's already been dropped, return OK.
 	 */
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (d_unhashed(dentry)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
 		return 0;
 	}
 	/*
@@ -431,9 +407,7 @@ int d_invalidate(struct dentry * dentry)
 	 */
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
 		shrink_dcache_parent(dentry);
-		spin_lock(&dcache_lock);
 		spin_lock(&dentry->d_lock);
 	}
 
@@ -450,19 +424,17 @@ int d_invalidate(struct dentry * dentry)
 	if (dentry->d_count > 1) {
 		if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
 			return -EBUSY;
 		}
 	}
 
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 	return 0;
 }
 EXPORT_SYMBOL(d_invalidate);
 
-/* This must be called with dcache_lock and d_lock held */
+/* This must be called with d_lock held */
 static inline struct dentry * __dget_locked_dlock(struct dentry *dentry)
 {
 	dentry->d_count++;
@@ -470,7 +442,7 @@ static inline struct dentry * __dget_locked_dlock(struct dentry *dentry)
 	return dentry;
 }
 
-/* This should be called _only_ with dcache_lock held */
+/* This must be called with d_lock held */
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
@@ -575,11 +547,9 @@ struct dentry *d_find_alias(struct inode *inode)
 	struct dentry *de = NULL;
 
 	if (!list_empty(&inode->i_dentry)) {
-		spin_lock(&dcache_lock);
 		spin_lock(&dcache_inode_lock);
 		de = __d_find_alias(inode, 0);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 	}
 	return de;
 }
@@ -593,7 +563,6 @@ void d_prune_aliases(struct inode *inode)
 {
 	struct dentry *dentry;
 restart:
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
@@ -602,14 +571,12 @@ restart:
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_inode_lock);
-			spin_unlock(&dcache_lock);
 			dput(dentry);
 			goto restart;
 		}
 		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
@@ -625,17 +592,14 @@ static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
 	__releases(parent->d_lock)
 	__releases(dcache_inode_lock)
-	__releases(dcache_lock)
 {
 	__d_drop(dentry);
 	dentry = d_kill(dentry, parent);
 
 	/*
-	 * Prune ancestors.  Locking is simpler than in dput(),
-	 * because dcache_lock needs to be taken anyway.
+	 * Prune ancestors.
 	 */
 	while (dentry) {
-		spin_lock(&dcache_lock);
 		spin_lock(&dcache_inode_lock);
 again:
 		spin_lock(&dentry->d_lock);
@@ -653,7 +617,6 @@ again:
 				spin_unlock(&parent->d_lock);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_inode_lock);
-			spin_unlock(&dcache_lock);
 			return;
 		}
 
@@ -702,8 +665,7 @@ relock:
 		spin_unlock(&dcache_lru_lock);
 
 		prune_one_dentry(dentry, parent);
-		/* dcache_lock, dcache_inode_lock and dentry->d_lock dropped */
-		spin_lock(&dcache_lock);
+		/* dcache_inode_lock and dentry->d_lock dropped */
 		spin_lock(&dcache_inode_lock);
 		spin_lock(&dcache_lru_lock);
 	}
@@ -725,7 +687,6 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 	LIST_HEAD(tmp);
 	int cnt = *count;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 relock:
 	spin_lock(&dcache_lru_lock);
@@ -766,7 +727,6 @@ relock:
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lru_lock);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 }
 
 /**
@@ -788,7 +748,6 @@ static void prune_dcache(int count)
 
 	if (unused == 0 || count == 0)
 		return;
-	spin_lock(&dcache_lock);
 	if (count >= unused)
 		prune_ratio = 1;
 	else
@@ -825,11 +784,9 @@ static void prune_dcache(int count)
 		if (down_read_trylock(&sb->s_umount)) {
 			if ((sb->s_root != NULL) &&
 			    (!list_empty(&sb->s_dentry_lru))) {
-				spin_unlock(&dcache_lock);
 				__shrink_dcache_sb(sb, &w_count,
 						DCACHE_REFERENCED);
 				pruned -= w_count;
-				spin_lock(&dcache_lock);
 			}
 			up_read(&sb->s_umount);
 		}
@@ -845,7 +802,6 @@ static void prune_dcache(int count)
 	if (p)
 		__put_super(p);
 	spin_unlock(&sb_lock);
-	spin_unlock(&dcache_lock);
 }
 
 /**
@@ -859,7 +815,6 @@ void shrink_dcache_sb(struct super_block *sb)
 {
 	LIST_HEAD(tmp);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
@@ -868,7 +823,6 @@ void shrink_dcache_sb(struct super_block *sb)
 	}
 	spin_unlock(&dcache_lru_lock);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -885,12 +839,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 	BUG_ON(!IS_ROOT(dentry));
 
 	/* detach this root from the system */
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	dentry_lru_del(dentry);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 
 	for (;;) {
 		/* descend to the first leaf in the current subtree */
@@ -899,7 +851,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 			/* this is a branch with children - detach all of them
 			 * from the system in one go */
-			spin_lock(&dcache_lock);
 			spin_lock(&dentry->d_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
@@ -910,7 +861,6 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				spin_unlock(&loop->d_lock);
 			}
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
 
 			/* move to the first child */
 			dentry = list_entry(dentry->d_subdirs.next,
@@ -977,8 +927,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 /*
  * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock, and only need dcache_lock when
- *   removing the dentry from the system lists and hashes because:
+ * - we don't need to use dentry->d_lock because:
  *   - the superblock is detached from all mountings and open files, so the
  *     dentry trees will not be rearranged by the VFS
  *   - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -1029,7 +978,6 @@ rename_retry:
 	this_parent = parent;
 	seq = read_seqbegin(&rename_lock);
 
-	spin_lock(&dcache_lock);
 	if (d_mountpoint(parent))
 		goto positive;
 	spin_lock(&this_parent->d_lock);
@@ -1075,7 +1023,6 @@ resume:
 		if (this_parent != child->d_parent ||
 				read_seqretry(&rename_lock, seq)) {
 			spin_unlock(&this_parent->d_lock);
-			spin_unlock(&dcache_lock);
 			rcu_read_unlock();
 			goto rename_retry;
 		}
@@ -1084,12 +1031,10 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
-	spin_unlock(&dcache_lock);
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
 	return 0; /* No mount points found in tree */
 positive:
-	spin_unlock(&dcache_lock);
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
 	return 1;
@@ -1121,7 +1066,6 @@ rename_retry:
 	this_parent = parent;
 	seq = read_seqbegin(&rename_lock);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
@@ -1185,7 +1129,6 @@ resume:
 		if (this_parent != child->d_parent ||
 				read_seqretry(&rename_lock, seq)) {
 			spin_unlock(&this_parent->d_lock);
-			spin_unlock(&dcache_lock);
 			rcu_read_unlock();
 			goto rename_retry;
 		}
@@ -1195,7 +1138,6 @@ resume:
 	}
 out:
 	spin_unlock(&this_parent->d_lock);
-	spin_unlock(&dcache_lock);
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
 	return found;
@@ -1297,7 +1239,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	INIT_LIST_HEAD(&dentry->d_u.d_child);
 
 	if (parent) {
-		spin_lock(&dcache_lock);
 		spin_lock(&parent->d_lock);
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		dentry->d_parent = dget_dlock(parent);
@@ -1305,7 +1246,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&parent->d_lock);
-		spin_unlock(&dcache_lock);
 	}
 
 	this_cpu_inc(nr_dentry);
@@ -1325,7 +1265,6 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 }
 EXPORT_SYMBOL(d_alloc_name);
 
-/* the caller must hold dcache_lock */
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
@@ -1354,11 +1293,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 void d_instantiate(struct dentry *entry, struct inode * inode)
 {
 	BUG_ON(!list_empty(&entry->d_alias));
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	__d_instantiate(entry, inode);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
 EXPORT_SYMBOL(d_instantiate);
@@ -1422,11 +1359,9 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 
 	BUG_ON(!list_empty(&entry->d_alias));
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	result = __d_instantiate_unique(entry, inode);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 
 	if (!result) {
 		security_d_instantiate(entry, inode);
@@ -1515,12 +1450,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	}
 	tmp->d_parent = tmp; /* make sure dput doesn't croak */
 
-	spin_lock(&dcache_lock);
+
 	spin_lock(&dcache_inode_lock);
 	res = __d_find_alias(inode, 0);
 	if (res) {
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 		dput(tmp);
 		goto out_iput;
 	}
@@ -1538,7 +1472,6 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	spin_unlock(&tmp->d_lock);
 	spin_unlock(&dcache_inode_lock);
 
-	spin_unlock(&dcache_lock);
 	return tmp;
 
  out_iput:
@@ -1568,21 +1501,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 	struct dentry *new = NULL;
 
 	if (inode && S_ISDIR(inode->i_mode)) {
-		spin_lock(&dcache_lock);
 		spin_lock(&dcache_inode_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
 			spin_unlock(&dcache_inode_lock);
-			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-			/* already taking dcache_lock, so d_add() by hand */
+			/* already taking dcache_inode_lock, so d_add() by hand */
 			__d_instantiate(dentry, inode);
 			spin_unlock(&dcache_inode_lock);
-			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
 		}
@@ -1655,12 +1585,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 * Negative dentry: instantiate it unless the inode is a directory and
 	 * already has a dentry.
 	 */
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
 		__d_instantiate(found, inode);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 		security_d_instantiate(found, inode);
 		return found;
 	}
@@ -1672,7 +1600,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	dget_locked(new);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	security_d_instantiate(found, inode);
 	d_move(new, found);
 	iput(inode);
@@ -1843,7 +1770,6 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 {
 	struct dentry *child;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dparent->d_lock);
 	list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
 		if (dentry == child) {
@@ -1851,12 +1777,10 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 			__dget_locked_dlock(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dparent->d_lock);
-			spin_unlock(&dcache_lock);
 			return 1;
 		}
 	}
 	spin_unlock(&dparent->d_lock);
-	spin_unlock(&dcache_lock);
 
 	return 0;
 }
@@ -1889,7 +1813,6 @@ void d_delete(struct dentry * dentry)
 	/*
 	 * Are we the only user?
 	 */
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
@@ -1905,7 +1828,6 @@ void d_delete(struct dentry * dentry)
 
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 
 	fsnotify_nameremove(dentry, isdir);
 }
@@ -1932,13 +1854,11 @@ static void _d_rehash(struct dentry * entry)
  
 void d_rehash(struct dentry * entry)
 {
-	spin_lock(&dcache_lock);
 	spin_lock(&entry->d_lock);
 	spin_lock(&dcache_hash_lock);
 	_d_rehash(entry);
 	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&entry->d_lock);
-	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(d_rehash);
 
@@ -1961,11 +1881,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
 	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 }
 EXPORT_SYMBOL(dentry_update_name_case);
 
@@ -2058,14 +1976,14 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
  * The hash value has to match the hash queue that the dentry is on..
  */
 /*
- * d_move_locked - move a dentry
+ * d_move - move a dentry
  * @dentry: entry to move
  * @target: new dentry
  *
  * Update the dcache to reflect the move of a file name. Negative
  * dcache entries should not be moved in this way.
  */
-static void d_move_locked(struct dentry * dentry, struct dentry * target)
+void d_move(struct dentry * dentry, struct dentry * target)
 {
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2114,22 +2032,6 @@ static void d_move_locked(struct dentry * dentry, struct dentry * target)
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
 }
-
-/**
- * d_move - move a dentry
- * @dentry: entry to move
- * @target: new dentry
- *
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way.
- */
-
-void d_move(struct dentry * dentry, struct dentry * target)
-{
-	spin_lock(&dcache_lock);
-	d_move_locked(dentry, target);
-	spin_unlock(&dcache_lock);
-}
 EXPORT_SYMBOL(d_move);
 
 /**
@@ -2155,13 +2057,12 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
  * This helper attempts to cope with remotely renamed directories
  *
  * It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex and the dcache_lock
+ * dentry->d_parent->d_inode->i_mutex and the dcache_inode_lock
  *
  * Note: If ever the locking in lock_rename() changes, then please
  * remember to update this too...
  */
 static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
-	__releases(dcache_lock)
 	__releases(dcache_inode_lock)
 {
 	struct mutex *m1 = NULL, *m2 = NULL;
@@ -2185,11 +2086,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
 		goto out_err;
 	m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
-	d_move_locked(alias, dentry);
+	d_move(alias, dentry);
 	ret = alias;
 out_err:
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	if (m2)
 		mutex_unlock(m2);
 	if (m1)
@@ -2249,7 +2149,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 
 	BUG_ON(!d_unhashed(dentry));
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 
 	if (!inode) {
@@ -2295,7 +2194,6 @@ found:
 	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&actual->d_lock);
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 out_nolock:
 	if (actual == dentry) {
 		security_d_instantiate(dentry, inode);
@@ -2307,7 +2205,6 @@ out_nolock:
 
 shouldnt_be_hashed:
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 	BUG();
 }
 EXPORT_SYMBOL_GPL(d_materialise_unique);
@@ -2421,11 +2318,9 @@ char *__d_path(const struct path *path, struct path *root,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	error = prepend_path(path, root, &res, &buflen);
 	write_sequnlock(&rename_lock);
-	spin_unlock(&dcache_lock);
 
 	if (error)
 		return ERR_PTR(error);
@@ -2487,14 +2382,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	get_fs_root(current->fs, &root);
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	tmp = root;
 	error = path_with_deleted(path, &tmp, &res, &buflen);
 	if (error)
 		res = ERR_PTR(error);
 	write_sequnlock(&rename_lock);
-	spin_unlock(&dcache_lock);
 	path_put(&root);
 	return res;
 }
@@ -2520,14 +2413,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	get_fs_root(current->fs, &root);
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	tmp = root;
 	error = path_with_deleted(path, &tmp, &res, &buflen);
 	if (!error && !path_equal(&tmp, &root))
 		error = prepend_unreachable(&res, &buflen);
 	write_sequnlock(&rename_lock);
-	spin_unlock(&dcache_lock);
 	path_put(&root);
 	if (error)
 		res =  ERR_PTR(error);
@@ -2594,11 +2485,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
 {
 	char *retval;
 
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	retval = __dentry_path(dentry, buf, buflen);
 	write_sequnlock(&rename_lock);
-	spin_unlock(&dcache_lock);
 
 	return retval;
 }
@@ -2609,7 +2498,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 	char *p = NULL;
 	char *retval;
 
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	if (d_unlinked(dentry)) {
 		p = buf + buflen;
@@ -2619,12 +2507,10 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 	}
 	retval = __dentry_path(dentry, buf, buflen);
 	write_sequnlock(&rename_lock);
-	spin_unlock(&dcache_lock);
 	if (!IS_ERR(retval) && p)
 		*p = '/';	/* restore '/' overriden with '\0' */
 	return retval;
 Elong:
-	spin_unlock(&dcache_lock);
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
@@ -2658,7 +2544,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	get_fs_root_and_pwd(current->fs, &root, &pwd);
 
 	error = -ENOENT;
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
@@ -2669,7 +2554,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &tmp, &cwd, &buflen);
 		write_sequnlock(&rename_lock);
-		spin_unlock(&dcache_lock);
 
 		if (error)
 			goto out;
@@ -2690,7 +2574,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		}
 	} else {
 		write_sequnlock(&rename_lock);
-		spin_unlock(&dcache_lock);
 	}
 
 out:
@@ -2776,7 +2659,6 @@ void d_genocide(struct dentry *root)
 rename_retry:
 	this_parent = root;
 	seq = read_seqbegin(&rename_lock);
-	spin_lock(&dcache_lock);
 	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
@@ -2823,7 +2705,6 @@ resume:
 		if (this_parent != child->d_parent ||
 				read_seqretry(&rename_lock, seq)) {
 			spin_unlock(&this_parent->d_lock);
-			spin_unlock(&dcache_lock);
 			rcu_read_unlock();
 			goto rename_retry;
 		}
@@ -2832,7 +2713,6 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
-	spin_unlock(&dcache_lock);
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
 }
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 84b8c460a781..53a5c08fb63c 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -47,24 +47,20 @@ find_acceptable_alias(struct dentry *result,
 	if (acceptable(context, result))
 		return result;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
 		dget_locked(dentry);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 		if (toput)
 			dput(toput);
 		if (dentry != result && acceptable(context, dentry)) {
 			dput(result);
 			return dentry;
 		}
-		spin_lock(&dcache_lock);
 		spin_lock(&dcache_inode_lock);
 		toput = dentry;
 	}
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 
 	if (toput)
 		dput(toput);
diff --git a/fs/libfs.c b/fs/libfs.c
index cc4794914b52..28b36663c44e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -100,7 +100,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			struct dentry *cursor = file->private_data;
 			loff_t n = file->f_pos - 2;
 
-			spin_lock(&dcache_lock);
 			spin_lock(&dentry->d_lock);
 			/* d_lock not required for cursor */
 			list_del(&cursor->d_u.d_child);
@@ -116,7 +115,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
 			}
 			list_add_tail(&cursor->d_u.d_child, p);
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -159,7 +157,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			i++;
 			/* fallthrough */
 		default:
-			spin_lock(&dcache_lock);
 			spin_lock(&dentry->d_lock);
 			if (filp->f_pos == 2)
 				list_move(q, &dentry->d_subdirs);
@@ -175,13 +172,11 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
 				spin_unlock(&next->d_lock);
 				spin_unlock(&dentry->d_lock);
-				spin_unlock(&dcache_lock);
 				if (filldir(dirent, next->d_name.name, 
 					    next->d_name.len, filp->f_pos, 
 					    next->d_inode->i_ino, 
 					    dt_type(next->d_inode)) < 0)
 					return 0;
-				spin_lock(&dcache_lock);
 				spin_lock(&dentry->d_lock);
 				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
 				/* next is still alive */
@@ -191,7 +186,6 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				filp->f_pos++;
 			}
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_lock);
 	}
 	return 0;
 }
@@ -285,7 +279,6 @@ int simple_empty(struct dentry *dentry)
 	struct dentry *child;
 	int ret = 0;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
 		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
@@ -298,7 +291,6 @@ int simple_empty(struct dentry *dentry)
 	ret = 1;
 out:
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 	return ret;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index cbfa5fb31072..5642bc2be418 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -612,8 +612,8 @@ int follow_up(struct path *path)
 	return 1;
 }
 
-/* no need for dcache_lock, as serialization is taken care in
- * namespace.c
+/*
+ * serialization is taken care of in namespace.c
  */
 static int __follow_mount(struct path *path)
 {
@@ -645,9 +645,6 @@ static void follow_mount(struct path *path)
 	}
 }
 
-/* no need for dcache_lock, as serialization is taken care in
- * namespace.c
- */
 int follow_down(struct path *path)
 {
 	struct vfsmount *mounted;
@@ -2131,12 +2128,10 @@ void dentry_unhash(struct dentry *dentry)
 {
 	dget(dentry);
 	shrink_dcache_parent(dentry);
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (dentry->d_count == 2)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 }
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 102278ed38bd..de15c533311c 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -391,7 +391,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 	}
 
 	/* If a pointer is invalid, we search the dentry. */
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
@@ -402,13 +401,11 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 			else
 				dent = NULL;
 			spin_unlock(&parent->d_lock);
-			spin_unlock(&dcache_lock);
 			goto out;
 		}
 		next = next->next;
 	}
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 	return NULL;
 
 out:
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index c4b718ff9a6b..1220df75ff22 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -193,7 +193,6 @@ ncp_renew_dentries(struct dentry *parent)
 	struct list_head *next;
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
@@ -207,7 +206,6 @@ ncp_renew_dentries(struct dentry *parent)
 		next = next->next;
 	}
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 }
 
 static inline void
@@ -217,7 +215,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 	struct list_head *next;
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&parent->d_lock);
 	next = parent->d_subdirs.next;
 	while (next != &parent->d_subdirs) {
@@ -227,7 +224,6 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
 		next = next->next;
 	}
 	spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_lock);
 }
 
 struct ncp_cache_head {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 12de824edb5c..eb77471b8823 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1718,11 +1718,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
 		dir->i_ino, dentry->d_name.name);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
 	if (dentry->d_count > 1) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
 		/* Start asynchronous writeout of the inode */
 		write_inode_now(dentry->d_inode, 0);
 		error = nfs_sillyrename(dir, dentry);
@@ -1733,7 +1731,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		need_rehash = 1;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 	error = nfs_safe_remove(dentry);
 	if (!error || error == -ENOENT) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 850f67d5f0ac..b3e36c3430de 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,13 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 		 * This again causes shrink_dcache_for_umount_subtree() to
 		 * Oops, since the test for IS_ROOT() will fail.
 		 */
-		spin_lock(&dcache_lock);
 		spin_lock(&dcache_inode_lock);
 		spin_lock(&sb->s_root->d_lock);
 		list_del_init(&sb->s_root->d_alias);
 		spin_unlock(&sb->s_root->d_lock);
 		spin_unlock(&dcache_inode_lock);
-		spin_unlock(&dcache_lock);
 	}
 	return 0;
 }
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 78c0ebb0b07c..74aaf3963c10 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -60,7 +60,6 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&dcache_lock);
 	while (!IS_ROOT(dentry) && dentry != droot) {
 		namelen = dentry->d_name.len;
 		buflen -= namelen + 1;
@@ -71,7 +70,6 @@ rename_retry:
 		*--end = '/';
 		dentry = dentry->d_parent;
 	}
-	spin_unlock(&dcache_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
@@ -91,7 +89,6 @@ rename_retry:
 	memcpy(end, base, namelen);
 	return end;
 Elong_unlock:
-	spin_unlock(&dcache_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ae769fc9b66c..9be6ec1f36d8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	/* determine if the children should tell inode about their events */
 	watched = fsnotify_inode_watches_children(inode);
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	/* run all of the dentries associated with this inode.  Since this is a
 	 * directory, there damn well better only be one item on this list */
@@ -84,7 +83,6 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		spin_unlock(&alias->d_lock);
 	}
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 }
 
 /* Notify this dentry's parent about a child's events. */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index c31b5c647ac7..b7de749bdd12 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -169,7 +169,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 	struct list_head *p;
 	struct dentry *dentry = NULL;
 
-	spin_lock(&dcache_lock);
 	spin_lock(&dcache_inode_lock);
 	list_for_each(p, &inode->i_dentry) {
 		dentry = list_entry(p, struct dentry, d_alias);
@@ -189,7 +188,6 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 	}
 
 	spin_unlock(&dcache_inode_lock);
-	spin_unlock(&dcache_lock);
 
 	return dentry;
 }
-- 
cgit v1.2.2


From 58db63d086790eec2ed433f9d8c4962239809cf8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:39 +1100
Subject: fs: dcache avoid starvation in dcache multi-step operations

Long lived dcache "multi-step" operations which retry on rename seq can
be starved with a lot of rename activity. If they fail after the 1st pass,
take the rename_lock for writing to avoid further starvation.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 0dbae053b664..bf6294a20f0e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -973,10 +973,11 @@ int have_submounts(struct dentry *parent)
 	struct dentry *this_parent;
 	struct list_head *next;
 	unsigned seq;
+	int locked = 0;
 
-rename_retry:
-	this_parent = parent;
 	seq = read_seqbegin(&rename_lock);
+again:
+	this_parent = parent;
 
 	if (d_mountpoint(parent))
 		goto positive;
@@ -1021,7 +1022,7 @@ resume:
 		/* might go back up the wrong parent if we have had a rename
 		 * or deletion */
 		if (this_parent != child->d_parent ||
-				read_seqretry(&rename_lock, seq)) {
+			 (!locked && read_seqretry(&rename_lock, seq))) {
 			spin_unlock(&this_parent->d_lock);
 			rcu_read_unlock();
 			goto rename_retry;
@@ -1031,13 +1032,22 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
-	if (read_seqretry(&rename_lock, seq))
+	if (!locked && read_seqretry(&rename_lock, seq))
 		goto rename_retry;
+	if (locked)
+		write_sequnlock(&rename_lock);
 	return 0; /* No mount points found in tree */
 positive:
-	if (read_seqretry(&rename_lock, seq))
+	if (!locked && read_seqretry(&rename_lock, seq))
 		goto rename_retry;
+	if (locked)
+		write_sequnlock(&rename_lock);
 	return 1;
+
+rename_retry:
+	locked = 1;
+	write_seqlock(&rename_lock);
+	goto again;
 }
 EXPORT_SYMBOL(have_submounts);
 
@@ -1061,11 +1071,11 @@ static int select_parent(struct dentry * parent)
 	struct list_head *next;
 	unsigned seq;
 	int found = 0;
+	int locked = 0;
 
-rename_retry:
-	this_parent = parent;
 	seq = read_seqbegin(&rename_lock);
-
+again:
+	this_parent = parent;
 	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
@@ -1127,7 +1137,7 @@ resume:
 		/* might go back up the wrong parent if we have had a rename
 		 * or deletion */
 		if (this_parent != child->d_parent ||
-				read_seqretry(&rename_lock, seq)) {
+			(!locked && read_seqretry(&rename_lock, seq))) {
 			spin_unlock(&this_parent->d_lock);
 			rcu_read_unlock();
 			goto rename_retry;
@@ -1138,9 +1148,18 @@ resume:
 	}
 out:
 	spin_unlock(&this_parent->d_lock);
-	if (read_seqretry(&rename_lock, seq))
+	if (!locked && read_seqretry(&rename_lock, seq))
 		goto rename_retry;
+	if (locked)
+		write_sequnlock(&rename_lock);
 	return found;
+
+rename_retry:
+	if (found)
+		return found;
+	locked = 1;
+	write_seqlock(&rename_lock);
+	goto again;
 }
 
 /**
@@ -2655,10 +2674,11 @@ void d_genocide(struct dentry *root)
 	struct dentry *this_parent;
 	struct list_head *next;
 	unsigned seq;
+	int locked = 0;
 
-rename_retry:
-	this_parent = root;
 	seq = read_seqbegin(&rename_lock);
+again:
+	this_parent = root;
 	spin_lock(&this_parent->d_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
@@ -2703,7 +2723,7 @@ resume:
 		/* might go back up the wrong parent if we have had a rename
 		 * or deletion */
 		if (this_parent != child->d_parent ||
-				read_seqretry(&rename_lock, seq)) {
+			 (!locked && read_seqretry(&rename_lock, seq))) {
 			spin_unlock(&this_parent->d_lock);
 			rcu_read_unlock();
 			goto rename_retry;
@@ -2713,8 +2733,16 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&this_parent->d_lock);
-	if (read_seqretry(&rename_lock, seq))
+	if (!locked && read_seqretry(&rename_lock, seq))
 		goto rename_retry;
+	if (locked)
+		write_sequnlock(&rename_lock);
+	return;
+
+rename_retry:
+	locked = 1;
+	write_seqlock(&rename_lock);
+	goto again;
 }
 
 /**
-- 
cgit v1.2.2


From 61f3dee4af09528997a970280da240577bf60721 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:40 +1100
Subject: fs: dcache reduce dput locking

It is possible to run dput without taking data structure locks up-front. In
many cases where we don't kill the dentry anyway, these locks are not required.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 52 +++++++++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index bf6294a20f0e..98a05696593e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -323,35 +323,16 @@ repeat:
 	if (dentry->d_count == 1)
 		might_sleep();
 	spin_lock(&dentry->d_lock);
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
-		parent = dentry->d_parent;
-	if (dentry->d_count == 1) {
-		if (!spin_trylock(&dcache_inode_lock)) {
-drop2:
-			spin_unlock(&dentry->d_lock);
-			goto repeat;
-		}
-		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dcache_inode_lock);
-			goto drop2;
-		}
-	}
-	dentry->d_count--;
-	if (dentry->d_count) {
+	BUG_ON(!dentry->d_count);
+	if (dentry->d_count > 1) {
+		dentry->d_count--;
 		spin_unlock(&dentry->d_lock);
-		if (parent)
-			spin_unlock(&parent->d_lock);
 		return;
 	}
 
-	/*
-	 * AV: ->d_delete() is _NOT_ allowed to block now.
-	 */
 	if (dentry->d_op && dentry->d_op->d_delete) {
 		if (dentry->d_op->d_delete(dentry))
-			goto unhash_it;
+			goto kill_it;
 	}
 
 	/* Unreachable? Get rid of it */
@@ -362,17 +343,30 @@ drop2:
 	dentry->d_flags |= DCACHE_REFERENCED;
 	dentry_lru_add(dentry);
 
- 	spin_unlock(&dentry->d_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	spin_unlock(&dcache_inode_lock);
+	dentry->d_count--;
+	spin_unlock(&dentry->d_lock);
 	return;
 
-unhash_it:
-	__d_drop(dentry);
 kill_it:
+	if (!spin_trylock(&dcache_inode_lock)) {
+relock:
+		spin_unlock(&dentry->d_lock);
+		cpu_relax();
+		goto repeat;
+	}
+	if (IS_ROOT(dentry))
+		parent = NULL;
+	else
+		parent = dentry->d_parent;
+	if (parent && !spin_trylock(&parent->d_lock)) {
+		spin_unlock(&dcache_inode_lock);
+		goto relock;
+	}
+	dentry->d_count--;
 	/* if dentry was on the d_lru list delete it from there */
 	dentry_lru_del(dentry);
+	/* if it was on the hash (d_delete case), then remove it */
+	__d_drop(dentry);
 	dentry = d_kill(dentry, parent);
 	if (dentry)
 		goto repeat;
-- 
cgit v1.2.2


From 89ad485f01fd83c47f17a128db3bd7b89c0f244f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:41 +1100
Subject: fs: dcache reduce locking in d_alloc

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 98a05696593e..ccdc5c2512df 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1253,11 +1253,13 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 
 	if (parent) {
 		spin_lock(&parent->d_lock);
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		/*
+		 * don't need child lock because it is not subject
+		 * to concurrency here
+		 */
 		dentry->d_parent = dget_dlock(parent);
 		dentry->d_sb = parent->d_sb;
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
-		spin_unlock(&dentry->d_lock);
 		spin_unlock(&parent->d_lock);
 	}
 
-- 
cgit v1.2.2


From 357f8e658bba8a085c4a5d4331e30894be8096b8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:42 +1100
Subject: fs: dcache reduce dcache_inode_lock

dcache_inode_lock can be avoided in d_delete() and d_materialise_unique()
in cases where it is not required.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index ccdc5c2512df..01f016799fd4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1828,10 +1828,15 @@ void d_delete(struct dentry * dentry)
 	/*
 	 * Are we the only user?
 	 */
-	spin_lock(&dcache_inode_lock);
+again:
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (dentry->d_count == 1) {
+		if (!spin_trylock(&dcache_inode_lock)) {
+			spin_unlock(&dentry->d_lock);
+			cpu_relax();
+			goto again;
+		}
 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
@@ -1842,7 +1847,6 @@ void d_delete(struct dentry * dentry)
 		__d_drop(dentry);
 
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_inode_lock);
 
 	fsnotify_nameremove(dentry, isdir);
 }
@@ -2164,14 +2168,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 
 	BUG_ON(!d_unhashed(dentry));
 
-	spin_lock(&dcache_inode_lock);
-
 	if (!inode) {
 		actual = dentry;
 		__d_instantiate(dentry, NULL);
-		goto found_lock;
+		d_rehash(actual);
+		goto out_nolock;
 	}
 
+	spin_lock(&dcache_inode_lock);
+
 	if (S_ISDIR(inode->i_mode)) {
 		struct dentry *alias;
 
@@ -2198,10 +2203,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 	actual = __d_instantiate_unique(dentry, inode);
 	if (!actual)
 		actual = dentry;
-	else if (unlikely(!d_unhashed(actual)))
-		goto shouldnt_be_hashed;
+	else
+		BUG_ON(!d_unhashed(actual));
 
-found_lock:
 	spin_lock(&actual->d_lock);
 found:
 	spin_lock(&dcache_hash_lock);
@@ -2217,10 +2221,6 @@ out_nolock:
 
 	iput(inode);
 	return actual;
-
-shouldnt_be_hashed:
-	spin_unlock(&dcache_inode_lock);
-	BUG();
 }
 EXPORT_SYMBOL_GPL(d_materialise_unique);
 
-- 
cgit v1.2.2


From dc0474be3e27463d4d4a2793f82366eed906f223 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:43 +1100
Subject: fs: dcache rationalise dget variants

dget_locked was a shortcut to avoid the lazy lru manipulation when we already
held dcache_lock (lru manipulation was relatively cheap at that point).
However, how that the lru lock is an innermost one, we never hold it at any
caller, so the lock cost can now be avoided. We already have well working lazy
dcache LRU, so it should be fine to defer LRU manipulations to scan time.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/configfs/inode.c |  2 +-
 fs/dcache.c         | 36 +++++++++++-------------------------
 fs/exportfs/expfs.c |  2 +-
 fs/ncpfs/dir.c      |  2 +-
 fs/ocfs2/dcache.c   |  2 +-
 5 files changed, 15 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index fb3a55fff82a..c83f4768eeaa 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -252,7 +252,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 	if (dentry) {
 		spin_lock(&dentry->d_lock);
 		if (!(d_unhashed(dentry) && dentry->d_inode)) {
-			dget_locked_dlock(dentry);
+			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			simple_unlink(parent->d_inode, dentry);
diff --git a/fs/dcache.c b/fs/dcache.c
index 01f016799fd4..b4d2e28eef5b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -429,32 +429,17 @@ int d_invalidate(struct dentry * dentry)
 EXPORT_SYMBOL(d_invalidate);
 
 /* This must be called with d_lock held */
-static inline struct dentry * __dget_locked_dlock(struct dentry *dentry)
+static inline void __dget_dlock(struct dentry *dentry)
 {
 	dentry->d_count++;
-	dentry_lru_del(dentry);
-	return dentry;
 }
 
-/* This must be called with d_lock held */
-static inline struct dentry * __dget_locked(struct dentry *dentry)
+static inline void __dget(struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
-	__dget_locked_dlock(dentry);
+	__dget_dlock(dentry);
 	spin_unlock(&dentry->d_lock);
-	return dentry;
-}
-
-struct dentry * dget_locked_dlock(struct dentry *dentry)
-{
-	return __dget_locked_dlock(dentry);
-}
-
-struct dentry * dget_locked(struct dentry *dentry)
-{
-	return __dget_locked(dentry);
 }
-EXPORT_SYMBOL(dget_locked);
 
 struct dentry *dget_parent(struct dentry *dentry)
 {
@@ -512,7 +497,7 @@ again:
 			    (alias->d_flags & DCACHE_DISCONNECTED)) {
 				discon_alias = alias;
 			} else if (!want_discon) {
-				__dget_locked_dlock(alias);
+				__dget_dlock(alias);
 				spin_unlock(&alias->d_lock);
 				return alias;
 			}
@@ -525,7 +510,7 @@ again:
 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
 			if (IS_ROOT(alias) &&
 			    (alias->d_flags & DCACHE_DISCONNECTED)) {
-				__dget_locked_dlock(alias);
+				__dget_dlock(alias);
 				spin_unlock(&alias->d_lock);
 				return alias;
 			}
@@ -561,7 +546,7 @@ restart:
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_count) {
-			__dget_locked_dlock(dentry);
+			__dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_inode_lock);
@@ -1257,7 +1242,8 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 		 * don't need child lock because it is not subject
 		 * to concurrency here
 		 */
-		dentry->d_parent = dget_dlock(parent);
+		__dget_dlock(parent);
+		dentry->d_parent = parent;
 		dentry->d_sb = parent->d_sb;
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 		spin_unlock(&parent->d_lock);
@@ -1360,7 +1346,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 			continue;
 		if (memcmp(qstr->name, name, len))
 			continue;
-		dget_locked(alias);
+		__dget(alias);
 		return alias;
 	}
 
@@ -1613,7 +1599,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 * reference to it, move it in place and use it.
 	 */
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	dget_locked(new);
+	__dget(new);
 	spin_unlock(&dcache_inode_lock);
 	security_d_instantiate(found, inode);
 	d_move(new, found);
@@ -1789,7 +1775,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 	list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
 		if (dentry == child) {
 			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-			__dget_locked_dlock(dentry);
+			__dget_dlock(dentry);
 			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dparent->d_lock);
 			return 1;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 53a5c08fb63c..f06a940065f6 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -49,7 +49,7 @@ find_acceptable_alias(struct dentry *result,
 
 	spin_lock(&dcache_inode_lock);
 	list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
-		dget_locked(dentry);
+		dget(dentry);
 		spin_unlock(&dcache_inode_lock);
 		if (toput)
 			dput(toput);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index de15c533311c..0ba3cdc95a44 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -397,7 +397,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
 		dent = list_entry(next, struct dentry, d_u.d_child);
 		if ((unsigned long)dent->d_fsdata == fpos) {
 			if (dent->d_inode)
-				dget_locked(dent);
+				dget(dent);
 			else
 				dent = NULL;
 			spin_unlock(&parent->d_lock);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b7de749bdd12..4d54c60ceee4 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -178,7 +178,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 			mlog(0, "dentry found: %.*s\n",
 			     dentry->d_name.len, dentry->d_name.name);
 
-			dget_locked_dlock(dentry);
+			dget_dlock(dentry);
 			spin_unlock(&dentry->d_lock);
 			break;
 		}
-- 
cgit v1.2.2


From a734eb458ab2bd11479a27dd54f48e1b26a55845 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:44 +1100
Subject: fs: dcache reduce d_parent locking

Use RCU to simplify locking in dget_parent.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index b4d2e28eef5b..a8f89765d602 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -446,24 +446,27 @@ struct dentry *dget_parent(struct dentry *dentry)
 	struct dentry *ret;
 
 repeat:
-	spin_lock(&dentry->d_lock);
+	/*
+	 * Don't need rcu_dereference because we re-check it was correct under
+	 * the lock.
+	 */
+	rcu_read_lock();
 	ret = dentry->d_parent;
-	if (!ret)
-		goto out;
-	if (dentry == ret) {
-		ret->d_count++;
+	if (!ret) {
+		rcu_read_unlock();
 		goto out;
 	}
-	if (!spin_trylock(&ret->d_lock)) {
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
+	spin_lock(&ret->d_lock);
+	if (unlikely(ret != dentry->d_parent)) {
+		spin_unlock(&ret->d_lock);
+		rcu_read_unlock();
 		goto repeat;
 	}
+	rcu_read_unlock();
 	BUG_ON(!ret->d_count);
 	ret->d_count++;
 	spin_unlock(&ret->d_lock);
 out:
-	spin_unlock(&dentry->d_lock);
 	return ret;
 }
 EXPORT_SYMBOL(dget_parent);
-- 
cgit v1.2.2


From 89e6054836a7b1e7500cd70a14b5579e752c9250 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:45 +1100
Subject: fs: dcache reduce prune_one_dentry locking

prune_one_dentry can avoid quite a bit of locking in the common case where
ancestors have an elevated refcount. Alternatively, we could have gone the
other way and made fewer trylocks in the case where d_count goes to zero, but
is probably less common.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a8f89765d602..195706374697 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -582,26 +582,29 @@ static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
 	 * Prune ancestors.
 	 */
 	while (dentry) {
-		spin_lock(&dcache_inode_lock);
-again:
+relock:
 		spin_lock(&dentry->d_lock);
+		if (dentry->d_count > 1) {
+			dentry->d_count--;
+			spin_unlock(&dentry->d_lock);
+			return;
+		}
+		if (!spin_trylock(&dcache_inode_lock)) {
+relock2:
+			spin_unlock(&dentry->d_lock);
+			cpu_relax();
+			goto relock;
+		}
+
 		if (IS_ROOT(dentry))
 			parent = NULL;
 		else
 			parent = dentry->d_parent;
 		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dentry->d_lock);
-			goto again;
-		}
-		dentry->d_count--;
-		if (dentry->d_count) {
-			if (parent)
-				spin_unlock(&parent->d_lock);
-			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_inode_lock);
-			return;
+			goto relock2;
 		}
-
+		dentry->d_count--;
 		dentry_lru_del(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry, parent);
-- 
cgit v1.2.2


From be182bff72fae6a3eb25624b39170c40b72f0909 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:46 +1100
Subject: fs: reduce dcache_inode_lock width in lru scanning

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 195706374697..fe8f28a2878e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -621,7 +621,7 @@ static void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 
 		if (!spin_trylock(&dentry->d_lock)) {
-relock:
+relock1:
 			spin_unlock(&dcache_lru_lock);
 			cpu_relax();
 			spin_lock(&dcache_lru_lock);
@@ -638,20 +638,24 @@ relock:
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+		if (!spin_trylock(&dcache_inode_lock)) {
+relock2:
+			spin_unlock(&dentry->d_lock);
+			goto relock1;
+		}
 		if (IS_ROOT(dentry))
 			parent = NULL;
 		else
 			parent = dentry->d_parent;
 		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dentry->d_lock);
-			goto relock;
+			spin_unlock(&dcache_inode_lock);
+			goto relock2;
 		}
 		__dentry_lru_del(dentry);
 		spin_unlock(&dcache_lru_lock);
 
 		prune_one_dentry(dentry, parent);
 		/* dcache_inode_lock and dentry->d_lock dropped */
-		spin_lock(&dcache_inode_lock);
 		spin_lock(&dcache_lru_lock);
 	}
 }
@@ -672,7 +676,6 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 	LIST_HEAD(tmp);
 	int cnt = *count;
 
-	spin_lock(&dcache_inode_lock);
 relock:
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
@@ -711,7 +714,6 @@ relock:
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lru_lock);
-	spin_unlock(&dcache_inode_lock);
 }
 
 /**
@@ -800,14 +802,12 @@ void shrink_dcache_sb(struct super_block *sb)
 {
 	LIST_HEAD(tmp);
 
-	spin_lock(&dcache_inode_lock);
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		list_splice_init(&sb->s_dentry_lru, &tmp);
 		shrink_dentry_list(&tmp);
 	}
 	spin_unlock(&dcache_lru_lock);
-	spin_unlock(&dcache_inode_lock);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-- 
cgit v1.2.2


From ec33679d78f9d653a44ddba10b5fb824c06330a1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:47 +1100
Subject: fs: use RCU in shrink_dentry_list to reduce lock nesting

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index fe8f28a2878e..d1840b30c673 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -615,16 +615,16 @@ static void shrink_dentry_list(struct list_head *list)
 {
 	struct dentry *dentry;
 
-	while (!list_empty(list)) {
+	rcu_read_lock();
+	for (;;) {
 		struct dentry *parent;
 
-		dentry = list_entry(list->prev, struct dentry, d_lru);
-
-		if (!spin_trylock(&dentry->d_lock)) {
-relock1:
-			spin_unlock(&dcache_lru_lock);
-			cpu_relax();
-			spin_lock(&dcache_lru_lock);
+		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
+		if (&dentry->d_lru == list)
+			break; /* empty */
+		spin_lock(&dentry->d_lock);
+		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
 
@@ -634,14 +634,16 @@ relock1:
 		 * it - just keep it off the LRU list.
 		 */
 		if (dentry->d_count) {
-			__dentry_lru_del(dentry);
+			dentry_lru_del(dentry);
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+
 		if (!spin_trylock(&dcache_inode_lock)) {
-relock2:
+relock:
 			spin_unlock(&dentry->d_lock);
-			goto relock1;
+			cpu_relax();
+			continue;
 		}
 		if (IS_ROOT(dentry))
 			parent = NULL;
@@ -649,15 +651,15 @@ relock2:
 			parent = dentry->d_parent;
 		if (parent && !spin_trylock(&parent->d_lock)) {
 			spin_unlock(&dcache_inode_lock);
-			goto relock2;
+			goto relock;
 		}
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
+		dentry_lru_del(dentry);
 
+		rcu_read_unlock();
 		prune_one_dentry(dentry, parent);
-		/* dcache_inode_lock and dentry->d_lock dropped */
-		spin_lock(&dcache_lru_lock);
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 }
 
 /**
@@ -705,15 +707,15 @@ relock:
 			if (!--cnt)
 				break;
 		}
-		/* XXX: re-add cond_resched_lock when dcache_lock goes away */
+		cond_resched_lock(&dcache_lru_lock);
 	}
-
-	*count = cnt;
-	shrink_dentry_list(&tmp);
-
 	if (!list_empty(&referenced))
 		list_splice(&referenced, &sb->s_dentry_lru);
 	spin_unlock(&dcache_lru_lock);
+
+	shrink_dentry_list(&tmp);
+
+	*count = cnt;
 }
 
 /**
@@ -805,7 +807,9 @@ void shrink_dcache_sb(struct super_block *sb)
 	spin_lock(&dcache_lru_lock);
 	while (!list_empty(&sb->s_dentry_lru)) {
 		list_splice_init(&sb->s_dentry_lru, &tmp);
+		spin_unlock(&dcache_lru_lock);
 		shrink_dentry_list(&tmp);
+		spin_lock(&dcache_lru_lock);
 	}
 	spin_unlock(&dcache_lru_lock);
 }
-- 
cgit v1.2.2


From 77812a1ef139d84270d27faacc0630c887411013 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:48 +1100
Subject: fs: consolidate dentry kill sequence

The tricky locking for disposing of a dentry is duplicated 3 times in the
dcache (dput, pruning a dentry from the LRU, and pruning its ancestors).
Consolidate them all into a single function dentry_kill.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 135 +++++++++++++++++++++++++++---------------------------------
 1 file changed, 61 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d1840b30c673..dc0551c9755d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -284,6 +284,40 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+	__releases(dentry->d_lock)
+{
+	struct dentry *parent;
+
+	if (!spin_trylock(&dcache_inode_lock)) {
+relock:
+		spin_unlock(&dentry->d_lock);
+		cpu_relax();
+		return dentry; /* try again with same dentry */
+	}
+	if (IS_ROOT(dentry))
+		parent = NULL;
+	else
+		parent = dentry->d_parent;
+	if (parent && !spin_trylock(&parent->d_lock)) {
+		spin_unlock(&dcache_inode_lock);
+		goto relock;
+	}
+	if (ref)
+		dentry->d_count--;
+	/* if dentry was on the d_lru list delete it from there */
+	dentry_lru_del(dentry);
+	/* if it was on the hash then remove it */
+	__d_drop(dentry);
+	return d_kill(dentry, parent);
+}
+
 /* 
  * This is dput
  *
@@ -309,13 +343,9 @@ EXPORT_SYMBOL(d_drop);
  * call the dentry unlink method as well as removing it from the queues and
  * releasing its resources. If the parent dentries were scheduled for release
  * they too may now get deleted.
- *
- * no dcache lock, please.
  */
-
 void dput(struct dentry *dentry)
 {
-	struct dentry *parent;
 	if (!dentry)
 		return;
 
@@ -348,26 +378,7 @@ repeat:
 	return;
 
 kill_it:
-	if (!spin_trylock(&dcache_inode_lock)) {
-relock:
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
-		goto repeat;
-	}
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
-		parent = dentry->d_parent;
-	if (parent && !spin_trylock(&parent->d_lock)) {
-		spin_unlock(&dcache_inode_lock);
-		goto relock;
-	}
-	dentry->d_count--;
-	/* if dentry was on the d_lru list delete it from there */
-	dentry_lru_del(dentry);
-	/* if it was on the hash (d_delete case), then remove it */
-	__d_drop(dentry);
-	dentry = d_kill(dentry, parent);
+	dentry = dentry_kill(dentry, 1);
 	if (dentry)
 		goto repeat;
 }
@@ -563,51 +574,43 @@ restart:
 EXPORT_SYMBOL(d_prune_aliases);
 
 /*
- * Throw away a dentry - free the inode, dput the parent.  This requires that
- * the LRU list has already been removed.
+ * Try to throw away a dentry - free the inode, dput the parent.
+ * Requires dentry->d_lock is held, and dentry->d_count == 0.
+ * Releases dentry->d_lock.
  *
- * Try to prune ancestors as well.  This is necessary to prevent
- * quadratic behavior of shrink_dcache_parent(), but is also expected
- * to be beneficial in reducing dentry cache fragmentation.
+ * This may fail if locks cannot be acquired no problem, just try again.
  */
-static void prune_one_dentry(struct dentry *dentry, struct dentry *parent)
+static void try_prune_one_dentry(struct dentry *dentry)
 	__releases(dentry->d_lock)
-	__releases(parent->d_lock)
-	__releases(dcache_inode_lock)
 {
-	__d_drop(dentry);
-	dentry = d_kill(dentry, parent);
+	struct dentry *parent;
 
+	parent = dentry_kill(dentry, 0);
 	/*
-	 * Prune ancestors.
+	 * If dentry_kill returns NULL, we have nothing more to do.
+	 * if it returns the same dentry, trylocks failed. In either
+	 * case, just loop again.
+	 *
+	 * Otherwise, we need to prune ancestors too. This is necessary
+	 * to prevent quadratic behavior of shrink_dcache_parent(), but
+	 * is also expected to be beneficial in reducing dentry cache
+	 * fragmentation.
 	 */
+	if (!parent)
+		return;
+	if (parent == dentry)
+		return;
+
+	/* Prune ancestors. */
+	dentry = parent;
 	while (dentry) {
-relock:
 		spin_lock(&dentry->d_lock);
 		if (dentry->d_count > 1) {
 			dentry->d_count--;
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
-		if (!spin_trylock(&dcache_inode_lock)) {
-relock2:
-			spin_unlock(&dentry->d_lock);
-			cpu_relax();
-			goto relock;
-		}
-
-		if (IS_ROOT(dentry))
-			parent = NULL;
-		else
-			parent = dentry->d_parent;
-		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dcache_inode_lock);
-			goto relock2;
-		}
-		dentry->d_count--;
-		dentry_lru_del(dentry);
-		__d_drop(dentry);
-		dentry = d_kill(dentry, parent);
+		dentry = dentry_kill(dentry, 1);
 	}
 }
 
@@ -617,8 +620,6 @@ static void shrink_dentry_list(struct list_head *list)
 
 	rcu_read_lock();
 	for (;;) {
-		struct dentry *parent;
-
 		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
 		if (&dentry->d_lru == list)
 			break; /* empty */
@@ -639,24 +640,10 @@ static void shrink_dentry_list(struct list_head *list)
 			continue;
 		}
 
-		if (!spin_trylock(&dcache_inode_lock)) {
-relock:
-			spin_unlock(&dentry->d_lock);
-			cpu_relax();
-			continue;
-		}
-		if (IS_ROOT(dentry))
-			parent = NULL;
-		else
-			parent = dentry->d_parent;
-		if (parent && !spin_trylock(&parent->d_lock)) {
-			spin_unlock(&dcache_inode_lock);
-			goto relock;
-		}
-		dentry_lru_del(dentry);
-
 		rcu_read_unlock();
-		prune_one_dentry(dentry, parent);
+
+		try_prune_one_dentry(dentry);
+
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.2


From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:49 +1100
Subject: fs: icache RCU free inodes

RCU free the struct inode. This will allow:

- Subsequent store-free path walking patch. The inode must be consulted for
  permissions when walking, so an RCU inode reference is a must.
- sb_inode_list_lock to be moved inside i_lock because sb list walkers who want
  to take i_lock no longer need to take sb_inode_list_lock to walk the list in
  the first place. This will simplify and optimize locking.
- Could remove some nested trylock loops in dcache code
- Could potentially simplify things a bit in VM land. Do not need to take the
  page lock to follow page->mapping.

The downsides of this is the performance cost of using RCU. In a simple
creat/unlink microbenchmark, performance drops by about 10% due to inability to
reuse cache-hot slab objects. As iterations increase and RCU freeing starts
kicking over, this increases to about 20%.

In cases where inode lifetimes are longer (ie. many inodes may be allocated
during the average life span of a single inode), a lot of this cache reuse is
not applicable, so the regression caused by this patch is smaller.

The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU,
however this adds some complexity to list walking and store-free path walking,
so I prefer to implement this at a later date, if it is shown to be a win in
real situations. I haven't found a regression in any non-micro benchmark so I
doubt it will be a problem.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_inode.c        |  9 ++++++++-
 fs/adfs/super.c          |  9 ++++++++-
 fs/affs/super.c          |  9 ++++++++-
 fs/afs/super.c           | 10 +++++++++-
 fs/befs/linuxvfs.c       | 10 ++++++++--
 fs/bfs/inode.c           |  9 ++++++++-
 fs/block_dev.c           |  9 ++++++++-
 fs/btrfs/inode.c         |  9 ++++++++-
 fs/ceph/inode.c          | 11 ++++++++++-
 fs/cifs/cifsfs.c         |  9 ++++++++-
 fs/coda/inode.c          |  9 ++++++++-
 fs/ecryptfs/super.c      | 12 +++++++++++-
 fs/efs/super.c           |  9 ++++++++-
 fs/exofs/super.c         |  9 ++++++++-
 fs/ext2/super.c          |  9 ++++++++-
 fs/ext3/super.c          |  9 ++++++++-
 fs/ext4/super.c          |  9 ++++++++-
 fs/fat/inode.c           |  9 ++++++++-
 fs/freevxfs/vxfs_inode.c |  9 ++++++++-
 fs/fuse/inode.c          |  9 ++++++++-
 fs/gfs2/super.c          |  9 ++++++++-
 fs/hfs/super.c           |  9 ++++++++-
 fs/hfsplus/super.c       | 10 +++++++++-
 fs/hostfs/hostfs_kern.c  |  9 ++++++++-
 fs/hpfs/super.c          |  9 ++++++++-
 fs/hppfs/hppfs.c         |  9 ++++++++-
 fs/hugetlbfs/inode.c     |  9 ++++++++-
 fs/inode.c               | 10 +++++++++-
 fs/isofs/inode.c         |  9 ++++++++-
 fs/jffs2/super.c         |  9 ++++++++-
 fs/jfs/super.c           | 10 +++++++++-
 fs/logfs/inode.c         |  9 ++++++++-
 fs/minix/inode.c         |  9 ++++++++-
 fs/ncpfs/inode.c         |  9 ++++++++-
 fs/nfs/inode.c           |  9 ++++++++-
 fs/nilfs2/super.c        | 10 +++++++++-
 fs/ntfs/inode.c          |  9 ++++++++-
 fs/ocfs2/dlmfs/dlmfs.c   |  9 ++++++++-
 fs/ocfs2/super.c         |  9 ++++++++-
 fs/openpromfs/inode.c    |  9 ++++++++-
 fs/proc/inode.c          |  9 ++++++++-
 fs/qnx4/inode.c          |  9 ++++++++-
 fs/reiserfs/super.c      |  9 ++++++++-
 fs/romfs/super.c         |  9 ++++++++-
 fs/squashfs/super.c      |  9 ++++++++-
 fs/sysv/inode.c          |  9 ++++++++-
 fs/ubifs/super.c         | 10 +++++++++-
 fs/udf/super.c           |  9 ++++++++-
 fs/ufs/super.c           |  9 ++++++++-
 fs/xfs/xfs_iget.c        | 13 ++++++++++++-
 50 files changed, 415 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 1073bca8488c..f6f9081e6d2c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -237,10 +237,17 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
  *
  */
 
-void v9fs_destroy_inode(struct inode *inode)
+static void v9fs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
 }
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, v9fs_i_callback);
+}
 #endif
 
 /**
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 959dbff2d42d..47dffc513a26 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void adfs_destroy_inode(struct inode *inode)
+static void adfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
+static void adfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, adfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0cf7f4384cbd..4c18fcfb0a19 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
 	return &i->vfs_inode;
 }
 
-static void affs_destroy_inode(struct inode *inode)
+static void affs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
+static void affs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, affs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 27201cffece4..f901a9d7c111 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -498,6 +498,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	return &vnode->vfs_inode;
 }
 
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
 /*
  * destroy an AFS inode struct
  */
@@ -511,7 +519,7 @@ static void afs_destroy_inode(struct inode *inode)
 
 	ASSERTCMP(vnode->server, ==, NULL);
 
-	kmem_cache_free(afs_inode_cachep, vnode);
+	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
 }
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index aa4e7c7ae3c6..de93581b79a2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
         return &bi->vfs_inode;
 }
 
-static void
-befs_destroy_inode(struct inode *inode)
+static void befs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
         kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
+static void befs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, befs_i_callback);
+}
+
 static void init_once(void *foo)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 76db6d7d49bb..a8e37f81d097 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
 	return &bi->vfs_inode;
 }
 
-static void bfs_destroy_inode(struct inode *inode)
+static void bfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
+static void bfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, bfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct bfs_inode_info *bi = foo;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4230252fd689..771f23527010 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -409,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void bdev_destroy_inode(struct inode *inode)
+static void bdev_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
+static void bdev_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, bdev_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ce9f8932789..f9d2994a42a2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6495,6 +6495,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	return inode;
 }
 
+static void btrfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
 void btrfs_destroy_inode(struct inode *inode)
 {
 	struct btrfs_ordered_extent *ordered;
@@ -6564,7 +6571,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+	call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 
 int btrfs_drop_inode(struct inode *inode)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2a48cafc174b..47f8c8baf3b5 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -368,6 +368,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	return &ci->vfs_inode;
 }
 
+static void ceph_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ceph_inode_cachep, ci);
+}
+
 void ceph_destroy_inode(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -407,7 +416,7 @@ void ceph_destroy_inode(struct inode *inode)
 	if (ci->i_xattrs.prealloc_blob)
 		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
-	kmem_cache_free(ceph_inode_cachep, ci);
+	call_rcu(&inode->i_rcu, ceph_i_callback);
 }
 
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3936aa7f2c22..223717dcc401 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -334,10 +334,17 @@ cifs_alloc_inode(struct super_block *sb)
 	return &cifs_inode->vfs_inode;
 }
 
+static void cifs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+}
+
 static void
 cifs_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
+	call_rcu(&inode->i_rcu, cifs_i_callback);
 }
 
 static void
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 5ea57c8c7f97..50dc7d189f56 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -56,11 +56,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void coda_destroy_inode(struct inode *inode)
+static void coda_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
+static void coda_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, coda_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 2720178b7718..3042fe123a34 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -62,6 +62,16 @@ out:
 	return inode;
 }
 
+static void ecryptfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ecryptfs_inode_info *inode_info;
+	inode_info = ecryptfs_inode_to_private(inode);
+
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
+
 /**
  * ecryptfs_destroy_inode
  * @inode: The ecryptfs inode
@@ -88,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 		}
 	}
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
-	kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+	call_rcu(&inode->i_rcu, ecryptfs_i_callback);
 }
 
 /**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5073a07652cc..0f31acb0131c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void efs_destroy_inode(struct inode *inode)
+static void efs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
+static void efs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, efs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 79c3ae6e0456..8c6c4669b381 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
 	return &oi->vfs_inode;
 }
 
+static void exofs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
 /*
  * Remove an inode from the cache
  */
 static void exofs_destroy_inode(struct inode *inode)
 {
-	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+	call_rcu(&inode->i_rcu, exofs_i_callback);
 }
 
 /*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d89e0b6a2d78..e0c6380ff992 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,11 +161,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void ext2_destroy_inode(struct inode *inode)
+static void ext2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
+static void ext2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ext2_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index acf8695fa8f0..77ce1616f725 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -479,6 +479,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+static void ext3_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+}
+
 static void ext3_destroy_inode(struct inode *inode)
 {
 	if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -489,7 +496,7 @@ static void ext3_destroy_inode(struct inode *inode)
 				false);
 		dump_stack();
 	}
-	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
+	call_rcu(&inode->i_rcu, ext3_i_callback);
 }
 
 static void init_once(void *foo)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fb15c9c0be74..cd37f9d5e447 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -841,6 +841,13 @@ static int ext4_drop_inode(struct inode *inode)
 	return drop;
 }
 
+static void ext4_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
 static void ext4_destroy_inode(struct inode *inode)
 {
 	ext4_ioend_wait(inode);
@@ -853,7 +860,7 @@ static void ext4_destroy_inode(struct inode *inode)
 				true);
 		dump_stack();
 	}
-	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+	call_rcu(&inode->i_rcu, ext4_i_callback);
 }
 
 static void init_once(void *foo)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ad6998a92c30..8cccfebee180 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void fat_destroy_inode(struct inode *inode)
+static void fat_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
+static void fat_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, fat_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 8c04eac5079d..2ba6719ac612 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 	return ip;
 }
 
+static void vxfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(vxfs_inode_cachep, inode->i_private);
+}
+
 /**
  * vxfs_evict_inode - remove inode from main memory
  * @ip:		inode to discard.
@@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
 {
 	truncate_inode_pages(&ip->i_data, 0);
 	end_writeback(ip);
-	kmem_cache_free(vxfs_inode_cachep, ip->i_private);
+	call_rcu(&ip->i_rcu, vxfs_i_callback);
 }
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cfce3ad86a92..44e0a6c57e87 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -99,6 +99,13 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	return inode;
 }
 
+static void fuse_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(fuse_inode_cachep, inode);
+}
+
 static void fuse_destroy_inode(struct inode *inode)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
@@ -106,7 +113,7 @@ static void fuse_destroy_inode(struct inode *inode)
 	BUG_ON(!list_empty(&fi->queued_writes));
 	if (fi->forget_req)
 		fuse_request_free(fi->forget_req);
-	kmem_cache_free(fuse_inode_cachep, inode);
+	call_rcu(&inode->i_rcu, fuse_i_callback);
 }
 
 void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2b2c4997430b..16c2ecac7eb7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1405,11 +1405,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 	return &ip->i_inode;
 }
 
-static void gfs2_destroy_inode(struct inode *inode)
+static void gfs2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(gfs2_inode_cachep, inode);
 }
 
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, gfs2_i_callback);
+}
+
 const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4824c27cebb8..ef4ee5716abe 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfs_destroy_inode(struct inode *inode)
+static void hfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
 }
 
+static void hfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfs_i_callback);
+}
+
 static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 52cc746d3ba3..182e83a9079e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -488,11 +488,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
 	return i ? &i->vfs_inode : NULL;
 }
 
-static void hfsplus_destroy_inode(struct inode *inode)
+static void hfsplus_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
 }
 
+static void hfsplus_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hfsplus_i_callback);
+}
+
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
 static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 39dc505ed273..861113fcfc88 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -247,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
 	}
 }
 
-static void hostfs_destroy_inode(struct inode *inode)
+static void hostfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kfree(HOSTFS_I(inode));
 }
 
+static void hostfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hostfs_i_callback);
+}
+
 static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	const char *root_path = vfs->mnt_sb->s_fs_info;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6c5f01597c3a..49935ba78db8 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void hpfs_destroy_inode(struct inode *inode)
+static void hpfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
+static void hpfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hpfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f702b5f713fc..87ed48e0343d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
 	mntput(ino->i_sb->s_fs_info);
 }
 
-static void hppfs_destroy_inode(struct inode *inode)
+static void hppfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kfree(HPPFS_I(inode));
 }
 
+static void hppfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, hppfs_i_callback);
+}
+
 static const struct super_operations hppfs_sbops = {
 	.alloc_inode	= hppfs_alloc_inode,
 	.destroy_inode	= hppfs_destroy_inode,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a5fe68189eed..9885082b470f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 	return &p->vfs_inode;
 }
 
+static void hugetlbfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+}
+
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
-	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
+	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
 }
 
 static const struct address_space_operations hugetlbfs_aops = {
diff --git a/fs/inode.c b/fs/inode.c
index 5a0a898f55d1..6751dfe8cc06 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -272,6 +272,13 @@ void __destroy_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(__destroy_inode);
 
+static void i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(inode_cachep, inode);
+}
+
 static void destroy_inode(struct inode *inode)
 {
 	BUG_ON(!list_empty(&inode->i_lru));
@@ -279,7 +286,7 @@ static void destroy_inode(struct inode *inode)
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
-		kmem_cache_free(inode_cachep, (inode));
+		call_rcu(&inode->i_rcu, i_callback);
 }
 
 /*
@@ -432,6 +439,7 @@ void end_writeback(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
+	/* don't need i_lock here, no concurrent mods to i_state */
 	inode->i_state = I_FREEING | I_CLEAR;
 }
 EXPORT_SYMBOL(end_writeback);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d204ee4235fd..d8f3a652243d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -81,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void isofs_destroy_inode(struct inode *inode)
+static void isofs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
+static void isofs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, isofs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct iso_inode_info *ei = foo;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index c86041b866a4..853b8e300084 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
 	return &f->vfs_inode;
 }
 
-static void jffs2_destroy_inode(struct inode *inode)
+static void jffs2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
 
+static void jffs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, jffs2_i_callback);
+}
+
 static void jffs2_i_init_once(void *foo)
 {
 	struct jffs2_inode_info *f = foo;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0669fc1cc3bf..b715b0f7bdfd 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
 	return &jfs_inode->vfs_inode;
 }
 
+static void jfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(jfs_inode_cachep, ji);
+}
+
 static void jfs_destroy_inode(struct inode *inode)
 {
 	struct jfs_inode_info *ji = JFS_IP(inode);
@@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
 		ji->active_ag = -1;
 	}
 	spin_unlock_irq(&ji->ag_lock);
-	kmem_cache_free(jfs_inode_cachep, ji);
+	call_rcu(&inode->i_rcu, jfs_i_callback);
 }
 
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098f..03b8c240aeda 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
 	return __logfs_iget(sb, ino);
 }
 
+static void logfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+}
+
 static void __logfs_destroy_inode(struct inode *inode)
 {
 	struct logfs_inode *li = logfs_inode(inode);
 
 	BUG_ON(li->li_block);
 	list_del(&li->li_freeing_list);
-	kmem_cache_free(logfs_inode_cache, li);
+	call_rcu(&inode->i_rcu, logfs_i_callback);
 }
 
 static void logfs_destroy_inode(struct inode *inode)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fb2020858a34..ae0b83f476a6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void minix_destroy_inode(struct inode *inode)
+static void minix_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
+static void minix_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, minix_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8fb93b604e73..60047dbeb38d 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -58,11 +58,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void ncp_destroy_inode(struct inode *inode)
+static void ncp_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
 
+static void ncp_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ncp_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e67e31c73416..017daa3bed38 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1438,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
 	return &nfsi->vfs_inode;
 }
 
-void nfs_destroy_inode(struct inode *inode)
+static void nfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
+void nfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, nfs_i_callback);
+}
+
 static inline void nfs4_init_once(struct nfs_inode *nfsi)
 {
 #ifdef CONFIG_NFS_V4
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index d36fc7ee615f..e2dcc9c733f7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -162,10 +162,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 	return &ii->vfs_inode;
 }
 
-void nilfs_destroy_inode(struct inode *inode)
+static void nilfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
+	INIT_LIST_HEAD(&inode->i_dentry);
+
 	if (mdi) {
 		kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
 		kfree(mdi);
@@ -173,6 +176,11 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
+void nilfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, nilfs_i_callback);
+}
+
 static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc7..a627ed82c0a3 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
 	return NULL;
 }
 
+static void ntfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
+
 void ntfs_destroy_big_inode(struct inode *inode)
 {
 	ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
 	BUG_ON(ni->page);
 	if (!atomic_dec_and_test(&ni->count))
 		BUG();
-	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+	call_rcu(&inode->i_rcu, ntfs_i_callback);
 }
 
 static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2df490a19ed..8c5c0eddc365 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
 	return &ip->ip_vfs_inode;
 }
 
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 }
 
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
+
 static void dlmfs_evict_inode(struct inode *inode)
 {
 	int status;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index cfeab7ce3697..17ff46fa8a10 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -569,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
 	return &oi->vfs_inode;
 }
 
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
 }
 
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
+
 static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
 						unsigned int cbits)
 {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 911e61f348fc..a2a5bff774e3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
 	return &oi->vfs_inode;
 }
 
-static void openprom_destroy_inode(struct inode *inode)
+static void openprom_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(op_inode_cachep, OP_I(inode));
 }
 
+static void openprom_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, openprom_i_callback);
+}
+
 static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 {
 	struct inode *inode;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3ddb6068177c..6bcb926b101b 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -65,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	return inode;
 }
 
-static void proc_destroy_inode(struct inode *inode)
+static void proc_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
+static void proc_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, proc_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fcada42f1aa3..e63b4171d583 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void qnx4_destroy_inode(struct inode *inode)
+static void qnx4_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
+static void qnx4_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, qnx4_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b243117b8752..2575682a9ead 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -529,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void reiserfs_destroy_inode(struct inode *inode)
+static void reiserfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
+static void reiserfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, reiserfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6647f90e55cd..2305e3121cb1 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
 /*
  * return a spent inode to the slab cache
  */
-static void romfs_destroy_inode(struct inode *inode)
+static void romfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
+static void romfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, romfs_i_callback);
+}
+
 /*
  * get filesystem statistics
  */
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 24de30ba34c1..20700b9f2b4c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
 }
 
 
-static void squashfs_destroy_inode(struct inode *inode)
+static void squashfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
 }
 
+static void squashfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, squashfs_i_callback);
+}
+
 
 static struct file_system_type squashfs_fs_type = {
 	.owner = THIS_MODULE,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e6..0630eb969a28 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
 	return &si->vfs_inode;
 }
 
-static void sysv_destroy_inode(struct inode *inode)
+static void sysv_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
+static void sysv_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, sysv_i_callback);
+}
+
 static void init_once(void *p)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e3..6e11c2975dcf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
 	return &ui->vfs_inode;
 };
 
+static void ubifs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(ubifs_inode_slab, ui);
+}
+
 static void ubifs_destroy_inode(struct inode *inode)
 {
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	kfree(ui->data);
-	kmem_cache_free(ubifs_inode_slab, inode);
+	call_rcu(&inode->i_rcu, ubifs_i_callback);
 }
 
 /*
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a5c7c61836a..b539d53320fb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -139,11 +139,18 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void udf_destroy_inode(struct inode *inode)
+static void udf_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
+static void udf_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, udf_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c47daed56da..2c61ac5d4e48 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void ufs_destroy_inode(struct inode *inode)
+static void ufs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
+static void ufs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ufs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8e..d7de5a3f7867 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,6 +91,17 @@ xfs_inode_alloc(
 	return ip;
 }
 
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
 void
 xfs_inode_free(
 	struct xfs_inode	*ip)
@@ -134,7 +145,7 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
-	kmem_zone_free(xfs_inode_zone, ip);
+	call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
 }
 
 /*
-- 
cgit v1.2.2


From ff0c7d15f9787b7e8c601533c015295cc68329f8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:50 +1100
Subject: fs: avoid inode RCU freeing for pseudo fs

Pseudo filesystems that don't put inode on RCU list or reachable by
rcu-walk dentries do not need to RCU free their inodes.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/inode.c | 6 ++++++
 fs/pipe.c  | 6 +++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 6751dfe8cc06..da85e56378f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -257,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
 	return inode;
 }
 
+void free_inode_nonrcu(struct inode *inode)
+{
+	kmem_cache_free(inode_cachep, inode);
+}
+EXPORT_SYMBOL(free_inode_nonrcu);
+
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
diff --git a/fs/pipe.c b/fs/pipe.c
index 04629f36e397..ae3592dcc88c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1253,6 +1253,10 @@ out:
 	return ret;
 }
 
+static const struct super_operations pipefs_ops = {
+	.destroy_inode = free_inode_nonrcu,
+};
+
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
@@ -1262,7 +1266,7 @@ out:
 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+	return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC);
 }
 
 static struct file_system_type pipe_fs_type = {
-- 
cgit v1.2.2


From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:52 +1100
Subject: fs: rcu-walk for path lookup

Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.

This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.

The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
  of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
  not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
  access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
  refcounts are not required for persistence. Also we are free to perform mount
  lookups, and to assume dentry mount points and mount roots are stable up and
  down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
  so we can load this tuple atomically, and also check whether any of its
  members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
  sequence after the child is found in case anything changed in the parent
  during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
  limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.

When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lokup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.

Aside from the final dentry, there are other situations that may be encounted
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).

The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links

In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.

Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c           | 203 ++++++++++++--
 fs/filesystems.c      |   3 +
 fs/namei.c            | 743 ++++++++++++++++++++++++++++++++++++++++----------
 fs/proc/proc_sysctl.c |   4 +
 4 files changed, 794 insertions(+), 159 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index dc0551c9755d..187fea040108 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -152,9 +152,23 @@ static void d_free(struct dentry *dentry)
 		call_rcu(&dentry->d_u.d_rcu, __d_free);
 }
 
+/**
+ * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * After this call, in-progress rcu-walk path lookup will fail. This
+ * should be called after unhashing, and after changing d_inode (if
+ * the dentry has not already been unhashed).
+ */
+static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+{
+	assert_spin_locked(&dentry->d_lock);
+	/* Go through a barrier */
+	write_seqcount_barrier(&dentry->d_seq);
+}
+
 /*
  * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined.
+ * d_iput() operation if defined. Dentry has no refcount
+ * and is unhashed.
  */
 static void dentry_iput(struct dentry * dentry)
 	__releases(dentry->d_lock)
@@ -178,6 +192,28 @@ static void dentry_iput(struct dentry * dentry)
 	}
 }
 
+/*
+ * Release the dentry's inode, using the filesystem
+ * d_iput() operation if defined. dentry remains in-use.
+ */
+static void dentry_unlink_inode(struct dentry * dentry)
+	__releases(dentry->d_lock)
+	__releases(dcache_inode_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	dentry->d_inode = NULL;
+	list_del_init(&dentry->d_alias);
+	dentry_rcuwalk_barrier(dentry);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dcache_inode_lock);
+	if (!inode->i_nlink)
+		fsnotify_inoderemove(inode);
+	if (dentry->d_op && dentry->d_op->d_iput)
+		dentry->d_op->d_iput(dentry, inode);
+	else
+		iput(inode);
+}
+
 /*
  * dentry_lru_(add|del|move_tail) must be called with d_lock held.
  */
@@ -272,6 +308,7 @@ void __d_drop(struct dentry *dentry)
 		spin_lock(&dcache_hash_lock);
 		hlist_del_rcu(&dentry->d_hash);
 		spin_unlock(&dcache_hash_lock);
+		dentry_rcuwalk_barrier(dentry);
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -309,6 +346,7 @@ relock:
 		spin_unlock(&dcache_inode_lock);
 		goto relock;
 	}
+
 	if (ref)
 		dentry->d_count--;
 	/* if dentry was on the d_lru list delete it from there */
@@ -1221,6 +1259,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_count = 1;
 	dentry->d_flags = DCACHE_UNHASHED;
 	spin_lock_init(&dentry->d_lock);
+	seqcount_init(&dentry->d_seq);
 	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
 	dentry->d_sb = NULL;
@@ -1269,6 +1308,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	if (inode)
 		list_add(&dentry->d_alias, &inode->i_dentry);
 	dentry->d_inode = inode;
+	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
@@ -1610,6 +1650,111 @@ err_out:
 }
 EXPORT_SYMBOL(d_add_ci);
 
+/**
+ * __d_lookup_rcu - search for a dentry (racy, store-free)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * @seq: returns d_seq value at the point where the dentry was found
+ * @inode: returns dentry->d_inode when the inode was found valid.
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup_rcu is the dcache lookup function for rcu-walk name
+ * resolution (store-free path walking) design described in
+ * Documentation/filesystems/path-lookup.txt.
+ *
+ * This is not to be used outside core vfs.
+ *
+ * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
+ * held, and rcu_read_lock held. The returned dentry must not be stored into
+ * without taking d_lock and checking d_seq sequence count against @seq
+ * returned here.
+ *
+ * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
+ * function.
+ *
+ * Alternatively, __d_lookup_rcu may be called again to look up the child of
+ * the returned dentry, so long as its parent's seqlock is checked after the
+ * child is looked up. Thus, an interlocking stepping of sequence lock checks
+ * is formed, giving integrity down the path walk.
+ */
+struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
+				unsigned *seq, struct inode **inode)
+{
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct hlist_head *head = d_hash(parent, hash);
+	struct hlist_node *node;
+	struct dentry *dentry;
+
+	/*
+	 * Note: There is significant duplication with __d_lookup_rcu which is
+	 * required to prevent single threaded performance regressions
+	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+	 * Keep the two functions in sync.
+	 */
+
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Carefully use d_seq when comparing a candidate dentry, to avoid
+	 * races with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/vfs/dcache-locking.txt for more details.
+	 */
+	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+		struct inode *i;
+		const char *tname;
+		int tlen;
+
+		if (dentry->d_name.hash != hash)
+			continue;
+
+seqretry:
+		*seq = read_seqcount_begin(&dentry->d_seq);
+		if (dentry->d_parent != parent)
+			continue;
+		if (d_unhashed(dentry))
+			continue;
+		tlen = dentry->d_name.len;
+		tname = dentry->d_name.name;
+		i = dentry->d_inode;
+		/*
+		 * This seqcount check is required to ensure name and
+		 * len are loaded atomically, so as not to walk off the
+		 * edge of memory when walking. If we could load this
+		 * atomically some other way, we could drop this check.
+		 */
+		if (read_seqcount_retry(&dentry->d_seq, *seq))
+			goto seqretry;
+		if (parent->d_op && parent->d_op->d_compare) {
+			if (parent->d_op->d_compare(parent, *inode,
+						dentry, i,
+						tlen, tname, name))
+				continue;
+		} else {
+			if (tlen != len)
+				continue;
+			if (memcmp(tname, str, tlen))
+				continue;
+		}
+		/*
+		 * No extra seqcount check is required after the name
+		 * compare. The caller must perform a seqcount check in
+		 * order to do anything useful with the returned dentry
+		 * anyway.
+		 */
+		*inode = i;
+		return dentry;
+	}
+	return NULL;
+}
+
 /**
  * d_lookup - search for a dentry
  * @parent: parent dentry
@@ -1621,9 +1766,9 @@ EXPORT_SYMBOL(d_add_ci);
  * dentry is returned. The caller must use dput to free the entry when it has
  * finished using it. %NULL is returned if the dentry does not exist.
  */
-struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
 {
-	struct dentry * dentry = NULL;
+	struct dentry *dentry;
 	unsigned seq;
 
         do {
@@ -1636,7 +1781,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
 
-/*
+/**
  * __d_lookup - search for a dentry (racy)
  * @parent: parent dentry
  * @name: qstr of name we wish to find
@@ -1651,16 +1796,23 @@ EXPORT_SYMBOL(d_lookup);
  *
  * __d_lookup callers must be commented.
  */
-struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 {
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
 	struct hlist_head *head = d_hash(parent,hash);
-	struct dentry *found = NULL;
 	struct hlist_node *node;
+	struct dentry *found = NULL;
 	struct dentry *dentry;
 
+	/*
+	 * Note: There is significant duplication with __d_lookup_rcu which is
+	 * required to prevent single threaded performance regressions
+	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+	 * Keep the two functions in sync.
+	 */
+
 	/*
 	 * The hash list is protected using RCU.
 	 *
@@ -1677,24 +1829,15 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 	rcu_read_lock();
 	
 	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
-		struct qstr *qstr;
+		const char *tname;
+		int tlen;
 
 		if (dentry->d_name.hash != hash)
 			continue;
-		if (dentry->d_parent != parent)
-			continue;
 
 		spin_lock(&dentry->d_lock);
-
-		/*
-		 * Recheck the dentry after taking the lock - d_move may have
-		 * changed things. Don't bother checking the hash because
-		 * we're about to compare the whole name anyway.
-		 */
 		if (dentry->d_parent != parent)
 			goto next;
-
-		/* non-existing due to RCU? */
 		if (d_unhashed(dentry))
 			goto next;
 
@@ -1702,16 +1845,17 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		 * It is safe to compare names since d_move() cannot
 		 * change the qstr (protected by d_lock).
 		 */
-		qstr = &dentry->d_name;
+		tlen = dentry->d_name.len;
+		tname = dentry->d_name.name;
 		if (parent->d_op && parent->d_op->d_compare) {
 			if (parent->d_op->d_compare(parent, parent->d_inode,
 						dentry, dentry->d_inode,
-						qstr->len, qstr->name, name))
+						tlen, tname, name))
 				goto next;
 		} else {
-			if (qstr->len != len)
+			if (tlen != len)
 				goto next;
-			if (memcmp(qstr->name, str, len))
+			if (memcmp(tname, str, tlen))
 				goto next;
 		}
 
@@ -1821,7 +1965,7 @@ again:
 			goto again;
 		}
 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
-		dentry_iput(dentry);
+		dentry_unlink_inode(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
 	}
@@ -1884,7 +2028,9 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
 
 	spin_lock(&dentry->d_lock);
+	write_seqcount_begin(&dentry->d_seq);
 	memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
+	write_seqcount_end(&dentry->d_seq);
 	spin_unlock(&dentry->d_lock);
 }
 EXPORT_SYMBOL(dentry_update_name_case);
@@ -1997,6 +2143,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
 
 	dentry_lock_for_move(dentry, target);
 
+	write_seqcount_begin(&dentry->d_seq);
+	write_seqcount_begin(&target->d_seq);
+
 	/* Move the dentry to the target hash queue, if on different bucket */
 	spin_lock(&dcache_hash_lock);
 	if (!d_unhashed(dentry))
@@ -2005,6 +2154,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	spin_unlock(&dcache_hash_lock);
 
 	/* Unhash the target: dput() will then get rid of it */
+	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
 	__d_drop(target);
 
 	list_del(&dentry->d_u.d_child);
@@ -2028,6 +2178,9 @@ void d_move(struct dentry * dentry, struct dentry * target)
 
 	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 
+	write_seqcount_end(&target->d_seq);
+	write_seqcount_end(&dentry->d_seq);
+
 	dentry_unlock_parents_for_move(dentry, target);
 	spin_unlock(&target->d_lock);
 	fsnotify_d_move(dentry);
@@ -2110,6 +2263,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 
 	dentry_lock_for_move(anon, dentry);
 
+	write_seqcount_begin(&dentry->d_seq);
+	write_seqcount_begin(&anon->d_seq);
+
 	dparent = dentry->d_parent;
 	aparent = anon->d_parent;
 
@@ -2130,6 +2286,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	else
 		INIT_LIST_HEAD(&anon->d_u.d_child);
 
+	write_seqcount_end(&dentry->d_seq);
+	write_seqcount_end(&anon->d_seq);
+
 	dentry_unlock_parents_for_move(anon, dentry);
 	spin_unlock(&dentry->d_lock);
 
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
 		tmp = &(*tmp)->next;
 	}
 	write_unlock(&file_systems_lock);
+
+	synchronize_rcu();
+
 	return -EINVAL;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 5642bc2be418..8d3f15b3a541 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
 /*
  * This does basic POSIX ACL permission checking
  */
-static int acl_permission_check(struct inode *inode, int mask,
-		int (*check_acl)(struct inode *inode, int mask))
+static inline int __acl_permission_check(struct inode *inode, int mask,
+		int (*check_acl)(struct inode *inode, int mask), int rcu)
 {
 	umode_t			mode = inode->i_mode;
 
@@ -180,9 +180,13 @@ static int acl_permission_check(struct inode *inode, int mask,
 		mode >>= 6;
 	else {
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
-			int error = check_acl(inode, mask);
-			if (error != -EAGAIN)
-				return error;
+			if (rcu) {
+				return -ECHILD;
+			} else {
+				int error = check_acl(inode, mask);
+				if (error != -EAGAIN)
+					return error;
+			}
 		}
 
 		if (in_group_p(inode->i_gid))
@@ -197,6 +201,12 @@ static int acl_permission_check(struct inode *inode, int mask,
 	return -EACCES;
 }
 
+static inline int acl_permission_check(struct inode *inode, int mask,
+		int (*check_acl)(struct inode *inode, int mask))
+{
+	return __acl_permission_check(inode, mask, check_acl, 0);
+}
+
 /**
  * generic_permission  -  check for access rights on a Posix-like filesystem
  * @inode:	inode to check access rights for
@@ -374,6 +384,173 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
+/**
+ * nameidata_drop_rcu - drop this nameidata out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * @Returns: 0 on success, -ECHLID on failure
+ *
+ * Path walking has 2 modes, rcu-walk and ref-walk (see
+ * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
+ * to drop out of rcu-walk mode and take normal reference counts on dentries
+ * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
+ * refcounts at the last known good point before rcu-walk got stuck, so
+ * ref-walk may continue from there. If this is not successful (eg. a seqcount
+ * has changed), then failure is returned and path walk restarts from the
+ * beginning in ref-walk mode.
+ *
+ * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
+ * ref-walk. Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu(struct nameidata *nd)
+{
+	struct fs_struct *fs = current->fs;
+	struct dentry *dentry = nd->path.dentry;
+
+	BUG_ON(!(nd->flags & LOOKUP_RCU));
+	if (nd->root.mnt) {
+		spin_lock(&fs->lock);
+		if (nd->root.mnt != fs->root.mnt ||
+				nd->root.dentry != fs->root.dentry)
+			goto err_root;
+	}
+	spin_lock(&dentry->d_lock);
+	if (!__d_rcu_to_refcount(dentry, nd->seq))
+		goto err;
+	BUG_ON(nd->inode != dentry->d_inode);
+	spin_unlock(&dentry->d_lock);
+	if (nd->root.mnt) {
+		path_get(&nd->root);
+		spin_unlock(&fs->lock);
+	}
+	mntget(nd->path.mnt);
+
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	nd->flags &= ~LOOKUP_RCU;
+	return 0;
+err:
+	spin_unlock(&dentry->d_lock);
+err_root:
+	if (nd->root.mnt)
+		spin_unlock(&fs->lock);
+	return -ECHILD;
+}
+
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU)
+		return nameidata_drop_rcu(nd);
+	return 0;
+}
+
+/**
+ * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * @dentry: dentry to drop
+ * @Returns: 0 on success, -ECHLID on failure
+ *
+ * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
+ * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
+ * @nd. Must be called from rcu-walk context.
+ */
+static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+{
+	struct fs_struct *fs = current->fs;
+	struct dentry *parent = nd->path.dentry;
+
+	BUG_ON(!(nd->flags & LOOKUP_RCU));
+	if (nd->root.mnt) {
+		spin_lock(&fs->lock);
+		if (nd->root.mnt != fs->root.mnt ||
+				nd->root.dentry != fs->root.dentry)
+			goto err_root;
+	}
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	if (!__d_rcu_to_refcount(dentry, nd->seq))
+		goto err;
+	/*
+	 * If the sequence check on the child dentry passed, then the child has
+	 * not been removed from its parent. This means the parent dentry must
+	 * be valid and able to take a reference at this point.
+	 */
+	BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+	BUG_ON(!parent->d_count);
+	parent->d_count++;
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
+	if (nd->root.mnt) {
+		path_get(&nd->root);
+		spin_unlock(&fs->lock);
+	}
+	mntget(nd->path.mnt);
+
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	nd->flags &= ~LOOKUP_RCU;
+	return 0;
+err:
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
+err_root:
+	if (nd->root.mnt)
+		spin_unlock(&fs->lock);
+	return -ECHILD;
+}
+
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
+{
+	if (nd->flags & LOOKUP_RCU)
+		return nameidata_dentry_drop_rcu(nd, dentry);
+	return 0;
+}
+
+/**
+ * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
+ * @nd: nameidata pathwalk data to drop
+ * @Returns: 0 on success, -ECHLID on failure
+ *
+ * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
+ * nd->path should be the final element of the lookup, so nd->root is discarded.
+ * Must be called from rcu-walk context.
+ */
+static int nameidata_drop_rcu_last(struct nameidata *nd)
+{
+	struct dentry *dentry = nd->path.dentry;
+
+	BUG_ON(!(nd->flags & LOOKUP_RCU));
+	nd->flags &= ~LOOKUP_RCU;
+	nd->root.mnt = NULL;
+	spin_lock(&dentry->d_lock);
+	if (!__d_rcu_to_refcount(dentry, nd->seq))
+		goto err_unlock;
+	BUG_ON(nd->inode != dentry->d_inode);
+	spin_unlock(&dentry->d_lock);
+
+	mntget(nd->path.mnt);
+
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+
+	return 0;
+
+err_unlock:
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	return -ECHILD;
+}
+
+/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
+static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
+{
+	if (likely(nd->flags & LOOKUP_RCU))
+		return nameidata_drop_rcu_last(nd);
+	return 0;
+}
+
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -459,26 +636,40 @@ force_reval_path(struct path *path, struct nameidata *nd)
  * short-cut DAC fails, then call ->permission() to do more
  * complete permission check.
  */
-static int exec_permission(struct inode *inode)
+static inline int __exec_permission(struct inode *inode, int rcu)
 {
 	int ret;
 
 	if (inode->i_op->permission) {
+		if (rcu)
+			return -ECHILD;
 		ret = inode->i_op->permission(inode, MAY_EXEC);
 		if (!ret)
 			goto ok;
 		return ret;
 	}
-	ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+	ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu);
 	if (!ret)
 		goto ok;
+	if (rcu && ret == -ECHILD)
+		return ret;
 
 	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
 		goto ok;
 
 	return ret;
 ok:
-	return security_inode_permission(inode, MAY_EXEC);
+	return security_inode_exec_permission(inode, rcu);
+}
+
+static int exec_permission(struct inode *inode)
+{
+	return __exec_permission(inode, 0);
+}
+
+static int exec_permission_rcu(struct inode *inode)
+{
+	return __exec_permission(inode, 1);
 }
 
 static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +680,20 @@ static __always_inline void set_root(struct nameidata *nd)
 
 static int link_path_walk(const char *, struct nameidata *);
 
+static __always_inline void set_root_rcu(struct nameidata *nd)
+{
+	if (!nd->root.mnt) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		nd->root = fs->root;
+		spin_unlock(&fs->lock);
+	}
+}
+
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
+	int ret;
+
 	if (IS_ERR(link))
 		goto fail;
 
@@ -500,8 +703,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		nd->path = nd->root;
 		path_get(&nd->root);
 	}
+	nd->inode = nd->path.dentry->d_inode;
 
-	return link_path_walk(link, nd);
+	ret = link_path_walk(link, nd);
+	return ret;
 fail:
 	path_put(&nd->path);
 	return PTR_ERR(link);
@@ -516,11 +721,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 
 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 {
-	dput(nd->path.dentry);
-	if (nd->path.mnt != path->mnt) {
-		mntput(nd->path.mnt);
-		nd->path.mnt = path->mnt;
+	if (!(nd->flags & LOOKUP_RCU)) {
+		dput(nd->path.dentry);
+		if (nd->path.mnt != path->mnt)
+			mntput(nd->path.mnt);
 	}
+	nd->path.mnt = path->mnt;
 	nd->path.dentry = path->dentry;
 }
 
@@ -535,9 +741,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
 
 	if (path->mnt != nd->path.mnt) {
 		path_to_nameidata(path, nd);
+		nd->inode = nd->path.dentry->d_inode;
 		dget(dentry);
 	}
 	mntget(path->mnt);
+
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
 	error = PTR_ERR(*p);
@@ -591,6 +799,20 @@ loop:
 	return err;
 }
 
+static int follow_up_rcu(struct path *path)
+{
+	struct vfsmount *parent;
+	struct dentry *mountpoint;
+
+	parent = path->mnt->mnt_parent;
+	if (parent == path->mnt)
+		return 0;
+	mountpoint = path->mnt->mnt_mountpoint;
+	path->dentry = mountpoint;
+	path->mnt = parent;
+	return 1;
+}
+
 int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
@@ -615,6 +837,21 @@ int follow_up(struct path *path)
 /*
  * serialization is taken care of in namespace.c
  */
+static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
+				struct inode **inode)
+{
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted;
+		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+		if (!mounted)
+			return;
+		path->mnt = mounted;
+		path->dentry = mounted->mnt_root;
+		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+		*inode = path->dentry->d_inode;
+	}
+}
+
 static int __follow_mount(struct path *path)
 {
 	int res = 0;
@@ -660,7 +897,42 @@ int follow_down(struct path *path)
 	return 0;
 }
 
-static __always_inline void follow_dotdot(struct nameidata *nd)
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
+	struct inode *inode = nd->inode;
+
+	set_root_rcu(nd);
+
+	while(1) {
+		if (nd->path.dentry == nd->root.dentry &&
+		    nd->path.mnt == nd->root.mnt) {
+			break;
+		}
+		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+			struct dentry *old = nd->path.dentry;
+			struct dentry *parent = old->d_parent;
+			unsigned seq;
+
+			seq = read_seqcount_begin(&parent->d_seq);
+			if (read_seqcount_retry(&old->d_seq, nd->seq))
+				return -ECHILD;
+			inode = parent->d_inode;
+			nd->path.dentry = parent;
+			nd->seq = seq;
+			break;
+		}
+		if (!follow_up_rcu(&nd->path))
+			break;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		inode = nd->path.dentry->d_inode;
+	}
+	__follow_mount_rcu(nd, &nd->path, &inode);
+	nd->inode = inode;
+
+	return 0;
+}
+
+static void follow_dotdot(struct nameidata *nd)
 {
 	set_root(nd);
 
@@ -681,6 +953,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 			break;
 	}
 	follow_mount(&nd->path);
+	nd->inode = nd->path.dentry->d_inode;
 }
 
 /*
@@ -718,18 +991,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
  *  It _is_ time-critical.
  */
 static int do_lookup(struct nameidata *nd, struct qstr *name,
-		     struct path *path)
+			struct path *path, struct inode **inode)
 {
 	struct vfsmount *mnt = nd->path.mnt;
-	struct dentry *dentry, *parent;
+	struct dentry *dentry, *parent = nd->path.dentry;
 	struct inode *dir;
 	/*
 	 * See if the low-level filesystem might want
 	 * to use its own hash..
 	 */
-	if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
-		int err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
-				nd->path.dentry->d_inode, name);
+	if (parent->d_op && parent->d_op->d_hash) {
+		int err = parent->d_op->d_hash(parent, nd->inode, name);
 		if (err < 0)
 			return err;
 	}
@@ -739,21 +1011,48 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 * of a false negative due to a concurrent rename, we're going to
 	 * do the non-racy lookup, below.
 	 */
-	dentry = __d_lookup(nd->path.dentry, name);
-	if (!dentry)
-		goto need_lookup;
+	if (nd->flags & LOOKUP_RCU) {
+		unsigned seq;
+
+		*inode = nd->inode;
+		dentry = __d_lookup_rcu(parent, name, &seq, inode);
+		if (!dentry) {
+			if (nameidata_drop_rcu(nd))
+				return -ECHILD;
+			goto need_lookup;
+		}
+		/* Memory barrier in read_seqcount_begin of child is enough */
+		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+			return -ECHILD;
+
+		nd->seq = seq;
+		if (dentry->d_op && dentry->d_op->d_revalidate) {
+			/* We commonly drop rcu-walk here */
+			if (nameidata_dentry_drop_rcu(nd, dentry))
+				return -ECHILD;
+			goto need_revalidate;
+		}
+		path->mnt = mnt;
+		path->dentry = dentry;
+		__follow_mount_rcu(nd, path, inode);
+	} else {
+		dentry = __d_lookup(parent, name);
+		if (!dentry)
+			goto need_lookup;
 found:
-	if (dentry->d_op && dentry->d_op->d_revalidate)
-		goto need_revalidate;
+		if (dentry->d_op && dentry->d_op->d_revalidate)
+			goto need_revalidate;
 done:
-	path->mnt = mnt;
-	path->dentry = dentry;
-	__follow_mount(path);
+		path->mnt = mnt;
+		path->dentry = dentry;
+		__follow_mount(path);
+		*inode = path->dentry->d_inode;
+	}
 	return 0;
 
 need_lookup:
-	parent = nd->path.dentry;
 	dir = parent->d_inode;
+	BUG_ON(nd->inode != dir);
 
 	mutex_lock(&dir->i_mutex);
 	/*
@@ -815,7 +1114,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
 static int link_path_walk(const char *name, struct nameidata *nd)
 {
 	struct path next;
-	struct inode *inode;
 	int err;
 	unsigned int lookup_flags = nd->flags;
 	
@@ -824,18 +1122,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	if (!*name)
 		goto return_reval;
 
-	inode = nd->path.dentry->d_inode;
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
+		struct inode *inode;
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
 
 		nd->flags |= LOOKUP_CONTINUE;
-		err = exec_permission(inode);
+		if (nd->flags & LOOKUP_RCU) {
+			err = exec_permission_rcu(nd->inode);
+			if (err == -ECHILD) {
+				if (nameidata_drop_rcu(nd))
+					return -ECHILD;
+				goto exec_again;
+			}
+		} else {
+exec_again:
+			err = exec_permission(nd->inode);
+		}
  		if (err)
 			break;
 
@@ -866,37 +1174,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (this.name[0] == '.') switch (this.len) {
 			default:
 				break;
-			case 2:	
+			case 2:
 				if (this.name[1] != '.')
 					break;
-				follow_dotdot(nd);
-				inode = nd->path.dentry->d_inode;
+				if (nd->flags & LOOKUP_RCU) {
+					if (follow_dotdot_rcu(nd))
+						return -ECHILD;
+				} else
+					follow_dotdot(nd);
 				/* fallthrough */
 			case 1:
 				continue;
 		}
 		/* This does the actual lookups.. */
-		err = do_lookup(nd, &this, &next);
+		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-
 		err = -ENOENT;
-		inode = next.dentry->d_inode;
 		if (!inode)
 			goto out_dput;
 
 		if (inode->i_op->follow_link) {
+			/* We commonly drop rcu-walk here */
+			if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+				return -ECHILD;
+			BUG_ON(inode != next.dentry->d_inode);
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
+			nd->inode = nd->path.dentry->d_inode;
 			err = -ENOENT;
-			inode = nd->path.dentry->d_inode;
-			if (!inode)
+			if (!nd->inode)
 				break;
-		} else
+		} else {
 			path_to_nameidata(&next, nd);
+			nd->inode = inode;
+		}
 		err = -ENOTDIR; 
-		if (!inode->i_op->lookup)
+		if (!nd->inode->i_op->lookup)
 			break;
 		continue;
 		/* here ends the main loop */
@@ -911,32 +1226,39 @@ last_component:
 		if (this.name[0] == '.') switch (this.len) {
 			default:
 				break;
-			case 2:	
+			case 2:
 				if (this.name[1] != '.')
 					break;
-				follow_dotdot(nd);
-				inode = nd->path.dentry->d_inode;
+				if (nd->flags & LOOKUP_RCU) {
+					if (follow_dotdot_rcu(nd))
+						return -ECHILD;
+				} else
+					follow_dotdot(nd);
 				/* fallthrough */
 			case 1:
 				goto return_reval;
 		}
-		err = do_lookup(nd, &this, &next);
+		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-		inode = next.dentry->d_inode;
 		if (follow_on_final(inode, lookup_flags)) {
+			if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
+				return -ECHILD;
+			BUG_ON(inode != next.dentry->d_inode);
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
-			inode = nd->path.dentry->d_inode;
-		} else
+			nd->inode = nd->path.dentry->d_inode;
+		} else {
 			path_to_nameidata(&next, nd);
+			nd->inode = inode;
+		}
 		err = -ENOENT;
-		if (!inode)
+		if (!nd->inode)
 			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-			if (!inode->i_op->lookup)
+			if (!nd->inode->i_op->lookup)
 				break;
 		}
 		goto return_base;
@@ -958,6 +1280,8 @@ return_reval:
 		 */
 		if (nd->path.dentry && nd->path.dentry->d_sb &&
 		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+			if (nameidata_drop_rcu_maybe(nd))
+				return -ECHILD;
 			err = -ESTALE;
 			/* Note: we do not d_invalidate() */
 			if (!nd->path.dentry->d_op->d_revalidate(
@@ -965,16 +1289,34 @@ return_reval:
 				break;
 		}
 return_base:
+		if (nameidata_drop_rcu_last_maybe(nd))
+			return -ECHILD;
 		return 0;
 out_dput:
-		path_put_conditional(&next, nd);
+		if (!(nd->flags & LOOKUP_RCU))
+			path_put_conditional(&next, nd);
 		break;
 	}
-	path_put(&nd->path);
+	if (!(nd->flags & LOOKUP_RCU))
+		path_put(&nd->path);
 return_err:
 	return err;
 }
 
+static inline int path_walk_rcu(const char *name, struct nameidata *nd)
+{
+	current->total_link_count = 0;
+
+	return link_path_walk(name, nd);
+}
+
+static inline int path_walk_simple(const char *name, struct nameidata *nd)
+{
+	current->total_link_count = 0;
+
+	return link_path_walk(name, nd);
+}
+
 static int path_walk(const char *name, struct nameidata *nd)
 {
 	struct path save = nd->path;
@@ -1000,6 +1342,88 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return result;
 }
 
+static void path_finish_rcu(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU) {
+		/* RCU dangling. Cancel it. */
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+	if (nd->file)
+		fput(nd->file);
+}
+
+static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+{
+	int retval = 0;
+	int fput_needed;
+	struct file *file;
+
+	nd->last_type = LAST_ROOT; /* if there are only slashes... */
+	nd->flags = flags | LOOKUP_RCU;
+	nd->depth = 0;
+	nd->root.mnt = NULL;
+	nd->file = NULL;
+
+	if (*name=='/') {
+		struct fs_struct *fs = current->fs;
+
+		br_read_lock(vfsmount_lock);
+		rcu_read_lock();
+
+		spin_lock(&fs->lock);
+		nd->root = fs->root;
+		nd->path = nd->root;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		spin_unlock(&fs->lock);
+
+	} else if (dfd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+
+		br_read_lock(vfsmount_lock);
+		rcu_read_lock();
+
+		spin_lock(&fs->lock);
+		nd->path = fs->pwd;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		spin_unlock(&fs->lock);
+	} else {
+		struct dentry *dentry;
+
+		file = fget_light(dfd, &fput_needed);
+		retval = -EBADF;
+		if (!file)
+			goto out_fail;
+
+		dentry = file->f_path.dentry;
+
+		retval = -ENOTDIR;
+		if (!S_ISDIR(dentry->d_inode->i_mode))
+			goto fput_fail;
+
+		retval = file_permission(file, MAY_EXEC);
+		if (retval)
+			goto fput_fail;
+
+		nd->path = file->f_path;
+		if (fput_needed)
+			nd->file = file;
+
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		br_read_lock(vfsmount_lock);
+		rcu_read_lock();
+	}
+	nd->inode = nd->path.dentry->d_inode;
+	return 0;
+
+fput_fail:
+	fput_light(file, fput_needed);
+out_fail:
+	return retval;
+}
+
 static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
@@ -1040,6 +1464,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 
 		fput_light(file, fput_needed);
 	}
+	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
 fput_fail:
@@ -1052,16 +1477,53 @@ out_fail:
 static int do_path_lookup(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
-	int retval = path_init(dfd, name, flags, nd);
-	if (!retval)
-		retval = path_walk(name, nd);
-	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-				nd->path.dentry->d_inode))
-		audit_inode(name, nd->path.dentry);
+	int retval;
+
+	/*
+	 * Path walking is largely split up into 2 different synchronisation
+	 * schemes, rcu-walk and ref-walk (explained in
+	 * Documentation/filesystems/path-lookup.txt). These share much of the
+	 * path walk code, but some things particularly setup, cleanup, and
+	 * following mounts are sufficiently divergent that functions are
+	 * duplicated. Typically there is a function foo(), and its RCU
+	 * analogue, foo_rcu().
+	 *
+	 * -ECHILD is the error number of choice (just to avoid clashes) that
+	 * is returned if some aspect of an rcu-walk fails. Such an error must
+	 * be handled by restarting a traditional ref-walk (which will always
+	 * be able to complete).
+	 */
+	retval = path_init_rcu(dfd, name, flags, nd);
+	if (unlikely(retval))
+		return retval;
+	retval = path_walk_rcu(name, nd);
+	path_finish_rcu(nd);
 	if (nd->root.mnt) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
+
+	if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
+		/* slower, locked walk */
+		if (retval == -ESTALE)
+			flags |= LOOKUP_REVAL;
+		retval = path_init(dfd, name, flags, nd);
+		if (unlikely(retval))
+			return retval;
+		retval = path_walk(name, nd);
+		if (nd->root.mnt) {
+			path_put(&nd->root);
+			nd->root.mnt = NULL;
+		}
+	}
+
+	if (likely(!retval)) {
+		if (unlikely(!audit_dummy_context())) {
+			if (nd->path.dentry && nd->inode)
+				audit_inode(name, nd->path.dentry);
+		}
+	}
+
 	return retval;
 }
 
@@ -1104,10 +1566,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	path_get(&nd->path);
 	nd->root = nd->path;
 	path_get(&nd->root);
+	nd->inode = nd->path.dentry->d_inode;
 
 	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-				nd->path.dentry->d_inode))
+				nd->inode))
 		audit_inode(name, nd->path.dentry);
 
 	path_put(&nd->root);
@@ -1488,6 +1951,7 @@ out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
+
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
@@ -1582,6 +2046,9 @@ exit:
 	return ERR_PTR(error);
 }
 
+/*
+ * Handle O_CREAT case for do_filp_open
+ */
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int acc_mode,
 			    int mode, const char *pathname)
@@ -1603,42 +2070,16 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		}
 		/* fallthrough */
 	case LAST_ROOT:
-		if (open_flag & O_CREAT)
-			goto exit;
-		/* fallthrough */
+		goto exit;
 	case LAST_BIND:
 		audit_inode(pathname, dir);
 		goto ok;
 	}
 
 	/* trailing slashes? */
-	if (nd->last.name[nd->last.len]) {
-		if (open_flag & O_CREAT)
-			goto exit;
-		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
-	}
-
-	/* just plain open? */
-	if (!(open_flag & O_CREAT)) {
-		error = do_lookup(nd, &nd->last, path);
-		if (error)
-			goto exit;
-		error = -ENOENT;
-		if (!path->dentry->d_inode)
-			goto exit_dput;
-		if (path->dentry->d_inode->i_op->follow_link)
-			return NULL;
-		error = -ENOTDIR;
-		if (nd->flags & LOOKUP_DIRECTORY) {
-			if (!path->dentry->d_inode->i_op->lookup)
-				goto exit_dput;
-		}
-		path_to_nameidata(path, nd);
-		audit_inode(pathname, nd->path.dentry);
-		goto ok;
-	}
+	if (nd->last.name[nd->last.len])
+		goto exit;
 
-	/* OK, it's O_CREAT */
 	mutex_lock(&dir->d_inode->i_mutex);
 
 	path->dentry = lookup_hash(nd);
@@ -1709,8 +2150,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		return NULL;
 
 	path_to_nameidata(path, nd);
+	nd->inode = path->dentry->d_inode;
 	error = -EISDIR;
-	if (S_ISDIR(path->dentry->d_inode->i_mode))
+	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
 	filp = finish_open(nd, open_flag, acc_mode);
@@ -1741,7 +2183,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct path path;
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
-	int force_reval = 0;
+	int flags;
 
 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -1770,54 +2212,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	/* find the parent */
-reval:
-	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+	flags = LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			flags |= LOOKUP_EXCL;
+	}
+	if (open_flag & O_DIRECTORY)
+		flags |= LOOKUP_DIRECTORY;
+	if (!(open_flag & O_NOFOLLOW))
+		flags |= LOOKUP_FOLLOW;
+
+	filp = get_empty_filp();
+	if (!filp)
+		return ERR_PTR(-ENFILE);
+
+	filp->f_flags = open_flag;
+	nd.intent.open.file = filp;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
+
+	if (open_flag & O_CREAT)
+		goto creat;
+
+	/* !O_CREAT, simple open */
+	error = do_path_lookup(dfd, pathname, flags, &nd);
+	if (unlikely(error))
+		goto out_filp;
+	error = -ELOOP;
+	if (!(nd.flags & LOOKUP_FOLLOW)) {
+		if (nd.inode->i_op->follow_link)
+			goto out_path;
+	}
+	error = -ENOTDIR;
+	if (nd.flags & LOOKUP_DIRECTORY) {
+		if (!nd.inode->i_op->lookup)
+			goto out_path;
+	}
+	audit_inode(pathname, nd.path.dentry);
+	filp = finish_open(&nd, open_flag, acc_mode);
+	return filp;
+
+creat:
+	/* OK, have to create the file. Find the parent. */
+	error = path_init_rcu(dfd, pathname,
+			LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
 	if (error)
-		return ERR_PTR(error);
-	if (force_reval)
-		nd.flags |= LOOKUP_REVAL;
+		goto out_filp;
+	error = path_walk_rcu(pathname, &nd);
+	path_finish_rcu(&nd);
+	if (unlikely(error == -ECHILD || error == -ESTALE)) {
+		/* slower, locked walk */
+		if (error == -ESTALE) {
+reval:
+			flags |= LOOKUP_REVAL;
+		}
+		error = path_init(dfd, pathname,
+				LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
+		if (error)
+			goto out_filp;
 
-	current->total_link_count = 0;
-	error = link_path_walk(pathname, &nd);
-	if (error) {
-		filp = ERR_PTR(error);
-		goto out;
+		error = path_walk_simple(pathname, &nd);
 	}
-	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+	if (unlikely(error))
+		goto out_filp;
+	if (unlikely(!audit_dummy_context()))
 		audit_inode(pathname, nd.path.dentry);
 
 	/*
 	 * We have the parent and last component.
 	 */
-
-	error = -ENFILE;
-	filp = get_empty_filp();
-	if (filp == NULL)
-		goto exit_parent;
-	nd.intent.open.file = filp;
-	filp->f_flags = open_flag;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
-	nd.flags &= ~LOOKUP_PARENT;
-	nd.flags |= LOOKUP_OPEN;
-	if (open_flag & O_CREAT) {
-		nd.flags |= LOOKUP_CREATE;
-		if (open_flag & O_EXCL)
-			nd.flags |= LOOKUP_EXCL;
-	}
-	if (open_flag & O_DIRECTORY)
-		nd.flags |= LOOKUP_DIRECTORY;
-	if (!(open_flag & O_NOFOLLOW))
-		nd.flags |= LOOKUP_FOLLOW;
+	nd.flags = flags;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
-		struct inode *inode = path.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
 		/* S_ISDIR part is a temporary automount kludge */
-		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
@@ -1838,36 +2310,33 @@ reval:
 			goto exit_dput;
 		error = __do_follow_link(&path, &nd, &cookie);
 		if (unlikely(error)) {
+			if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
+				nd.inode->i_op->put_link(path.dentry, &nd, cookie);
 			/* nd.path had been dropped */
-			if (!IS_ERR(cookie) && inode->i_op->put_link)
-				inode->i_op->put_link(path.dentry, &nd, cookie);
-			path_put(&path);
-			release_open_intent(&nd);
-			filp = ERR_PTR(error);
-			goto out;
+			nd.path = path;
+			goto out_path;
 		}
 		holder = path;
 		nd.flags &= ~LOOKUP_PARENT;
 		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (inode->i_op->put_link)
-			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		if (nd.inode->i_op->put_link)
+			nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
 		path_put(&holder);
 	}
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
-		force_reval = 1;
+	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
 		goto reval;
-	}
 	return filp;
 
 exit_dput:
 	path_put_conditional(&path, &nd);
+out_path:
+	path_put(&nd.path);
+out_filp:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
-exit_parent:
-	path_put(&nd.path);
 	filp = ERR_PTR(error);
 	goto out;
 }
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ae4b0fd9033f..998e3a715bcc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -402,6 +402,10 @@ static int proc_sys_compare(const struct dentry *parent,
 		const struct dentry *dentry, const struct inode *inode,
 		unsigned int len, const char *str, const struct qstr *name)
 {
+	/* Although proc doesn't have negative dentries, rcu-walk means
+	 * that inode here can be NULL */
+	if (!inode)
+		return 0;
 	if (name->len != len)
 		return 1;
 	if (memcmp(name->name, str, len))
-- 
cgit v1.2.2


From c28cc36469554dc55540f059fbdc7fa22a2c31fc Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:53 +1100
Subject: fs: fs_struct use seqlock

Use a seqlock in the fs_struct to enable us to take an atomic copy of the
complete cwd and root paths. Use this in the RCU lookup path to avoid a
thread-shared spinlock in RCU lookup operations.

Multi-threaded apps may now perform path lookups with scalability matching
multi-process apps. Operations such as stat(2) become very scalable for
multi-threaded workload.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/fs_struct.c | 10 ++++++++++
 fs/namei.c     | 34 +++++++++++++++++++++-------------
 2 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3d..60b8531f41c5 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -14,9 +14,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 	struct path old_root;
 
 	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
+	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
@@ -31,9 +33,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 	struct path old_pwd;
 
 	spin_lock(&fs->lock);
+	write_seqcount_begin(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
+	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
@@ -52,6 +56,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		fs = p->fs;
 		if (fs) {
 			spin_lock(&fs->lock);
+			write_seqcount_begin(&fs->seq);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,6 +69,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 				fs->pwd = *new_root;
 				count++;
 			}
+			write_seqcount_end(&fs->seq);
 			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
@@ -88,8 +94,10 @@ void exit_fs(struct task_struct *tsk)
 		int kill;
 		task_lock(tsk);
 		spin_lock(&fs->lock);
+		write_seqcount_begin(&fs->seq);
 		tsk->fs = NULL;
 		kill = !--fs->users;
+		write_seqcount_end(&fs->seq);
 		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
@@ -105,6 +113,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 		fs->users = 1;
 		fs->in_exec = 0;
 		spin_lock_init(&fs->lock);
+		seqcount_init(&fs->seq);
 		fs->umask = old->umask;
 		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
@@ -144,6 +153,7 @@ EXPORT_SYMBOL(current_umask);
 struct fs_struct init_fs = {
 	.users		= 1,
 	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
+	.seq		= SEQCNT_ZERO,
 	.umask		= 0022,
 };
 
diff --git a/fs/namei.c b/fs/namei.c
index 8d3f15b3a541..c731b50a6184 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -684,9 +684,12 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
 {
 	if (!nd->root.mnt) {
 		struct fs_struct *fs = current->fs;
-		spin_lock(&fs->lock);
-		nd->root = fs->root;
-		spin_unlock(&fs->lock);
+		unsigned seq;
+
+		do {
+			seq = read_seqcount_begin(&fs->seq);
+			nd->root = fs->root;
+		} while (read_seqcount_retry(&fs->seq, seq));
 	}
 }
 
@@ -1369,26 +1372,31 @@ static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct n
 
 	if (*name=='/') {
 		struct fs_struct *fs = current->fs;
+		unsigned seq;
 
 		br_read_lock(vfsmount_lock);
 		rcu_read_lock();
 
-		spin_lock(&fs->lock);
-		nd->root = fs->root;
-		nd->path = nd->root;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		spin_unlock(&fs->lock);
+		do {
+			seq = read_seqcount_begin(&fs->seq);
+			nd->root = fs->root;
+			nd->path = nd->root;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+		} while (read_seqcount_retry(&fs->seq, seq));
 
 	} else if (dfd == AT_FDCWD) {
 		struct fs_struct *fs = current->fs;
+		unsigned seq;
 
 		br_read_lock(vfsmount_lock);
 		rcu_read_lock();
 
-		spin_lock(&fs->lock);
-		nd->path = fs->pwd;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		spin_unlock(&fs->lock);
+		do {
+			seq = read_seqcount_begin(&fs->seq);
+			nd->path = fs->pwd;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+		} while (read_seqcount_retry(&fs->seq, seq));
+
 	} else {
 		struct dentry *dentry;
 
@@ -1411,7 +1419,7 @@ static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct n
 		if (fput_needed)
 			nd->file = file;
 
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 		br_read_lock(vfsmount_lock);
 		rcu_read_lock();
 	}
-- 
cgit v1.2.2


From 5f57cbcc02cf18f6b22ef4066bb10afeb8f930ff Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:54 +1100
Subject: fs: dcache remove d_mounted

Rather than keep a d_mounted count in the dentry, set a dentry flag instead.
The flag can be cleared by checking the hash table to see if there are any
mounts left, which is not time critical because it is performed at detach time.

The mounted state of a dentry is only used to speculatively take a look in the
mount hash table if it is set -- before following the mount, vfsmount lock is
taken and mount re-checked without races.

This saves 4 bytes on 32-bit, nothing on 64-bit but it does provide a hole I
might use later (and some configs have larger than 32-bit spinlocks which might
make use of the hole).

Autofs4 conversion and changelog by Ian Kent <raven@themaw.net>:
In autofs4, when expring direct (or offset) mounts we need to ensure that we
block user path walks into the autofs mount, which is covered by another mount.
To do this we clear the mounted status so that follows stop before walking into
the mount and are essentially blocked until the expire is completed. The
automount daemon still finds the correct dentry for the umount due to the
follow mount logic in fs/autofs4/root.c:autofs4_follow_link(), which is set as
an inode operation for direct and offset mounts only and is called following
the lookup that stopped at the covered mount.

At the end of the expire the covering mount probably has gone away so the
mounted status need not be restored. But we need to check this and only restore
the mounted status if the expire failed.

XXX: autofs may not work right if we have other mounts go over the top of it?

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/autofs4/expire.c | 13 +++++++++++--
 fs/dcache.c         |  1 -
 fs/namespace.c      | 29 ++++++++++++++++++++++++++---
 3 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 2f7951d67d16..cc1d01365905 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -295,7 +295,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		if (d_mountpoint(root)) {
 			ino->flags |= AUTOFS_INF_MOUNTPOINT;
-			root->d_mounted--;
+			spin_lock(&root->d_lock);
+			root->d_flags &= ~DCACHE_MOUNTED;
+			spin_unlock(&root->d_lock);
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
 		init_completion(&ino->expire_complete);
@@ -503,7 +505,14 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 		spin_lock(&sbi->fs_lock);
 		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-			sb->s_root->d_mounted++;
+			spin_lock(&sb->s_root->d_lock);
+			/*
+			 * If we haven't been expired away, then reset
+			 * mounted status.
+			 */
+			if (mnt->mnt_parent != mnt)
+				sb->s_root->d_flags |= DCACHE_MOUNTED;
+			spin_unlock(&sb->s_root->d_lock);
 			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
diff --git a/fs/dcache.c b/fs/dcache.c
index 187fea040108..1d5cf511e1c7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1265,7 +1265,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_sb = NULL;
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
-	dentry->d_mounted = 0;
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/namespace.c b/fs/namespace.c
index 3dbfc072ec70..39a7d507fba0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -491,6 +491,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * Clear dentry's mounted state if it has no remaining mounts.
+ * vfsmount_lock must be held for write.
+ */
+static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
+{
+	unsigned u;
+
+	for (u = 0; u < HASH_SIZE; u++) {
+		struct vfsmount *p;
+
+		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
+			if (p->mnt_mountpoint == dentry)
+				return;
+		}
+	}
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags &= ~DCACHE_MOUNTED;
+	spin_unlock(&dentry->d_lock);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -502,7 +523,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 	mnt->mnt_mountpoint = mnt->mnt_root;
 	list_del_init(&mnt->mnt_child);
 	list_del_init(&mnt->mnt_hash);
-	old_path->dentry->d_mounted--;
+	dentry_reset_mounted(old_path->mnt, old_path->dentry);
 }
 
 /*
@@ -513,7 +534,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 {
 	child_mnt->mnt_parent = mntget(mnt);
 	child_mnt->mnt_mountpoint = dget(dentry);
-	dentry->d_mounted++;
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_MOUNTED;
+	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -1073,7 +1096,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p) {
 			p->mnt_parent->mnt_ghosts++;
-			p->mnt_mountpoint->d_mounted--;
+			dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
-- 
cgit v1.2.2


From fb045adb99d9b7c562dc7fef834857f78249daa1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:55 +1100
Subject: fs: dcache reduce branches in lookup path

Reduce some branches and memory accesses in dcache lookup by adding dentry
flags to indicate common d_ops are set, rather than having to check them.
This saves a pointer memory access (dentry->d_op) in common path lookup
situations, and saves another pointer load and branch in cases where we
have d_op but not the particular operation.

Patched with:

git grep -E '[.>]([[:space:]])*d_op([[:space:]])*=' | xargs sed -e 's/\([^\t ]*\)->d_op = \(.*\);/d_set_d_op(\1, \2);/' -e 's/\([^\t ]*\)\.d_op = \(.*\);/d_set_d_op(\&\1, \2);/' -i

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_inode.c       | 26 +++++++++++++-------------
 fs/adfs/dir.c           |  2 +-
 fs/adfs/super.c         |  2 +-
 fs/affs/namei.c         |  2 +-
 fs/affs/super.c         |  2 +-
 fs/afs/dir.c            |  2 +-
 fs/anon_inodes.c        |  2 +-
 fs/autofs4/inode.c      |  2 +-
 fs/autofs4/root.c       | 10 +++++-----
 fs/btrfs/export.c       |  4 ++--
 fs/btrfs/inode.c        |  2 +-
 fs/ceph/dir.c           |  6 +++---
 fs/cifs/dir.c           | 16 ++++++++--------
 fs/cifs/inode.c         |  8 ++++----
 fs/cifs/link.c          |  4 ++--
 fs/cifs/readdir.c       |  4 ++--
 fs/coda/dir.c           |  2 +-
 fs/configfs/dir.c       |  8 ++++----
 fs/dcache.c             | 30 ++++++++++++++++++++++++++----
 fs/ecryptfs/inode.c     |  2 +-
 fs/ecryptfs/main.c      |  4 ++--
 fs/fat/inode.c          |  4 ++--
 fs/fat/namei_msdos.c    |  6 +++---
 fs/fat/namei_vfat.c     |  8 ++++----
 fs/fuse/dir.c           |  2 +-
 fs/fuse/inode.c         |  4 ++--
 fs/gfs2/export.c        |  4 ++--
 fs/gfs2/ops_fstype.c    |  2 +-
 fs/gfs2/ops_inode.c     |  2 +-
 fs/hfs/dir.c            |  2 +-
 fs/hfs/super.c          |  2 +-
 fs/hfsplus/dir.c        |  2 +-
 fs/hfsplus/super.c      |  2 +-
 fs/hostfs/hostfs_kern.c |  2 +-
 fs/hpfs/dentry.c        |  2 +-
 fs/isofs/inode.c        |  2 +-
 fs/isofs/namei.c        |  2 +-
 fs/jfs/namei.c          |  4 ++--
 fs/jfs/super.c          |  2 +-
 fs/libfs.c              |  2 +-
 fs/minix/namei.c        |  2 +-
 fs/namei.c              | 31 ++++++++++++++++++++-----------
 fs/ncpfs/dir.c          |  4 ++--
 fs/ncpfs/inode.c        |  2 +-
 fs/nfs/dir.c            |  6 +++---
 fs/nfs/getroot.c        |  4 ++--
 fs/ocfs2/export.c       |  4 ++--
 fs/ocfs2/namei.c        | 10 +++++-----
 fs/pipe.c               |  2 +-
 fs/proc/base.c          | 12 ++++++------
 fs/proc/generic.c       |  2 +-
 fs/proc/proc_sysctl.c   |  4 ++--
 fs/reiserfs/xattr.c     |  2 +-
 fs/sysfs/dir.c          |  2 +-
 fs/sysv/namei.c         |  2 +-
 fs/sysv/super.c         |  2 +-
 56 files changed, 159 insertions(+), 128 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f6f9081e6d2c..df8bbb358d54 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -635,9 +635,9 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	if (v9ses->cache)
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	else
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
@@ -749,7 +749,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 				err);
 			goto error;
 		}
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
@@ -767,7 +767,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
@@ -956,7 +956,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 				err);
 			goto error;
 		}
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
@@ -973,7 +973,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
@@ -1041,9 +1041,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 inst_out:
 	if (v9ses->cache)
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	else
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 
 	d_add(dentry, inode);
 	return NULL;
@@ -1709,7 +1709,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 					err);
 			goto error;
 		}
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
@@ -1722,7 +1722,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 
@@ -1856,7 +1856,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		ihold(old_dentry->d_inode);
 	}
 
-	dentry->d_op = old_dentry->d_op;
+	d_set_d_op(dentry, old_dentry->d_op);
 	d_instantiate(dentry, old_dentry->d_inode);
 
 	return err;
@@ -1980,7 +1980,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 				err);
 			goto error;
 		}
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
@@ -1996,7 +1996,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index a11e5e102716..bf7693c384f9 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -276,7 +276,7 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct object_info obj;
 	int error;
 
-	dentry->d_op = &adfs_dentry_operations;	
+	d_set_d_op(dentry, &adfs_dentry_operations);
 	lock_kernel();
 	error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
 	if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 47dffc513a26..a4041b52fbca 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -484,7 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		adfs_error(sb, "get root inode failed\n");
 		goto error;
 	} else
-		sb->s_root->d_op = &adfs_dentry_operations;
+		d_set_d_op(sb->s_root, &adfs_dentry_operations);
 	unlock_kernel();
 	return 0;
 
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 5aca08c21100..944a4042fb65 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -240,7 +240,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
-	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
+	d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
 	d_add(dentry, inode);
 	return NULL;
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4c18fcfb0a19..d39081bbe7ce 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -482,7 +482,7 @@ got_root:
 		printk(KERN_ERR "AFFS: Get root inode failed\n");
 		goto out_error;
 	}
-	sb->s_root->d_op = &affs_dentry_operations;
+	d_set_d_op(sb->s_root, &affs_dentry_operations);
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
 	return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c18cde27000..b8bb7e7148d0 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -581,7 +581,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 success:
-	dentry->d_op = &afs_fs_dentry_operations;
+	d_set_d_op(dentry, &afs_fs_dentry_operations);
 
 	d_add(dentry, inode);
 	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 57ce55b2564c..aca8806fa206 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -113,7 +113,7 @@ struct file *anon_inode_getfile(const char *name,
 	 */
 	ihold(anon_inode_inode);
 
-	path.dentry->d_op = &anon_inodefs_dentry_operations;
+	d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
 	d_instantiate(path.dentry, anon_inode_inode);
 
 	error = -ENFILE;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac87e49fa706..a7bdb9dcac84 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -309,7 +309,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	root->d_op = &autofs4_sb_dentry_operations;
+	d_set_d_op(root, &autofs4_sb_dentry_operations);
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 10ca68a96dc7..bfe3f2eb684d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -571,7 +571,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		 * we check for the hashed dentry and return the newly
 		 * hashed dentry.
 		 */
-		dentry->d_op = &autofs4_root_dentry_operations;
+		d_set_d_op(dentry, &autofs4_root_dentry_operations);
 
 		/*
 		 * And we need to ensure that the same dentry is used for
@@ -710,9 +710,9 @@ static int autofs4_dir_symlink(struct inode *dir,
 	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
-		dentry->d_op = &autofs4_root_dentry_operations;
+		d_set_d_op(dentry, &autofs4_root_dentry_operations);
 	else
-		dentry->d_op = &autofs4_dentry_operations;
+		d_set_d_op(dentry, &autofs4_dentry_operations);
 
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
@@ -845,9 +845,9 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
-		dentry->d_op = &autofs4_root_dentry_operations;
+		d_set_d_op(dentry, &autofs4_root_dentry_operations);
 	else
-		dentry->d_op = &autofs4_dentry_operations;
+		d_set_d_op(dentry, &autofs4_dentry_operations);
 
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 659f532d26a0..0ccf9a8afcdf 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -110,7 +110,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 
 	dentry = d_obtain_alias(inode);
 	if (!IS_ERR(dentry))
-		dentry->d_op = &btrfs_dentry_operations;
+		d_set_d_op(dentry, &btrfs_dentry_operations);
 	return dentry;
 fail:
 	srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -225,7 +225,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	key.offset = 0;
 	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
-		dentry->d_op = &btrfs_dentry_operations;
+		d_set_d_op(dentry, &btrfs_dentry_operations);
 	return dentry;
 fail:
 	btrfs_free_path(path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9d2994a42a2..63e4546b478a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4084,7 +4084,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	int index;
 	int ret;
 
-	dentry->d_op = &btrfs_dentry_operations;
+	d_set_d_op(dentry, &btrfs_dentry_operations);
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 58abc3da6111..cc01cf826769 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -42,11 +42,11 @@ int ceph_init_dentry(struct dentry *dentry)
 
 	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
 	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		dentry->d_op = &ceph_dentry_ops;
+		d_set_d_op(dentry, &ceph_dentry_ops);
 	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		dentry->d_op = &ceph_snapdir_dentry_ops;
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
 	else
-		dentry->d_op = &ceph_snap_dentry_ops;
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
 
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 88bfe686ac00..e3b10ca6d453 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -135,9 +135,9 @@ static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct inode *newinode)
 {
 	if (tcon->nocase)
-		direntry->d_op = &cifs_ci_dentry_ops;
+		d_set_d_op(direntry, &cifs_ci_dentry_ops);
 	else
-		direntry->d_op = &cifs_dentry_ops;
+		d_set_d_op(direntry, &cifs_dentry_ops);
 	d_instantiate(direntry, newinode);
 }
 
@@ -421,9 +421,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 		rc = cifs_get_inode_info_unix(&newinode, full_path,
 						inode->i_sb, xid);
 		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
+			d_set_d_op(direntry, &cifs_ci_dentry_ops);
 		else
-			direntry->d_op = &cifs_dentry_ops;
+			d_set_d_op(direntry, &cifs_dentry_ops);
 
 		if (rc == 0)
 			d_instantiate(direntry, newinode);
@@ -604,9 +604,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 	if ((rc == 0) && (newInode != NULL)) {
 		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
+			d_set_d_op(direntry, &cifs_ci_dentry_ops);
 		else
-			direntry->d_op = &cifs_dentry_ops;
+			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, newInode);
 		if (posix_open) {
 			filp = lookup_instantiate_filp(nd, direntry,
@@ -634,9 +634,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		rc = 0;
 		direntry->d_time = jiffies;
 		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
+			d_set_d_op(direntry, &cifs_ci_dentry_ops);
 		else
-			direntry->d_op = &cifs_dentry_ops;
+			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_add(direntry, NULL);
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 99b9a2cc14b7..2a239d878e85 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1319,9 +1319,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	to set uid/gid */
 			inc_nlink(inode);
 			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
+				d_set_d_op(direntry, &cifs_ci_dentry_ops);
 			else
-				direntry->d_op = &cifs_dentry_ops;
+				d_set_d_op(direntry, &cifs_dentry_ops);
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
 			cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1363,9 +1363,9 @@ mkdir_get_info:
 						 inode->i_sb, xid, NULL);
 
 		if (pTcon->nocase)
-			direntry->d_op = &cifs_ci_dentry_ops;
+			d_set_d_op(direntry, &cifs_ci_dentry_ops);
 		else
-			direntry->d_op = &cifs_dentry_ops;
+			d_set_d_op(direntry, &cifs_dentry_ops);
 		d_instantiate(direntry, newinode);
 		 /* setting nlink not necessary except in cases where we
 		  * failed to get it from the server or was set bogus */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 85cdbf831e7b..fe2f6a93c49e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -525,9 +525,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 			      rc);
 		} else {
 			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
+				d_set_d_op(direntry, &cifs_ci_dentry_ops);
 			else
-				direntry->d_op = &cifs_dentry_ops;
+				d_set_d_op(direntry, &cifs_dentry_ops);
 			d_instantiate(direntry, newinode);
 		}
 	}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ee463aeca0b0..ec5b68e3b928 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -103,9 +103,9 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 	}
 
 	if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
-		dentry->d_op = &cifs_ci_dentry_ops;
+		d_set_d_op(dentry, &cifs_ci_dentry_ops);
 	else
-		dentry->d_op = &cifs_dentry_ops;
+		d_set_d_op(dentry, &cifs_dentry_ops);
 
 	alias = d_materialise_unique(dentry, inode);
 	if (alias != NULL) {
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 9e37e8bc9b89..aa40c811f8d2 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -125,7 +125,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 		return ERR_PTR(error);
 
 exit:
-	entry->d_op = &coda_dentry_operations;
+	d_set_d_op(entry, &coda_dentry_operations);
 
 	if (inode && (type & CODA_NOCACHE))
 		coda_flag_inode(inode, C_VATTR | C_PURGE);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index e9acea440ffc..36637a8c1ed3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -442,7 +442,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
 		return error;
 	}
 
-	dentry->d_op = &configfs_dentry_ops;
+	d_set_d_op(dentry, &configfs_dentry_ops);
 	d_rehash(dentry);
 
 	return 0;
@@ -489,7 +489,7 @@ static struct dentry * configfs_lookup(struct inode *dir,
 		 */
 		if (dentry->d_name.len > NAME_MAX)
 			return ERR_PTR(-ENAMETOOLONG);
-		dentry->d_op = &configfs_dentry_ops;
+		d_set_d_op(dentry, &configfs_dentry_ops);
 		d_add(dentry, NULL);
 		return NULL;
 	}
@@ -683,7 +683,7 @@ static int create_default_group(struct config_group *parent_group,
 	ret = -ENOMEM;
 	child = d_alloc(parent, &name);
 	if (child) {
-		child->d_op = &configfs_dentry_ops;
+		d_set_d_op(child, &configfs_dentry_ops);
 		d_add(child, NULL);
 
 		ret = configfs_attach_group(&parent_group->cg_item,
@@ -1681,7 +1681,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	err = -ENOMEM;
 	dentry = d_alloc(configfs_sb->s_root, &name);
 	if (dentry) {
-		dentry->d_op = &configfs_dentry_ops;
+		d_set_d_op(dentry, &configfs_dentry_ops);
 		d_add(dentry, NULL);
 
 		err = configfs_attach_group(sd->s_element, &group->cg_item,
diff --git a/fs/dcache.c b/fs/dcache.c
index 1d5cf511e1c7..f9693da3efbd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -398,7 +398,7 @@ repeat:
 		return;
 	}
 
-	if (dentry->d_op && dentry->d_op->d_delete) {
+	if (dentry->d_flags & DCACHE_OP_DELETE) {
 		if (dentry->d_op->d_delete(dentry))
 			goto kill_it;
 	}
@@ -1301,6 +1301,28 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 }
 EXPORT_SYMBOL(d_alloc_name);
 
+void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+{
+	BUG_ON(dentry->d_op);
+	BUG_ON(dentry->d_flags & (DCACHE_OP_HASH	|
+				DCACHE_OP_COMPARE	|
+				DCACHE_OP_REVALIDATE	|
+				DCACHE_OP_DELETE ));
+	dentry->d_op = op;
+	if (!op)
+		return;
+	if (op->d_hash)
+		dentry->d_flags |= DCACHE_OP_HASH;
+	if (op->d_compare)
+		dentry->d_flags |= DCACHE_OP_COMPARE;
+	if (op->d_revalidate)
+		dentry->d_flags |= DCACHE_OP_REVALIDATE;
+	if (op->d_delete)
+		dentry->d_flags |= DCACHE_OP_DELETE;
+
+}
+EXPORT_SYMBOL(d_set_d_op);
+
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	spin_lock(&dentry->d_lock);
@@ -1731,7 +1753,7 @@ seqretry:
 		 */
 		if (read_seqcount_retry(&dentry->d_seq, *seq))
 			goto seqretry;
-		if (parent->d_op && parent->d_op->d_compare) {
+		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			if (parent->d_op->d_compare(parent, *inode,
 						dentry, i,
 						tlen, tname, name))
@@ -1846,7 +1868,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 		 */
 		tlen = dentry->d_name.len;
 		tname = dentry->d_name.name;
-		if (parent->d_op && parent->d_op->d_compare) {
+		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			if (parent->d_op->d_compare(parent, parent->d_inode,
 						dentry, dentry->d_inode,
 						tlen, tname, name))
@@ -1887,7 +1909,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 	 * routine may choose to leave the hash value unchanged.
 	 */
 	name->hash = full_name_hash(name->name, name->len);
-	if (dir->d_op && dir->d_op->d_hash) {
+	if (dir->d_flags & DCACHE_OP_HASH) {
 		if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
 			goto out;
 	}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5e5c7ec1fc98..f91b35db4c6e 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -441,7 +441,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	struct qstr lower_name;
 	int rc = 0;
 
-	ecryptfs_dentry->d_op = &ecryptfs_dops;
+	d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
 	if ((ecryptfs_dentry->d_name.len == 1
 	     && !strcmp(ecryptfs_dentry->d_name.name, "."))
 	    || (ecryptfs_dentry->d_name.len == 2
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a9dbd62518e6..351038675376 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -189,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 	if (special_file(lower_inode->i_mode))
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
-	dentry->d_op = &ecryptfs_dops;
+	d_set_d_op(dentry, &ecryptfs_dops);
 	fsstack_copy_attr_all(inode, lower_inode);
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
@@ -594,7 +594,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 		deactivate_locked_super(s);
 		goto out;
 	}
-	s->s_root->d_op = &ecryptfs_dops;
+	d_set_d_op(s->s_root, &ecryptfs_dops);
 	s->s_root->d_sb = s;
 	s->s_root->d_parent = s->s_root;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8cccfebee180..206351af7c58 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -750,7 +750,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 	 */
 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
-		result->d_op = sb->s_root->d_op;
+		d_set_d_op(result, sb->s_root->d_op);
 	return result;
 }
 
@@ -800,7 +800,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
 
 	parent = d_obtain_alias(inode);
 	if (!IS_ERR(parent))
-		parent->d_op = sb->s_root->d_op;
+		d_set_d_op(parent, sb->s_root->d_op);
 out:
 	unlock_super(sb);
 
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b3e072d8982..35ffe43afa4b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -227,10 +227,10 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 	}
 out:
 	unlock_super(sb);
-	dentry->d_op = &msdos_dentry_operations;
+	d_set_d_op(dentry, &msdos_dentry_operations);
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry)
-		dentry->d_op = &msdos_dentry_operations;
+		d_set_d_op(dentry, &msdos_dentry_operations);
 	return dentry;
 
 error:
@@ -673,7 +673,7 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	sb->s_flags |= MS_NOATIME;
-	sb->s_root->d_op = &msdos_dentry_operations;
+	d_set_d_op(sb->s_root, &msdos_dentry_operations);
 	unlock_super(sb);
 	return 0;
 }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 4fc06278db48..3be5ed7d859f 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -766,11 +766,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 
 out:
 	unlock_super(sb);
-	dentry->d_op = sb->s_root->d_op;
+	d_set_d_op(dentry, sb->s_root->d_op);
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry) {
-		dentry->d_op = sb->s_root->d_op;
+		d_set_d_op(dentry, sb->s_root->d_op);
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
@@ -1072,9 +1072,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_ci_dentry_ops;
+		d_set_d_op(sb->s_root, &vfat_ci_dentry_ops);
 	else
-		sb->s_root->d_op = &vfat_dentry_ops;
+		d_set_d_op(sb->s_root, &vfat_dentry_ops);
 
 	unlock_super(sb);
 	return 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482d..c9a8a426a395 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -347,7 +347,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	}
 
 	entry = newent ? newent : entry;
-	entry->d_op = &fuse_dentry_operations;
+	d_set_d_op(entry, &fuse_dentry_operations);
 	if (outarg_valid)
 		fuse_change_entry_timeout(entry, &outarg);
 	else
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 44e0a6c57e87..a8b31da19b93 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -626,7 +626,7 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
 
 	entry = d_obtain_alias(inode);
 	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
-		entry->d_op = &fuse_dentry_operations;
+		d_set_d_op(entry, &fuse_dentry_operations);
 		fuse_invalidate_entry_cache(entry);
 	}
 
@@ -728,7 +728,7 @@ static struct dentry *fuse_get_parent(struct dentry *child)
 
 	parent = d_obtain_alias(inode);
 	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
-		parent->d_op = &fuse_dentry_operations;
+		d_set_d_op(parent, &fuse_dentry_operations);
 		fuse_invalidate_entry_cache(parent);
 	}
 
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5ab3839dfcb9..97012ecff560 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -130,7 +130,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
 
 	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
 	if (!IS_ERR(dentry))
-		dentry->d_op = &gfs2_dops;
+		d_set_d_op(dentry, &gfs2_dops);
 	return dentry;
 }
 
@@ -158,7 +158,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 out_inode:
 	dentry = d_obtain_alias(inode);
 	if (!IS_ERR(dentry))
-		dentry->d_op = &gfs2_dops;
+		d_set_d_op(dentry, &gfs2_dops);
 	return dentry;
 }
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b81..2aeabd4218cc 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 		iput(inode);
 		return -ENOMEM;
 	}
-	dentry->d_op = &gfs2_dops;
+	d_set_d_op(dentry, &gfs2_dops);
 	*dptr = dentry;
 	return 0;
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..f28f89796f4d 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -106,7 +106,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode = NULL;
 
-	dentry->d_op = &gfs2_dops;
+	d_set_d_op(dentry, &gfs2_dops);
 
 	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 	if (inode && IS_ERR(inode))
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41b..ea4aefe7c652 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,7 +25,7 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	int res;
 
-	dentry->d_op = &hfs_dentry_operations;
+	d_set_d_op(dentry, &hfs_dentry_operations);
 
 	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
 	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ef4ee5716abe..0bef62aa4f42 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -434,7 +434,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root)
 		goto bail_iput;
 
-	sb->s_root->d_op = &hfs_dentry_operations;
+	d_set_d_op(sb->s_root, &hfs_dentry_operations);
 
 	/* everything's okay */
 	return 0;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9d59c0571f59..ccab87145f7a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
 
 	sb = dir->i_sb;
 
-	dentry->d_op = &hfsplus_dentry_operations;
+	d_set_d_op(dentry, &hfsplus_dentry_operations);
 	dentry->d_fsdata = NULL;
 	hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 182e83a9079e..ddf712e4700e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -419,7 +419,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		err = -ENOMEM;
 		goto cleanup;
 	}
-	sb->s_root->d_op = &hfsplus_dentry_operations;
+	d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
 	str.name = HFSP_HIDDENDIR_NAME;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 861113fcfc88..0bc81cf256b8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -612,7 +612,7 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
 		goto out_put;
 
 	d_add(dentry, inode);
-	dentry->d_op = &hostfs_dentry_ops;
+	d_set_d_op(dentry, &hostfs_dentry_ops);
 	return NULL;
 
  out_put:
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 35526df1fd32..32c13a94e1e9 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -65,5 +65,5 @@ static const struct dentry_operations hpfs_dentry_operations = {
 
 void hpfs_set_dentry_operations(struct dentry *dentry)
 {
-	dentry->d_op = &hpfs_dentry_operations;
+	d_set_d_op(dentry, &hpfs_dentry_operations);
 }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d8f3a652243d..844a7903c72f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -949,7 +949,7 @@ root_found:
 		table += 2;
 	if (opt.check == 'r')
 		table++;
-	s->s_root->d_op = &isofs_dentry_ops[table];
+	d_set_d_op(s->s_root, &isofs_dentry_ops[table]);
 
 	kfree(opt.iocharset);
 
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 715f7d318046..679a849c3b27 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -172,7 +172,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
 	struct inode *inode;
 	struct page *page;
 
-	dentry->d_op = dir->i_sb->s_root->d_op;
+	d_set_d_op(dentry, dir->i_sb->s_root->d_op);
 
 	page = alloc_page(GFP_USER);
 	if (!page)
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 57f90dad8919..a151cbdec626 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1466,7 +1466,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
 	jfs_info("jfs_lookup: name = %s", name);
 
 	if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
-		dentry->d_op = &jfs_ci_dentry_operations;
+		d_set_d_op(dentry, &jfs_ci_dentry_operations);
 
 	if ((name[0] == '.') && (len == 1))
 		inum = dip->i_ino;
@@ -1495,7 +1495,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
 	dentry = d_splice_alias(ip, dentry);
 
 	if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
-		dentry->d_op = &jfs_ci_dentry_operations;
+		d_set_d_op(dentry, &jfs_ci_dentry_operations);
 
 	return dentry;
 }
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b715b0f7bdfd..3150d766e0d4 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -525,7 +525,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_no_root;
 
 	if (sbi->mntflag & JFS_OS2)
-		sb->s_root->d_op = &jfs_ci_dentry_operations;
+		d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
 
 	/* logical blocks are represented by 40 bits in pxd_t, etc. */
 	sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
diff --git a/fs/libfs.c b/fs/libfs.c
index 28b36663c44e..889311e3d06b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -59,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
-	dentry->d_op = &simple_dentry_operations;
+	d_set_d_op(dentry, &simple_dentry_operations);
 	d_add(dentry, NULL);
 	return NULL;
 }
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index c0d35a3accef..1b9e07728a9f 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,7 +23,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
 	struct inode * inode = NULL;
 	ino_t ino;
 
-	dentry->d_op = dir->i_sb->s_root->d_op;
+	d_set_d_op(dentry, dir->i_sb->s_root->d_op);
 
 	if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
 		return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/namei.c b/fs/namei.c
index c731b50a6184..90bd2873e117 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -587,6 +587,17 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
+static inline int need_reval_dot(struct dentry *dentry)
+{
+	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+		return 0;
+
+	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+		return 0;
+
+	return 1;
+}
+
 /*
  * force_reval_path - force revalidation of a dentry
  *
@@ -610,10 +621,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
 
 	/*
 	 * only check on filesystems where it's possible for the dentry to
-	 * become stale. It's assumed that if this flag is set then the
-	 * d_revalidate op will also be defined.
+	 * become stale.
 	 */
-	if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
+	if (!need_reval_dot(dentry))
 		return 0;
 
 	status = dentry->d_op->d_revalidate(dentry, nd);
@@ -1003,7 +1013,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 * See if the low-level filesystem might want
 	 * to use its own hash..
 	 */
-	if (parent->d_op && parent->d_op->d_hash) {
+	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
 		int err = parent->d_op->d_hash(parent, nd->inode, name);
 		if (err < 0)
 			return err;
@@ -1029,7 +1039,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return -ECHILD;
 
 		nd->seq = seq;
-		if (dentry->d_op && dentry->d_op->d_revalidate) {
+		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
 			/* We commonly drop rcu-walk here */
 			if (nameidata_dentry_drop_rcu(nd, dentry))
 				return -ECHILD;
@@ -1043,7 +1053,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 		if (!dentry)
 			goto need_lookup;
 found:
-		if (dentry->d_op && dentry->d_op->d_revalidate)
+		if (dentry->d_flags & DCACHE_OP_REVALIDATE)
 			goto need_revalidate;
 done:
 		path->mnt = mnt;
@@ -1281,8 +1291,7 @@ return_reval:
 		 * We bypassed the ordinary revalidation routines.
 		 * We may need to check the cached dentry for staleness.
 		 */
-		if (nd->path.dentry && nd->path.dentry->d_sb &&
-		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+		if (need_reval_dot(nd->path.dentry)) {
 			if (nameidata_drop_rcu_maybe(nd))
 				return -ECHILD;
 			err = -ESTALE;
@@ -1602,7 +1611,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	 * See if the low-level filesystem might want
 	 * to use its own hash..
 	 */
-	if (base->d_op && base->d_op->d_hash) {
+	if (base->d_flags & DCACHE_OP_HASH) {
 		err = base->d_op->d_hash(base, inode, name);
 		dentry = ERR_PTR(err);
 		if (err < 0)
@@ -1616,7 +1625,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	 */
 	dentry = d_lookup(base, name);
 
-	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+	if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
 		dentry = do_revalidate(dentry, nd);
 
 	if (!dentry)
@@ -2070,7 +2079,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		follow_dotdot(nd);
 		dir = nd->path.dentry;
 	case LAST_DOT:
-		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+		if (need_reval_dot(dir)) {
 			if (!dir->d_op->d_revalidate(dir, nd)) {
 				error = -ESTALE;
 				goto exit;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 0ba3cdc95a44..4b9cbb28d7fa 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 		entry->ino = iunique(dir->i_sb, 2);
 		inode = ncp_iget(dir->i_sb, entry);
 		if (inode) {
-			newdent->d_op = &ncp_dentry_operations;
+			d_set_d_op(newdent, &ncp_dentry_operations);
 			d_instantiate(newdent, inode);
 			if (!hashed)
 				d_rehash(newdent);
@@ -889,7 +889,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
 	if (inode) {
 		ncp_new_dentry(dentry);
 add_entry:
-		dentry->d_op = &ncp_dentry_operations;
+		d_set_d_op(dentry, &ncp_dentry_operations);
 		d_add(dentry, inode);
 		error = 0;
 	}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 60047dbeb38d..0c75a5f3cafd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -717,7 +717,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_root = d_alloc_root(root_inode);
         if (!sb->s_root)
 		goto out_no_root;
-	sb->s_root->d_op = &ncp_root_dentry_operations;
+	d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
 	return 0;
 
 out_no_root:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index eb77471b8823..37e0a8bb077e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -438,7 +438,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	if (dentry == NULL)
 		return;
 
-	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+	d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
 	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
 	if (IS_ERR(inode))
 		goto out;
@@ -1188,7 +1188,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
 		goto out;
 
-	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+	d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
 
 	/*
 	 * If we're doing an exclusive create, optimize away the lookup
@@ -1333,7 +1333,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		res = ERR_PTR(-ENAMETOOLONG);
 		goto out;
 	}
-	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+	d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
 
 	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
 	 * the dentry. */
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b3e36c3430de..c3a5a1126833 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -121,7 +121,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	security_d_instantiate(ret, inode);
 
 	if (ret->d_op == NULL)
-		ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+		d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
 out:
 	nfs_free_fattr(fsinfo.fattr);
 	return ret;
@@ -228,7 +228,7 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	security_d_instantiate(ret, inode);
 
 	if (ret->d_op == NULL)
-		ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+		d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
 
 out:
 	nfs_free_fattr(fattr);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af3..6adafa576065 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -138,7 +138,7 @@ check_gen:
 
 	result = d_obtain_alias(inode);
 	if (!IS_ERR(result))
-		result->d_op = &ocfs2_dentry_ops;
+		d_set_d_op(result, &ocfs2_dentry_ops);
 	else
 		mlog_errno(PTR_ERR(result));
 
@@ -176,7 +176,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 
 	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
 	if (!IS_ERR(parent))
-		parent->d_op = &ocfs2_dentry_ops;
+		d_set_d_op(parent, &ocfs2_dentry_ops);
 
 bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ff5744e1e36f..d14cad6e2e41 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	spin_unlock(&oi->ip_lock);
 
 bail_add:
-	dentry->d_op = &ocfs2_dentry_ops;
+	d_set_d_op(dentry, &ocfs2_dentry_ops);
 	ret = d_splice_alias(inode, dentry);
 
 	if (inode) {
@@ -415,7 +415,7 @@ static int ocfs2_mknod(struct inode *dir,
 		mlog_errno(status);
 		goto leave;
 	}
-	dentry->d_op = &ocfs2_dentry_ops;
+	d_set_d_op(dentry, &ocfs2_dentry_ops);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +743,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	}
 
 	ihold(inode);
-	dentry->d_op = &ocfs2_dentry_ops;
+	d_set_d_op(dentry, &ocfs2_dentry_ops);
 	d_instantiate(dentry, inode);
 
 out_commit:
@@ -1794,7 +1794,7 @@ static int ocfs2_symlink(struct inode *dir,
 		mlog_errno(status);
 		goto bail;
 	}
-	dentry->d_op = &ocfs2_dentry_ops;
+	d_set_d_op(dentry, &ocfs2_dentry_ops);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2459,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 		goto out_commit;
 	}
 
-	dentry->d_op = &ocfs2_dentry_ops;
+	d_set_d_op(dentry, &ocfs2_dentry_ops);
 	d_instantiate(dentry, inode);
 	status = 0;
 out_commit:
diff --git a/fs/pipe.c b/fs/pipe.c
index ae3592dcc88c..01a786567810 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1004,7 +1004,7 @@ struct file *create_write_pipe(int flags)
 		goto err_inode;
 	path.mnt = mntget(pipe_mnt);
 
-	path.dentry->d_op = &pipefs_dentry_operations;
+	d_set_d_op(path.dentry, &pipefs_dentry_operations);
 	d_instantiate(path.dentry, inode);
 
 	err = -ENFILE;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d932fdb6a245..85f0a80912aa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1969,7 +1969,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;
-	dentry->d_op = &tid_fd_dentry_operations;
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (tid_fd_revalidate(dentry, NULL))
@@ -2137,7 +2137,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 	ei->fd = fd;
 	inode->i_mode = S_IFREG | S_IRUSR;
 	inode->i_fop = &proc_fdinfo_file_operations;
-	dentry->d_op = &tid_fd_dentry_operations;
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (tid_fd_revalidate(dentry, NULL))
@@ -2196,7 +2196,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
 	if (p->fop)
 		inode->i_fop = p->fop;
 	ei->op = p->op;
-	dentry->d_op = &pid_dentry_operations;
+	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, NULL))
@@ -2615,7 +2615,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	if (p->fop)
 		inode->i_fop = p->fop;
 	ei->op = p->op;
-	dentry->d_op = &proc_base_dentry_operations;
+	d_set_d_op(dentry, &proc_base_dentry_operations);
 	d_add(dentry, inode);
 	error = NULL;
 out:
@@ -2926,7 +2926,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
 	inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
 		ARRAY_SIZE(tgid_base_stuff));
 
-	dentry->d_op = &pid_dentry_operations;
+	d_set_d_op(dentry, &pid_dentry_operations);
 
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
@@ -3169,7 +3169,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
 	inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
 		ARRAY_SIZE(tid_base_stuff));
 
-	dentry->d_op = &pid_dentry_operations;
+	d_set_d_op(dentry, &pid_dentry_operations);
 
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 1d607be36d95..f766be29d2c7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -439,7 +439,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 out_unlock:
 
 	if (inode) {
-		dentry->d_op = &proc_dentry_operations;
+		d_set_d_op(dentry, &proc_dentry_operations);
 		d_add(dentry, inode);
 		return NULL;
 	}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 998e3a715bcc..35efd85a4d32 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -120,7 +120,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	err = NULL;
-	dentry->d_op = &proc_sys_dentry_operations;
+	d_set_d_op(dentry, &proc_sys_dentry_operations);
 	d_add(dentry, inode);
 
 out:
@@ -201,7 +201,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
 				dput(child);
 				return -ENOMEM;
 			} else {
-				child->d_op = &proc_sys_dentry_operations;
+				d_set_d_op(child, &proc_sys_dentry_operations);
 				d_add(child, inode);
 			}
 		} else {
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5d04a7828e7a..e0f0d7ea10a1 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -990,7 +990,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		dentry->d_op = &xattr_lookup_poison_ops;
+		d_set_d_op(dentry, &xattr_lookup_poison_ops);
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 27e1102e303e..3e076caa8daf 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -701,7 +701,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	/* instantiate and hash dentry */
 	ret = d_find_alias(inode);
 	if (!ret) {
-		dentry->d_op = &sysfs_dentry_ops;
+		d_set_d_op(dentry, &sysfs_dentry_ops);
 		dentry->d_fsdata = sysfs_get(sd);
 		d_add(dentry, inode);
 	} else {
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 7507aeb4c90e..b5e68da2db32 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -48,7 +48,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
 	struct inode * inode = NULL;
 	ino_t ino;
 
-	dentry->d_op = dir->i_sb->s_root->d_op;
+	d_set_d_op(dentry, dir->i_sb->s_root->d_op);
 	if (dentry->d_name.len > SYSV_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 	ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3d9c62be0c10..76712aefc4ab 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -346,7 +346,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
 	if (sbi->s_forced_ro)
 		sb->s_flags |= MS_RDONLY;
 	if (sbi->s_truncate)
-		sb->s_root->d_op = &sysv_dentry_operations;
+		d_set_d_op(sb->s_root, &sysv_dentry_operations);
 	return 1;
 }
 
-- 
cgit v1.2.2


From 44a7d7a878c9cbb74f236ea755b25b6b2e26a9a9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:56 +1100
Subject: fs: cache optimise dentry and inode for rcu-walk

Put dentry and inode fields into top of data structure.  This allows RCU path
traversal to perform an RCU dentry lookup in a path walk by touching only the
first 56 bytes of the dentry.

We also fit in 8 bytes of inline name in the first 64 bytes, so for short
names, only 64 bytes needs to be touched to perform the lookup. We should
get rid of the hash->prev pointer from the first 64 bytes, and fit 16 bytes
of name in there, which will take care of 81% rather than 32% of the kernel
tree.

inode is also rearranged so that RCU lookup will only touch a single cacheline
in the inode, plus one in the i_ops structure.

This is important for directory component lookups in RCU path walking. In the
kernel source, directory names average is around 6 chars, so this works.

When we reach the last element of the lookup, we need to lock it and take its
refcount which requires another cacheline access.

Align dentry and inode operations structs, so members will be at predictable
offsets and we can group common operations into head of structure.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index f9693da3efbd..07d1f6862dc7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -83,8 +83,6 @@ EXPORT_SYMBOL(dcache_inode_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
-#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
-
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
-- 
cgit v1.2.2


From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:57 +1100
Subject: fs: rcu-walk aware d_revalidate method

Require filesystems be aware of .d_revalidate being called in rcu-walk
mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning
-ECHILD from all implementations.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/afs/dir.c          |  4 ++++
 fs/autofs4/root.c     | 13 ++++++++++---
 fs/ceph/dir.c         |  7 ++++++-
 fs/cifs/dir.c         |  3 +++
 fs/coda/dir.c         |  7 ++++++-
 fs/ecryptfs/dentry.c  |  9 +++++++--
 fs/fat/namei_vfat.c   |  6 ++++++
 fs/fuse/dir.c         |  6 +++++-
 fs/gfs2/dentry.c      | 17 ++++++++++++----
 fs/hfs/sysdep.c       |  7 ++++++-
 fs/jfs/namei.c        |  2 ++
 fs/namei.c            | 54 ++++++++++++++++++++++++++++++++++-----------------
 fs/ncpfs/dir.c        |  4 ++++
 fs/ncpfs/inode.c      |  1 +
 fs/nfs/dir.c          |  8 ++++++--
 fs/ocfs2/dcache.c     | 10 ++++++++--
 fs/proc/base.c        | 33 ++++++++++++++++++++++++-------
 fs/proc/proc_sysctl.c |  3 +++
 fs/reiserfs/xattr.c   |  2 ++
 fs/sysfs/dir.c        |  6 +++++-
 20 files changed, 159 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b8bb7e7148d0..34a3263d60a4 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
@@ -607,6 +608,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 	void *dir_version;
 	int ret;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	vnode = AFS_FS_I(dentry->d_inode);
 
 	if (dentry->d_inode)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index bfe3f2eb684d..651e4ef563b1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -315,12 +315,19 @@ out_error:
  */
 static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
-	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	int oz_mode = autofs4_oz_mode(sbi);
+	struct inode *dir;
+	struct autofs_sb_info *sbi;
+	int oz_mode;
 	int flags = nd ? nd->flags : 0;
 	int status = 1;
 
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dir = dentry->d_parent->d_inode;
+	sbi = autofs4_sbi(dir->i_sb);
+	oz_mode = autofs4_oz_mode(sbi);
+
 	/* Pending dentry */
 	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index cc01cf826769..fa7ca04ee816 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -990,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *dir;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dir = dentry->d_parent->d_inode;
 
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e3b10ca6d453..db2a58c00f7b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -656,6 +656,9 @@ lookup_out:
 static int
 cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	if (direntry->d_inode) {
 		if (cifs_revalidate_dentry(direntry))
 			return 0;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index aa40c811f8d2..619a8303766e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,6 +18,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
+#include <linux/namei.h>
 
 #include <asm/uaccess.h>
 
@@ -541,9 +542,13 @@ out:
 /* called when a cache lookup succeeds */
 static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
 {
-	struct inode *inode = de->d_inode;
+	struct inode *inode;
 	struct coda_inode_info *cii;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = de->d_inode;
 	if (!inode || coda_isroot(inode))
 		goto out;
 	if (is_bad_inode(inode))
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f79..6fc4f319b550 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
  */
 static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_mnt;
 	struct dentry *dentry_save;
 	struct vfsmount *vfsmount_save;
 	int rc = 1;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
 		goto out;
 	dentry_save = nd->path.dentry;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 3be5ed7d859f..e3ffc5e12332 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	/* This is not negative dentry. Always valid. */
 	if (dentry->d_inode)
 		return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	/*
 	 * This is not negative dentry. Always valid.
 	 *
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9a8a426a395..07f4b5e675fc 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
  */
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
-	struct inode *inode = entry->d_inode;
+	struct inode *inode;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 
+	inode = entry->d_inode;
 	if (inode && is_bad_inode(inode))
 		return 0;
 	else if (fuse_dentry_time(entry) < get_jiffies_64()) {
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 50497f65763b..4a456338b873 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/namei.h>
 #include <linux/crc32.h>
 
 #include "gfs2.h"
@@ -34,15 +35,23 @@
 
 static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *parent = dget_parent(dentry);
-	struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
-	struct gfs2_inode *dip = GFS2_I(parent->d_inode);
-	struct inode *inode = dentry->d_inode;
+	struct dentry *parent;
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *dip;
+	struct inode *inode;
 	struct gfs2_holder d_gh;
 	struct gfs2_inode *ip = NULL;
 	int error;
 	int had_lock = 0;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	parent = dget_parent(dentry);
+	sdp = GFS2_SB(parent->d_inode);
+	dip = GFS2_I(parent->d_inode);
+	inode = dentry->d_inode;
+
 	if (inode) {
 		if (is_bad_inode(inode))
 			goto invalid;
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219aa..19cf291eb91f 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
  * This file contains the code to do various system dependent things.
  */
 
+#include <linux/namei.h>
 #include "hfs_fs.h"
 
 /* dentry case-handling: just lowercase everything */
 
 static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	int diff;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
 	if(!inode)
 		return 1;
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a151cbdec626..4414e3a42264 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1608,6 +1608,8 @@ out:
 
 static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 	/*
 	 * This is not negative dentry. Always valid.
 	 *
diff --git a/fs/namei.c b/fs/namei.c
index 90bd2873e117..6e275363e89d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -563,10 +563,26 @@ void release_open_intent(struct nameidata *nd)
 		fput(nd->intent.open.file);
 }
 
+static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	int status;
+
+	status = dentry->d_op->d_revalidate(dentry, nd);
+	if (status == -ECHILD) {
+		if (nameidata_dentry_drop_rcu(nd, dentry))
+			return status;
+		status = dentry->d_op->d_revalidate(dentry, nd);
+	}
+
+	return status;
+}
+
 static inline struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	int status = dentry->d_op->d_revalidate(dentry, nd);
+	int status;
+
+	status = d_revalidate(dentry, nd);
 	if (unlikely(status <= 0)) {
 		/*
 		 * The dentry failed validation.
@@ -574,14 +590,20 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * the dentry otherwise d_revalidate is asking us
 		 * to return a fail status.
 		 */
-		if (!status) {
+		if (status < 0) {
+			/* If we're in rcu-walk, we don't have a ref */
+			if (!(nd->flags & LOOKUP_RCU))
+				dput(dentry);
+			dentry = ERR_PTR(status);
+
+		} else {
+			/* Don't d_invalidate in rcu-walk mode */
+			if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
+				return ERR_PTR(-ECHILD);
 			if (!d_invalidate(dentry)) {
 				dput(dentry);
 				dentry = NULL;
 			}
-		} else {
-			dput(dentry);
-			dentry = ERR_PTR(status);
 		}
 	}
 	return dentry;
@@ -626,7 +648,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
 	if (!need_reval_dot(dentry))
 		return 0;
 
-	status = dentry->d_op->d_revalidate(dentry, nd);
+	status = d_revalidate(dentry, nd);
 	if (status > 0)
 		return 0;
 
@@ -1039,12 +1061,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return -ECHILD;
 
 		nd->seq = seq;
-		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
-			/* We commonly drop rcu-walk here */
-			if (nameidata_dentry_drop_rcu(nd, dentry))
-				return -ECHILD;
+		if (dentry->d_flags & DCACHE_OP_REVALIDATE)
 			goto need_revalidate;
-		}
 		path->mnt = mnt;
 		path->dentry = dentry;
 		__follow_mount_rcu(nd, path, inode);
@@ -1292,12 +1310,11 @@ return_reval:
 		 * We may need to check the cached dentry for staleness.
 		 */
 		if (need_reval_dot(nd->path.dentry)) {
-			if (nameidata_drop_rcu_maybe(nd))
-				return -ECHILD;
-			err = -ESTALE;
 			/* Note: we do not d_invalidate() */
-			if (!nd->path.dentry->d_op->d_revalidate(
-					nd->path.dentry, nd))
+			err = d_revalidate(nd->path.dentry, nd);
+			if (!err)
+				err = -ESTALE;
+			if (err < 0)
 				break;
 		}
 return_base:
@@ -2080,10 +2097,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		dir = nd->path.dentry;
 	case LAST_DOT:
 		if (need_reval_dot(dir)) {
-			if (!dir->d_op->d_revalidate(dir, nd)) {
+			error = d_revalidate(nd->path.dentry, nd);
+			if (!error)
 				error = -ESTALE;
+			if (error < 0)
 				goto exit;
-			}
 		}
 		/* fallthrough */
 	case LAST_ROOT:
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 4b9cbb28d7fa..28f136d4aaec 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
 
@@ -308,6 +309,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
 	int res, val = 0, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0c75a5f3cafd..9531c052d7a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -29,6 +29,7 @@
 #include <linux/vfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/namei.h>
 
 #include <linux/ncp_fs.h>
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 37e0a8bb077e..58beace14b19 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -938,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
  * component of the path.
  * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
  */
-static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask)
+static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
+						unsigned int mask)
 {
 	if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
 		return 0;
@@ -1018,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
  * If the parent directory is seen to have changed, we throw out the
  * cached dentry and do a new lookup.
  */
-static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
+static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir;
 	struct inode *inode;
@@ -1027,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	struct nfs_fattr *fattr = NULL;
 	int error;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
 	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 4d54c60ceee4..0310b16a7238 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
 				   struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode;
 	int ret = 0;    /* if all else fails, just return false */
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+	struct ocfs2_super *osb;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	osb = OCFS2_SB(dentry->d_sb);
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 		   dentry->d_name.len, dentry->d_name.name);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 85f0a80912aa..dc5b2fcadc3b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1719,10 +1719,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
  */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
+	struct inode *inode;
+	struct task_struct *task;
 	const struct cred *cred;
 
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+
 	if (task) {
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
@@ -1888,12 +1894,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	int fd = proc_fd(inode);
+	struct inode *inode;
+	struct task_struct *task;
+	int fd;
 	struct files_struct *files;
 	const struct cred *cred;
 
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	fd = proc_fd(inode);
+
 	if (task) {
 		files = get_files_struct(task);
 		if (files) {
@@ -2563,8 +2576,14 @@ static const struct pid_entry proc_base_stuff[] = {
  */
 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
+	struct inode *inode;
+	struct task_struct *task;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
 	if (task) {
 		put_task_struct(task);
 		return 1;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 35efd85a4d32..c9097f43b425 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
+#include <linux/namei.h>
 #include "internal.h"
 
 static const struct dentry_operations proc_sys_dentry_operations;
@@ -389,6 +390,8 @@ static const struct inode_operations proc_sys_dir_operations = {
 
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 	return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e0f0d7ea10a1..9ea22a56cdf1 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -972,6 +972,8 @@ int reiserfs_permission(struct inode *inode, int mask)
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
 	return -EPERM;
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 3e076caa8daf..ea9120a830d8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(const struct dentry *dentry)
 
 static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct sysfs_dirent *sd;
 	int is_dir;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	sd = dentry->d_fsdata;
 	mutex_lock(&sysfs_mutex);
 
 	/* The sysfs dirent has been deleted */
-- 
cgit v1.2.2


From b74c79e99389cd79b31fcc08f82c24e492e63c7e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:58 +1100
Subject: fs: provide rcu-walk aware permission i_ops

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/acl.c                |  5 +++-
 fs/9p/acl.h                |  2 +-
 fs/afs/internal.h          |  2 +-
 fs/afs/security.c          |  7 +++--
 fs/bad_inode.c             |  5 +++-
 fs/btrfs/acl.c             |  6 ++--
 fs/btrfs/ctree.h           |  2 +-
 fs/btrfs/inode.c           |  7 +++--
 fs/ceph/inode.c            | 11 +++++--
 fs/ceph/super.h            |  2 +-
 fs/cifs/cifsfs.c           |  7 +++--
 fs/coda/dir.c              |  5 +++-
 fs/coda/pioctl.c           |  6 ++--
 fs/ecryptfs/inode.c        |  4 ++-
 fs/ext2/acl.c              |  8 +++--
 fs/ext2/acl.h              |  2 +-
 fs/ext3/acl.c              |  8 +++--
 fs/ext3/acl.h              |  2 +-
 fs/ext4/acl.c              |  8 +++--
 fs/ext4/acl.h              |  2 +-
 fs/fuse/dir.c              | 10 +++++--
 fs/generic_acl.c           |  8 +++--
 fs/gfs2/acl.c              |  5 +++-
 fs/gfs2/acl.h              |  2 +-
 fs/gfs2/file.c             |  2 +-
 fs/gfs2/inode.c            |  4 +--
 fs/gfs2/inode.h            |  2 +-
 fs/gfs2/ops_inode.c        | 18 ++++++-----
 fs/hostfs/hostfs_kern.c    |  7 +++--
 fs/hpfs/namei.c            |  2 +-
 fs/jffs2/acl.c             |  5 +++-
 fs/jffs2/acl.h             |  2 +-
 fs/jfs/acl.c               |  8 +++--
 fs/jfs/jfs_acl.h           |  2 +-
 fs/logfs/dir.c             |  6 ++--
 fs/namei.c                 | 75 ++++++++++++++++++----------------------------
 fs/nfs/dir.c               |  7 +++--
 fs/nilfs2/inode.c          | 10 +++++--
 fs/nilfs2/nilfs.h          |  2 +-
 fs/ocfs2/acl.c             |  8 +++--
 fs/ocfs2/acl.h             |  2 +-
 fs/ocfs2/file.c            |  7 +++--
 fs/ocfs2/file.h            |  2 +-
 fs/proc/base.c             |  6 ++--
 fs/proc/proc_sysctl.c      |  5 +++-
 fs/reiserfs/xattr.c        | 14 ++++++---
 fs/sysfs/inode.c           | 11 +++++--
 fs/sysfs/sysfs.h           |  2 +-
 fs/xfs/linux-2.6/xfs_acl.c |  8 +++--
 fs/xfs/xfs_acl.h           |  2 +-
 50 files changed, 215 insertions(+), 130 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 12d602351dbe..6e58c4ca1e6e 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
 	return acl;
 }
 
-int v9fs_check_acl(struct inode *inode, int mask)
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	v9ses = v9fs_inode2v9ses(inode);
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
 		/*
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 59e18c2e8c7e..7ef3ac9f6d95 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl *, struct posix_acl *);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736fc..6d4bc1c8ff60 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -624,7 +624,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct inode *, int, unsigned int);
 
 /*
  * server.c
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e4..f44b9d355377 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
 	int ret;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	_enter("{{%x:%u},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, flags, NULL);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aaddef..9ad2369d9e35 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 	return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask)
+static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	return -EIO;
 }
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..cb518a4b917c 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -185,13 +185,15 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	return ret;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN;
 
-	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index af52f6d7a4d8..a142d204b526 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2544,7 +2544,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
-int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 #else
 #define btrfs_check_acl NULL
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 63e4546b478a..5cf0db0081f9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7211,11 +7211,14 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-	return generic_permission(inode, mask, btrfs_check_acl);
+	return generic_permission(inode, mask, flags, btrfs_check_acl);
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 47f8c8baf3b5..e61de4f7b99d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1781,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
  * Check inode permissions.  We verify we have a valid value for
  * the AUTH cap, then call the generic handler.
  */
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+	int err;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
 
 	if (!err)
-		err = generic_permission(inode, mask, NULL);
+		err = generic_permission(inode, mask, flags, NULL);
 	return err;
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7f01728a4657..4553d8829edb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -665,7 +665,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
 extern void ceph_queue_writeback(struct inode *inode);
 
 extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 			struct kstat *stat);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 223717dcc401..8e21e0fe65d5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -283,10 +283,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int cifs_permission(struct inode *inode, int mask)
+static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct cifs_sb_info *cifs_sb;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	cifs_sb = CIFS_SB(inode->i_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -298,7 +301,7 @@ static int cifs_permission(struct inode *inode, int mask)
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
-		return generic_permission(inode, mask, NULL);
+		return generic_permission(inode, mask, flags, NULL);
 }
 
 static struct kmem_cache *cifs_inode_cachep;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 619a8303766e..29badd91360f 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -135,10 +135,13 @@ exit:
 }
 
 
-int coda_permission(struct inode *inode, int mask)
+int coda_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	int error;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
  
 	if (!mask)
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 2fd89b5c5c7b..741f0bd03918 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@
 #include <linux/coda_psdev.h>
 
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask);
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
 static long coda_pioctl(struct file *filp, unsigned int cmd,
 			unsigned long user_data);
 
@@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = {
 };
 
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask)
+static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index f91b35db4c6e..337352a94751 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -980,8 +980,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 }
 
 static int
-ecryptfs_permission(struct inode *inode, int mask)
+ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
 }
 
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bada..dd9bb3f0c8d7 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,14 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-ext2_check_acl(struct inode *inode, int mask)
+ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac44..c939b7b12099 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext2_check_acl (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int, unsigned int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe212183..9e49da8302d3 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,14 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext3_check_acl(struct inode *inode, int mask)
+ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de9..5faf8048e906 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext3_check_acl (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int, unsigned int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..373dcaeedba9 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 }
 
 int
-ext4_check_acl(struct inode *inode, int mask)
+ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_check_acl(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int, unsigned int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 07f4b5e675fc..f738599fd8cd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -985,12 +985,15 @@ static int fuse_access(struct inode *inode, int mask)
  * access request is sent.  Execute permission is still checked
  * locally based on file mode.
  */
-static int fuse_permission(struct inode *inode, int mask)
+static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool refreshed = false;
 	int err = 0;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	if (!fuse_allow_task(fc, current))
 		return -EACCES;
 
@@ -1005,7 +1008,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	}
 
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-		err = generic_permission(inode, mask, NULL);
+		err = generic_permission(inode, mask, flags, NULL);
 
 		/* If permission is denied, try to refresh file
 		   attributes.  This is also needed, because the root
@@ -1013,7 +1016,8 @@ static int fuse_permission(struct inode *inode, int mask)
 		if (err == -EACCES && !refreshed) {
 			err = fuse_do_getattr(inode, NULL, NULL);
 			if (!err)
-				err = generic_permission(inode, mask, NULL);
+				err = generic_permission(inode, mask,
+							flags, NULL);
 		}
 
 		/* Note: the opposite of the above test does not
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a693..628004282130 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,10 +190,14 @@ generic_acl_chmod(struct inode *inode)
 }
 
 int
-generic_check_acl(struct inode *inode, int mask)
+generic_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
 	if (acl) {
 		int error = posix_acl_permission(inode, acl, mask);
 		posix_acl_release(acl);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943d..7118f1a780a9 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
  * Returns: errno
  */
 
-int gfs2_check_acl(struct inode *inode, int mask)
+int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39ea..a93907c8159b 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
 #define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
 #define GFS2_ACL_MAX_ENTRIES		25
 
-extern int gfs2_check_acl(struct inode *inode, int mask);
+extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
 extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5c..fca6689e12e6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -241,7 +241,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	    !capable(CAP_LINUX_IMMUTABLE))
 		goto out;
 	if (!IS_IMMUTABLE(inode)) {
-		error = gfs2_permission(inode, MAY_WRITE);
+		error = gfs2_permission(inode, MAY_WRITE, 0);
 		if (error)
 			goto out;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e1213f7f9217..6163be9686be 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -509,7 +509,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 	}
 
 	if (!is_root) {
-		error = gfs2_permission(dir, MAY_EXEC);
+		error = gfs2_permission(dir, MAY_EXEC, 0);
 		if (error)
 			goto out;
 	}
@@ -539,7 +539,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 {
 	int error;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d8499fadcc53..732a183efdb3 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -113,7 +113,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 				  const struct qstr *name,
 				  unsigned int mode, dev_t dev);
-extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f28f89796f4d..5c63a06fcd7c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -166,7 +166,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (error)
 		goto out_child;
 
-	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
 	if (error)
 		goto out_gunlock;
 
@@ -289,7 +289,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (IS_APPEND(&dip->i_inode))
 		return -EPERM;
 
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 	if (error)
 		return error;
 
@@ -822,7 +822,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			}
 		}
 	} else {
-		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
+		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
 		if (error)
 			goto out_gunlock;
 
@@ -857,7 +857,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	/* Check out the dir to be renamed */
 
 	if (dir_rename) {
-		error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+		error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
 		if (error)
 			goto out_gunlock;
 	}
@@ -1041,13 +1041,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
  * Returns: errno
  */
 
-int gfs2_permission(struct inode *inode, int mask)
+int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_inode *ip;
 	struct gfs2_holder i_gh;
 	int error;
 	int unlock = 0;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	ip = GFS2_I(inode);
 	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		if (error)
@@ -1058,7 +1062,7 @@ int gfs2_permission(struct inode *inode, int mask)
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
 		error = -EACCES;
 	else
-		error = generic_permission(inode, mask, gfs2_check_acl);
+		error = generic_permission(inode, mask, flags, gfs2_check_acl);
 	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 0bc81cf256b8..d3244d949a4e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -749,11 +749,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	return err;
 }
 
-int hostfs_permission(struct inode *ino, int desired)
+int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
 {
 	char *name;
 	int r = 0, w = 0, x = 0, err;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	if (desired & MAY_READ) r = 1;
 	if (desired & MAY_WRITE) w = 1;
 	if (desired & MAY_EXEC) x = 1;
@@ -768,7 +771,7 @@ int hostfs_permission(struct inode *ino, int desired)
 		err = access_file(name, r, w, x);
 	__putname(name);
 	if (!err)
-		err = generic_permission(ino, desired, NULL);
+		err = generic_permission(ino, desired, flags, NULL);
 	return err;
 }
 
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f65..f4ad9e31ddc4 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
 			unlock_kernel();
 			return -ENOSPC;
 		}
-		if (generic_permission(inode, MAY_WRITE, NULL) ||
+		if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
 			d_rehash(dentry);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bbd..95b79672150a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return rc;
 }
 
-int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int rc;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d9541..3119f59253d3 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-extern int jffs2_check_acl(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int, unsigned int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4e..e5de9422fa32 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
 	return rc;
 }
 
-int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
+	struct posix_acl *acl;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878d..f9285c4900fa 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-int jfs_check_acl(struct inode *, int);
+int jfs_check_acl(struct inode *, int, unsigned int flags);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_acl_chmod(struct inode *inode);
 
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 409dfd65e9a1..f9ddf0c388c8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
 	return __logfs_create(dir, dentry, inode, target, destlen);
 }
 
-static int logfs_permission(struct inode *inode, int mask)
+static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	return generic_permission(inode, mask, NULL);
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+	return generic_permission(inode, mask, flags, NULL);
 }
 
 static int logfs_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/namei.c b/fs/namei.c
index 6e275363e89d..4e957bf744ae 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
 /*
  * This does basic POSIX ACL permission checking
  */
-static inline int __acl_permission_check(struct inode *inode, int mask,
-		int (*check_acl)(struct inode *inode, int mask), int rcu)
+static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
+		int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
 	umode_t			mode = inode->i_mode;
 
@@ -180,13 +180,9 @@ static inline int __acl_permission_check(struct inode *inode, int mask,
 		mode >>= 6;
 	else {
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
-			if (rcu) {
-				return -ECHILD;
-			} else {
-				int error = check_acl(inode, mask);
-				if (error != -EAGAIN)
-					return error;
-			}
+			int error = check_acl(inode, mask, flags);
+			if (error != -EAGAIN)
+				return error;
 		}
 
 		if (in_group_p(inode->i_gid))
@@ -201,32 +197,31 @@ static inline int __acl_permission_check(struct inode *inode, int mask,
 	return -EACCES;
 }
 
-static inline int acl_permission_check(struct inode *inode, int mask,
-		int (*check_acl)(struct inode *inode, int mask))
-{
-	return __acl_permission_check(inode, mask, check_acl, 0);
-}
-
 /**
- * generic_permission  -  check for access rights on a Posix-like filesystem
+ * generic_permission -  check for access rights on a Posix-like filesystem
  * @inode:	inode to check access rights for
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  * @check_acl:	optional callback to check for Posix ACLs
+ * @flags	IPERM_FLAG_ flags.
  *
  * Used to check for read/write/execute permissions on a file.
  * We use "fsuid" for this, letting us set arbitrary permissions
  * for filesystem access without changing the "normal" uids which
- * are used for other things..
+ * are used for other things.
+ *
+ * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
+ * request cannot be satisfied (eg. requires blocking or too much complexity).
+ * It would then be called again in ref-walk mode.
  */
-int generic_permission(struct inode *inode, int mask,
-		int (*check_acl)(struct inode *inode, int mask))
+int generic_permission(struct inode *inode, int mask, unsigned int flags,
+	int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
 {
 	int ret;
 
 	/*
 	 * Do the basic POSIX ACL permission checks.
 	 */
-	ret = acl_permission_check(inode, mask, check_acl);
+	ret = acl_permission_check(inode, mask, flags, check_acl);
 	if (ret != -EACCES)
 		return ret;
 
@@ -281,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
 	}
 
 	if (inode->i_op->permission)
-		retval = inode->i_op->permission(inode, mask);
+		retval = inode->i_op->permission(inode, mask, 0);
 	else
-		retval = generic_permission(inode, mask, inode->i_op->check_acl);
+		retval = generic_permission(inode, mask, 0,
+				inode->i_op->check_acl);
 
 	if (retval)
 		return retval;
@@ -668,22 +664,19 @@ force_reval_path(struct path *path, struct nameidata *nd)
  * short-cut DAC fails, then call ->permission() to do more
  * complete permission check.
  */
-static inline int __exec_permission(struct inode *inode, int rcu)
+static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
 	int ret;
 
 	if (inode->i_op->permission) {
-		if (rcu)
-			return -ECHILD;
-		ret = inode->i_op->permission(inode, MAY_EXEC);
-		if (!ret)
-			goto ok;
-		return ret;
+		ret = inode->i_op->permission(inode, MAY_EXEC, flags);
+	} else {
+		ret = acl_permission_check(inode, MAY_EXEC, flags,
+				inode->i_op->check_acl);
 	}
-	ret = __acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl, rcu);
-	if (!ret)
+	if (likely(!ret))
 		goto ok;
-	if (rcu && ret == -ECHILD)
+	if (ret == -ECHILD)
 		return ret;
 
 	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
@@ -691,17 +684,7 @@ static inline int __exec_permission(struct inode *inode, int rcu)
 
 	return ret;
 ok:
-	return security_inode_exec_permission(inode, rcu);
-}
-
-static int exec_permission(struct inode *inode)
-{
-	return __exec_permission(inode, 0);
-}
-
-static int exec_permission_rcu(struct inode *inode)
-{
-	return __exec_permission(inode, 1);
+	return security_inode_exec_permission(inode, flags);
 }
 
 static __always_inline void set_root(struct nameidata *nd)
@@ -1165,7 +1148,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 		nd->flags |= LOOKUP_CONTINUE;
 		if (nd->flags & LOOKUP_RCU) {
-			err = exec_permission_rcu(nd->inode);
+			err = exec_permission(nd->inode, IPERM_FLAG_RCU);
 			if (err == -ECHILD) {
 				if (nameidata_drop_rcu(nd))
 					return -ECHILD;
@@ -1173,7 +1156,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			}
 		} else {
 exec_again:
-			err = exec_permission(nd->inode);
+			err = exec_permission(nd->inode, 0);
 		}
  		if (err)
 			break;
@@ -1620,7 +1603,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	struct dentry *dentry;
 	int err;
 
-	err = exec_permission(inode);
+	err = exec_permission(inode, 0);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58beace14b19..d33da530097a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2189,11 +2189,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
 	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
 
-int nfs_permission(struct inode *inode, int mask)
+int nfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct rpc_cred *cred;
 	int res = 0;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
 	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2241,7 +2244,7 @@ out:
 out_notsup:
 	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (res == 0)
-		res = generic_permission(inode, mask, NULL);
+		res = generic_permission(inode, mask, flags, NULL);
 	goto out;
 }
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 71d4bc8464e0..77b48c8fab17 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -785,15 +785,19 @@ out_err:
 	return err;
 }
 
-int nilfs_permission(struct inode *inode, int mask)
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	struct nilfs_root *root = NILFS_I(inode)->i_root;
+	struct nilfs_root *root;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 
+	root = NILFS_I(inode)->i_root;
 	if ((mask & MAY_WRITE) && root &&
 	    root->cno != NILFS_CPTREE_CURRENT_CNO)
 		return -EROFS; /* snapshot is not writable */
 
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, flags, NULL);
 }
 
 int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f7560da5a567..0ca98823db59 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
-int nilfs_permission(struct inode *inode, int mask);
+int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
 extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe1..704f6b1742f3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
-int ocfs2_check_acl(struct inode *inode, int mask)
+int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_super *osb;
 	struct buffer_head *di_bh = NULL;
 	struct posix_acl *acl;
 	int ret = -EAGAIN;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	osb = OCFS2_SB(inode->i_sb);
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
 		return ret;
 
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f05853..4fe7c9cf4bfb 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
 	__le32 e_id;
 };
 
-extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_check_acl(struct inode *, int, unsigned int);
 extern int ocfs2_acl_chmod(struct inode *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
 			  struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f6cba566429d..bdadbae09094 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1307,10 +1307,13 @@ bail:
 	return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask)
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	int ret;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	mlog_entry_void();
 
 	ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		goto out;
 	}
 
-	ret = generic_permission(inode, mask, ocfs2_check_acl);
+	ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
 
 	ocfs2_inode_unlock(inode, 0);
 out:
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..f5afbbef6703 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask);
+int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
 
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dc5b2fcadc3b..b953d41d9abf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2114,11 +2114,13 @@ static const struct file_operations proc_fd_operations = {
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
-static int proc_fd_permission(struct inode *inode, int mask)
+static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	int rv;
 
-	rv = generic_permission(inode, mask, NULL);
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+	rv = generic_permission(inode, mask, flags, NULL);
 	if (rv == 0)
 		return 0;
 	if (task_pid(current) == proc_pid(inode))
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c9097f43b425..09a1f92a34ef 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -295,7 +295,7 @@ out:
 	return ret;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask)
+static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
 {
 	/*
 	 * sysctl entries that are not writeable,
@@ -305,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	struct ctl_table *table;
 	int error;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	/* Executable files are not allowed under /proc/sys/ */
 	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
 		return -EACCES;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 9ea22a56cdf1..3cfb2e933644 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -870,11 +870,14 @@ out:
 	return err;
 }
 
-static int reiserfs_check_acl(struct inode *inode, int mask)
+static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	int error = -EAGAIN; /* do regular unix permission checks by default */
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
 
 	if (acl) {
@@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
 	return 0;
 }
 
-int reiserfs_permission(struct inode *inode, int mask)
+int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
 	/*
 	 * We don't do permission checks on the internal objects.
 	 * Permissions are determined by the "owning" object.
@@ -965,9 +970,10 @@ int reiserfs_permission(struct inode *inode, int mask)
 	 * Stat data v1 doesn't support ACLs.
 	 */
 	if (get_inode_sd_version(inode) != STAT_DATA_V1)
-		return generic_permission(inode, mask, reiserfs_check_acl);
+		return generic_permission(inode, mask, flags,
+					reiserfs_check_acl);
 #endif
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, flags, NULL);
 }
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba33..30ac27345586 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -348,13 +348,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 		return -ENOENT;
 }
 
-int sysfs_permission(struct inode *inode, int mask)
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	struct sysfs_dirent *sd = inode->i_private;
+	struct sysfs_dirent *sd;
+
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	sd = inode->i_private;
 
 	mutex_lock(&sysfs_mutex);
 	sysfs_refresh_inode(sd, inode);
 	mutex_unlock(&sysfs_mutex);
 
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, flags, NULL);
 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e956..ffaaa816bfba 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -200,7 +200,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
 void sysfs_evict_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
+int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..4b11eaf6a580 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,16 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 int
-xfs_check_acl(struct inode *inode, int mask)
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct xfs_inode *ip = XFS_I(inode);
+	struct xfs_inode *ip;
 	struct posix_acl *acl;
 	int error = -EAGAIN;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	ip = XFS_I(inode);
 	trace_xfs_check_acl(ip);
 
 	/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
-- 
cgit v1.2.2


From 1e1743ebe35ec7e3c1fa732408358fbc614cbbe5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:49:59 +1100
Subject: fs: provide simple rcu-walk generic_check_acl implementation

This simple implementation just checks for no ACLs on the inode, and
if so, then the rcu-walk may proceed, otherwise fail it.

This could easily be extended to put acls under RCU and check them
under seqlock, if need be. But this implementation is enough to show
the rcu-walk aware permissions code for path lookups is working, and
will handle cases where there are no ACLs or ACLs in just the final
element.

This patch implicity converts tmpfs to rcu-aware permission check.
Subsequent patches onvert ext*, xfs, and, btrfs. Each of these uses
acl/permission code in a different way, so convert them all to provide
templates and proof of concept.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/generic_acl.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 628004282130..06c48a891832 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -192,16 +192,18 @@ generic_acl_chmod(struct inode *inode)
 int
 generic_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl;
-
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
-
-	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
-	if (acl) {
-		int error = posix_acl_permission(inode, acl, mask);
-		posix_acl_release(acl);
-		return error;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+	} else {
+		struct posix_acl *acl;
+
+		acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+		if (acl) {
+			int error = posix_acl_permission(inode, acl, mask);
+			posix_acl_release(acl);
+			return error;
+		}
 	}
 	return -EAGAIN;
 }
-- 
cgit v1.2.2


From 73598611ade7c85f0c3d52ba5130103f6709c6d3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:00 +1100
Subject: ext2,3,4: provide simple rcu-walk ACL implementation

This simple implementation just checks for no ACLs on the inode, and
if so, then the rcu-walk may proceed, otherwise fail it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/ext2/acl.c | 7 +++++--
 fs/ext3/acl.c | 7 +++++--
 fs/ext4/acl.c | 7 +++++--
 3 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index dd9bb3f0c8d7..7b4180554a62 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -236,8 +236,11 @@ ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
 	acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 9e49da8302d3..e4fa49e6c539 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -244,8 +244,11 @@ ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
 	acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 373dcaeedba9..e0270d1f8d82 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -242,8 +242,11 @@ ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
 	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
-- 
cgit v1.2.2


From 258a5aa8dfc6294f5f7df892023ee4d3e57c9841 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:01 +1100
Subject: btrfs: provide simple rcu-walk ACL implementation

This simple implementation just checks for no ACLs on the inode, and
if so, then the rcu-walk may proceed, otherwise fail it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/btrfs/acl.c   | 21 ++++++++++++---------
 fs/btrfs/inode.c |  3 ---
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cb518a4b917c..6ae2c8cac9d5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -187,18 +187,21 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 
 int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-	struct posix_acl *acl;
 	int error = -EAGAIN;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			error = -ECHILD;
 
-	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl) {
-		error = posix_acl_permission(inode, acl, mask);
-		posix_acl_release(acl);
+	} else {
+		struct posix_acl *acl;
+		acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			error = posix_acl_permission(inode, acl, mask);
+			posix_acl_release(acl);
+		}
 	}
 
 	return error;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5cf0db0081f9..a0ff46a47895 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7213,9 +7213,6 @@ static int btrfs_set_page_dirty(struct page *page)
 
 static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
-
 	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
 	return generic_permission(inode, mask, flags, btrfs_check_acl);
-- 
cgit v1.2.2


From 880566e17c4fce0d998d92610d32fcb9506aa6dd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:02 +1100
Subject: xfs: provide simple rcu-walk ACL implementation

This simple implementation just checks for no ACLs on the inode, and
if so, then the rcu-walk may proceed, otherwise fail it.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/xfs/linux-2.6/xfs_acl.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 4b11eaf6a580..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,9 +225,6 @@ xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 	struct posix_acl *acl;
 	int error = -EAGAIN;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
-
 	ip = XFS_I(inode);
 	trace_xfs_check_acl(ip);
 
@@ -238,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 	if (!XFS_IFORK_Q(ip))
 		return -EAGAIN;
 
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
+
 	acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
-- 
cgit v1.2.2


From ceb5bdc2d246f6d81cf61ed70f325308a11821d2 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:05 +1100
Subject: fs: dcache per-bucket dcache hash locking

We can turn the dcache hash locking from a global dcache_hash_lock into
per-bucket locking.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 133 +++++++++++++++++++++++++++++++++++++-----------------------
 fs/super.c  |   3 +-
 2 files changed, 85 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 07d1f6862dc7..9f04e1ba75b7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,14 +33,18 @@
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
 #include <linux/hardirq.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
 
 /*
  * Usage:
  * dcache_inode_lock protects:
  *   - i_dentry, d_alias, d_inode
- * dcache_hash_lock protects:
- *   - the dcache hash table, s_anon lists
+ * dcache_hash_bucket lock protects:
+ *   - the dcache hash table
+ * s_anon bl list spinlock protects:
+ *   - the s_anon list (see __d_drop)
  * dcache_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
@@ -57,7 +61,8 @@
  * dcache_inode_lock
  *   dentry->d_lock
  *     dcache_lru_lock
- *     dcache_hash_lock
+ *     dcache_hash_bucket lock
+ *     s_anon lock
  *
  * If there is an ancestor relationship:
  * dentry->d_parent->...->d_parent->d_lock
@@ -74,7 +79,6 @@ int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_hash_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
@@ -96,7 +100,29 @@ static struct kmem_cache *dentry_cache __read_mostly;
 
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
-static struct hlist_head *dentry_hashtable __read_mostly;
+
+struct dcache_hash_bucket {
+	struct hlist_bl_head head;
+};
+static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
+
+static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
+					unsigned long hash)
+{
+	hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
+	hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
+	return dentry_hashtable + (hash & D_HASHMASK);
+}
+
+static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
+{
+	bit_spin_lock(0, (unsigned long *)&b->head.first);
+}
+
+static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
+{
+	__bit_spin_unlock(0, (unsigned long *)&b->head.first);
+}
 
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
@@ -144,7 +170,7 @@ static void d_free(struct dentry *dentry)
 		dentry->d_op->d_release(dentry);
 
 	/* if dentry was never inserted into hash, immediate free is OK */
-	if (hlist_unhashed(&dentry->d_hash))
+	if (hlist_bl_unhashed(&dentry->d_hash))
 		__d_free(&dentry->d_u.d_rcu);
 	else
 		call_rcu(&dentry->d_u.d_rcu, __d_free);
@@ -302,11 +328,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 void __d_drop(struct dentry *dentry)
 {
 	if (!(dentry->d_flags & DCACHE_UNHASHED)) {
-		dentry->d_flags |= DCACHE_UNHASHED;
-		spin_lock(&dcache_hash_lock);
-		hlist_del_rcu(&dentry->d_hash);
-		spin_unlock(&dcache_hash_lock);
-		dentry_rcuwalk_barrier(dentry);
+		if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
+			bit_spin_lock(0,
+				(unsigned long *)&dentry->d_sb->s_anon.first);
+			dentry->d_flags |= DCACHE_UNHASHED;
+			hlist_bl_del_init(&dentry->d_hash);
+			__bit_spin_unlock(0,
+				(unsigned long *)&dentry->d_sb->s_anon.first);
+		} else {
+			struct dcache_hash_bucket *b;
+			b = d_hash(dentry->d_parent, dentry->d_name.hash);
+			spin_lock_bucket(b);
+			/*
+			 * We may not actually need to put DCACHE_UNHASHED
+			 * manipulations under the hash lock, but follow
+			 * the principle of least surprise.
+			 */
+			dentry->d_flags |= DCACHE_UNHASHED;
+			hlist_bl_del_rcu(&dentry->d_hash);
+			spin_unlock_bucket(b);
+			dentry_rcuwalk_barrier(dentry);
+		}
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -961,8 +1003,8 @@ void shrink_dcache_for_umount(struct super_block *sb)
 	spin_unlock(&dentry->d_lock);
 	shrink_dcache_for_umount_subtree(dentry);
 
-	while (!hlist_empty(&sb->s_anon)) {
-		dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
+	while (!hlist_bl_empty(&sb->s_anon)) {
+		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
 		shrink_dcache_for_umount_subtree(dentry);
 	}
 }
@@ -1263,7 +1305,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_sb = NULL;
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
-	INIT_HLIST_NODE(&dentry->d_hash);
+	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
 	INIT_LIST_HEAD(&dentry->d_alias);
@@ -1459,14 +1501,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
 
-static inline struct hlist_head *d_hash(struct dentry *parent,
-					unsigned long hash)
-{
-	hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
-	hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
-	return dentry_hashtable + (hash & D_HASHMASK);
-}
-
 /**
  * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
@@ -1521,11 +1555,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	tmp->d_sb = inode->i_sb;
 	tmp->d_inode = inode;
 	tmp->d_flags |= DCACHE_DISCONNECTED;
-	tmp->d_flags &= ~DCACHE_UNHASHED;
 	list_add(&tmp->d_alias, &inode->i_dentry);
-	spin_lock(&dcache_hash_lock);
-	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
-	spin_unlock(&dcache_hash_lock);
+	bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
+	tmp->d_flags &= ~DCACHE_UNHASHED;
+	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
+	__bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
 	spin_unlock(&tmp->d_lock);
 	spin_unlock(&dcache_inode_lock);
 
@@ -1567,7 +1601,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-			/* already taking dcache_inode_lock, so d_add() by hand */
+			/* already got dcache_inode_lock, so d_add() by hand */
 			__d_instantiate(dentry, inode);
 			spin_unlock(&dcache_inode_lock);
 			security_d_instantiate(dentry, inode);
@@ -1702,8 +1736,8 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
-	struct hlist_head *head = d_hash(parent, hash);
-	struct hlist_node *node;
+	struct dcache_hash_bucket *b = d_hash(parent, hash);
+	struct hlist_bl_node *node;
 	struct dentry *dentry;
 
 	/*
@@ -1726,7 +1760,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
 	 *
 	 * See Documentation/vfs/dcache-locking.txt for more details.
 	 */
-	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+	hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
 		struct inode *i;
 		const char *tname;
 		int tlen;
@@ -1820,8 +1854,8 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
-	struct hlist_head *head = d_hash(parent,hash);
-	struct hlist_node *node;
+	struct dcache_hash_bucket *b = d_hash(parent, hash);
+	struct hlist_bl_node *node;
 	struct dentry *found = NULL;
 	struct dentry *dentry;
 
@@ -1847,7 +1881,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 	 */
 	rcu_read_lock();
 	
-	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+	hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
 		const char *tname;
 		int tlen;
 
@@ -1998,11 +2032,13 @@ again:
 }
 EXPORT_SYMBOL(d_delete);
 
-static void __d_rehash(struct dentry * entry, struct hlist_head *list)
+static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
 {
-
+	BUG_ON(!d_unhashed(entry));
+	spin_lock_bucket(b);
  	entry->d_flags &= ~DCACHE_UNHASHED;
- 	hlist_add_head_rcu(&entry->d_hash, list);
+	hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
+	spin_unlock_bucket(b);
 }
 
 static void _d_rehash(struct dentry * entry)
@@ -2020,9 +2056,7 @@ static void _d_rehash(struct dentry * entry)
 void d_rehash(struct dentry * entry)
 {
 	spin_lock(&entry->d_lock);
-	spin_lock(&dcache_hash_lock);
 	_d_rehash(entry);
-	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&entry->d_lock);
 }
 EXPORT_SYMBOL(d_rehash);
@@ -2165,15 +2199,16 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	write_seqcount_begin(&dentry->d_seq);
 	write_seqcount_begin(&target->d_seq);
 
-	/* Move the dentry to the target hash queue, if on different bucket */
-	spin_lock(&dcache_hash_lock);
-	if (!d_unhashed(dentry))
-		hlist_del_rcu(&dentry->d_hash);
+	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
+
+	/*
+	 * Move the dentry to the target hash queue. Don't bother checking
+	 * for the same hash queue because of how unlikely it is.
+	 */
+	__d_drop(dentry);
 	__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
-	spin_unlock(&dcache_hash_lock);
 
 	/* Unhash the target: dput() will then get rid of it */
-	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
 	__d_drop(target);
 
 	list_del(&dentry->d_u.d_child);
@@ -2369,9 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 
 	spin_lock(&actual->d_lock);
 found:
-	spin_lock(&dcache_hash_lock);
 	_d_rehash(actual);
-	spin_unlock(&dcache_hash_lock);
 	spin_unlock(&actual->d_lock);
 	spin_unlock(&dcache_inode_lock);
 out_nolock:
@@ -2953,7 +2986,7 @@ static void __init dcache_init_early(void)
 
 	dentry_hashtable =
 		alloc_large_system_hash("Dentry cache",
-					sizeof(struct hlist_head),
+					sizeof(struct dcache_hash_bucket),
 					dhash_entries,
 					13,
 					HASH_EARLY,
@@ -2962,7 +2995,7 @@ static void __init dcache_init_early(void)
 					0);
 
 	for (loop = 0; loop < (1 << d_hash_shift); loop++)
-		INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+		INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 
 static void __init dcache_init(void)
@@ -2985,7 +3018,7 @@ static void __init dcache_init(void)
 
 	dentry_hashtable =
 		alloc_large_system_hash("Dentry cache",
-					sizeof(struct hlist_head),
+					sizeof(struct dcache_hash_bucket),
 					dhash_entries,
 					13,
 					0,
@@ -2994,7 +3027,7 @@ static void __init dcache_init(void)
 					0);
 
 	for (loop = 0; loop < (1 << d_hash_shift); loop++)
-		INIT_HLIST_HEAD(&dentry_hashtable[loop]);
+		INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
 }
 
 /* SLAB cache for __getname() consumers */
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9a..968ba013011a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/rculist_bl.h>
 #include "internal.h"
 
 
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_LIST_HEAD(&s->s_files);
 #endif
 		INIT_LIST_HEAD(&s->s_instances);
-		INIT_HLIST_HEAD(&s->s_anon);
+		INIT_HLIST_BL_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
 		init_rwsem(&s->s_umount);
-- 
cgit v1.2.2


From 873feea09ebc980cbd3631b767356ce1eee65ec1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:06 +1100
Subject: fs: dcache per-inode inode alias locking

dcache_inode_lock can be replaced with per-inode locking. Use existing
inode->i_lock for this. This is slightly non-trivial because we sometimes
need to find the inode from the dentry, which requires d_inode to be
stabilised (either with refcount or d_lock).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/9p/vfs_inode.c    |  4 +--
 fs/affs/amigaffs.c   |  4 +--
 fs/cifs/inode.c      |  6 ++--
 fs/dcache.c          | 88 ++++++++++++++++++++++++++++------------------------
 fs/exportfs/expfs.c  | 12 ++++---
 fs/nfs/getroot.c     |  4 +--
 fs/notify/fsnotify.c |  4 +--
 fs/ocfs2/dcache.c    |  4 +--
 8 files changed, 67 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index df8bbb358d54..59782981b225 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -277,11 +277,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	/* Directory should have only one entry. */
 	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
 	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 	return dentry;
 }
 
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 600101a21ba2..3a4557e8325c 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 	void *data = dentry->d_fsdata;
 	struct list_head *head, *next;
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	head = &inode->i_dentry;
 	next = head->next;
 	while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 		}
 		next = next->next;
 	}
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 }
 
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2a239d878e85..a853a89857a5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -809,14 +809,14 @@ inode_has_hashed_dentries(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
-			spin_unlock(&dcache_inode_lock);
+			spin_unlock(&inode->i_lock);
 			return true;
 		}
 	}
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 	return false;
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f04e1ba75b7..09ec945f3c98 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -39,8 +39,8 @@
 
 /*
  * Usage:
- * dcache_inode_lock protects:
- *   - i_dentry, d_alias, d_inode
+ * dcache->d_inode->i_lock protects:
+ *   - i_dentry, d_alias, d_inode of aliases
  * dcache_hash_bucket lock protects:
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
@@ -58,7 +58,7 @@
  *   - d_alias, d_inode
  *
  * Ordering:
- * dcache_inode_lock
+ * dentry->d_inode->i_lock
  *   dentry->d_lock
  *     dcache_lru_lock
  *     dcache_hash_bucket lock
@@ -78,12 +78,10 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_inode_lock);
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
-EXPORT_SYMBOL(dcache_inode_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
@@ -196,14 +194,14 @@ static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
  */
 static void dentry_iput(struct dentry * dentry)
 	__releases(dentry->d_lock)
-	__releases(dcache_inode_lock)
+	__releases(dentry->d_inode->i_lock)
 {
 	struct inode *inode = dentry->d_inode;
 	if (inode) {
 		dentry->d_inode = NULL;
 		list_del_init(&dentry->d_alias);
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&inode->i_lock);
 		if (!inode->i_nlink)
 			fsnotify_inoderemove(inode);
 		if (dentry->d_op && dentry->d_op->d_iput)
@@ -212,7 +210,6 @@ static void dentry_iput(struct dentry * dentry)
 			iput(inode);
 	} else {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_inode_lock);
 	}
 }
 
@@ -222,14 +219,14 @@ static void dentry_iput(struct dentry * dentry)
  */
 static void dentry_unlink_inode(struct dentry * dentry)
 	__releases(dentry->d_lock)
-	__releases(dcache_inode_lock)
+	__releases(dentry->d_inode->i_lock)
 {
 	struct inode *inode = dentry->d_inode;
 	dentry->d_inode = NULL;
 	list_del_init(&dentry->d_alias);
 	dentry_rcuwalk_barrier(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 	if (!inode->i_nlink)
 		fsnotify_inoderemove(inode);
 	if (dentry->d_op && dentry->d_op->d_iput)
@@ -295,7 +292,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(dentry->d_lock)
 	__releases(parent->d_lock)
-	__releases(dcache_inode_lock)
+	__releases(dentry->d_inode->i_lock)
 {
 	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
@@ -370,9 +367,11 @@ EXPORT_SYMBOL(d_drop);
 static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
 	__releases(dentry->d_lock)
 {
+	struct inode *inode;
 	struct dentry *parent;
 
-	if (!spin_trylock(&dcache_inode_lock)) {
+	inode = dentry->d_inode;
+	if (inode && !spin_trylock(&inode->i_lock)) {
 relock:
 		spin_unlock(&dentry->d_lock);
 		cpu_relax();
@@ -383,7 +382,8 @@ relock:
 	else
 		parent = dentry->d_parent;
 	if (parent && !spin_trylock(&parent->d_lock)) {
-		spin_unlock(&dcache_inode_lock);
+		if (inode)
+			spin_unlock(&inode->i_lock);
 		goto relock;
 	}
 
@@ -618,9 +618,9 @@ struct dentry *d_find_alias(struct inode *inode)
 	struct dentry *de = NULL;
 
 	if (!list_empty(&inode->i_dentry)) {
-		spin_lock(&dcache_inode_lock);
+		spin_lock(&inode->i_lock);
 		de = __d_find_alias(inode, 0);
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&inode->i_lock);
 	}
 	return de;
 }
@@ -634,20 +634,20 @@ void d_prune_aliases(struct inode *inode)
 {
 	struct dentry *dentry;
 restart:
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_count) {
 			__dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dcache_inode_lock);
+			spin_unlock(&inode->i_lock);
 			dput(dentry);
 			goto restart;
 		}
 		spin_unlock(&dentry->d_lock);
 	}
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
@@ -1392,9 +1392,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 void d_instantiate(struct dentry *entry, struct inode * inode)
 {
 	BUG_ON(!list_empty(&entry->d_alias));
-	spin_lock(&dcache_inode_lock);
+	if (inode)
+		spin_lock(&inode->i_lock);
 	__d_instantiate(entry, inode);
-	spin_unlock(&dcache_inode_lock);
+	if (inode)
+		spin_unlock(&inode->i_lock);
 	security_d_instantiate(entry, inode);
 }
 EXPORT_SYMBOL(d_instantiate);
@@ -1458,9 +1460,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 
 	BUG_ON(!list_empty(&entry->d_alias));
 
-	spin_lock(&dcache_inode_lock);
+	if (inode)
+		spin_lock(&inode->i_lock);
 	result = __d_instantiate_unique(entry, inode);
-	spin_unlock(&dcache_inode_lock);
+	if (inode)
+		spin_unlock(&inode->i_lock);
 
 	if (!result) {
 		security_d_instantiate(entry, inode);
@@ -1542,10 +1546,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	tmp->d_parent = tmp; /* make sure dput doesn't croak */
 
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	res = __d_find_alias(inode, 0);
 	if (res) {
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&inode->i_lock);
 		dput(tmp);
 		goto out_iput;
 	}
@@ -1561,7 +1565,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
 	__bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
 	spin_unlock(&tmp->d_lock);
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 
 	return tmp;
 
@@ -1592,18 +1596,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 	struct dentry *new = NULL;
 
 	if (inode && S_ISDIR(inode->i_mode)) {
-		spin_lock(&dcache_inode_lock);
+		spin_lock(&inode->i_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-			spin_unlock(&dcache_inode_lock);
+			spin_unlock(&inode->i_lock);
 			security_d_instantiate(new, inode);
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-			/* already got dcache_inode_lock, so d_add() by hand */
+			/* already taking inode->i_lock, so d_add() by hand */
 			__d_instantiate(dentry, inode);
-			spin_unlock(&dcache_inode_lock);
+			spin_unlock(&inode->i_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
 		}
@@ -1676,10 +1680,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 * Negative dentry: instantiate it unless the inode is a directory and
 	 * already has a dentry.
 	 */
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
 		__d_instantiate(found, inode);
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&inode->i_lock);
 		security_d_instantiate(found, inode);
 		return found;
 	}
@@ -1690,7 +1694,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 */
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	__dget(new);
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 	security_d_instantiate(found, inode);
 	d_move(new, found);
 	iput(inode);
@@ -2004,15 +2008,17 @@ EXPORT_SYMBOL(d_validate);
  
 void d_delete(struct dentry * dentry)
 {
+	struct inode *inode;
 	int isdir = 0;
 	/*
 	 * Are we the only user?
 	 */
 again:
 	spin_lock(&dentry->d_lock);
-	isdir = S_ISDIR(dentry->d_inode->i_mode);
+	inode = dentry->d_inode;
+	isdir = S_ISDIR(inode->i_mode);
 	if (dentry->d_count == 1) {
-		if (!spin_trylock(&dcache_inode_lock)) {
+		if (inode && !spin_trylock(&inode->i_lock)) {
 			spin_unlock(&dentry->d_lock);
 			cpu_relax();
 			goto again;
@@ -2266,13 +2272,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
  * This helper attempts to cope with remotely renamed directories
  *
  * It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex and the dcache_inode_lock
+ * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
  *
  * Note: If ever the locking in lock_rename() changes, then please
  * remember to update this too...
  */
-static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
-	__releases(dcache_inode_lock)
+static struct dentry *__d_unalias(struct inode *inode,
+		struct dentry *dentry, struct dentry *alias)
 {
 	struct mutex *m1 = NULL, *m2 = NULL;
 	struct dentry *ret;
@@ -2298,7 +2304,7 @@ out_unalias:
 	d_move(alias, dentry);
 	ret = alias;
 out_err:
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 	if (m2)
 		mutex_unlock(m2);
 	if (m1)
@@ -2371,7 +2377,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		goto out_nolock;
 	}
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 
 	if (S_ISDIR(inode->i_mode)) {
 		struct dentry *alias;
@@ -2388,7 +2394,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 				goto found;
 			}
 			/* Nope, but we must(!) avoid directory aliasing */
-			actual = __d_unalias(dentry, alias);
+			actual = __d_unalias(inode, dentry, alias);
 			if (IS_ERR(actual))
 				dput(alias);
 			goto out_nolock;
@@ -2406,7 +2412,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 found:
 	_d_rehash(actual);
 	spin_unlock(&actual->d_lock);
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 out_nolock:
 	if (actual == dentry) {
 		security_d_instantiate(dentry, inode);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index f06a940065f6..4b6825740dd5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
 		void *context)
 {
 	struct dentry *dentry, *toput = NULL;
+	struct inode *inode;
 
 	if (acceptable(context, result))
 		return result;
 
-	spin_lock(&dcache_inode_lock);
-	list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) {
+	inode = result->d_inode;
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		dget(dentry);
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&inode->i_lock);
 		if (toput)
 			dput(toput);
 		if (dentry != result && acceptable(context, dentry)) {
 			dput(result);
 			return dentry;
 		}
-		spin_lock(&dcache_inode_lock);
+		spin_lock(&inode->i_lock);
 		toput = dentry;
 	}
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 
 	if (toput)
 		dput(toput);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index c3a5a1126833..5596c6a2881e 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,11 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 		 * This again causes shrink_dcache_for_umount_subtree() to
 		 * Oops, since the test for IS_ROOT() will fail.
 		 */
-		spin_lock(&dcache_inode_lock);
+		spin_lock(&sb->s_root->d_inode->i_lock);
 		spin_lock(&sb->s_root->d_lock);
 		list_del_init(&sb->s_root->d_alias);
 		spin_unlock(&sb->s_root->d_lock);
-		spin_unlock(&dcache_inode_lock);
+		spin_unlock(&sb->s_root->d_inode->i_lock);
 	}
 	return 0;
 }
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9be6ec1f36d8..79b47cbb5cd8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	/* determine if the children should tell inode about their events */
 	watched = fsnotify_inode_watches_children(inode);
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	/* run all of the dentries associated with this inode.  Since this is a
 	 * directory, there damn well better only be one item on this list */
 	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -82,7 +82,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 		}
 		spin_unlock(&alias->d_lock);
 	}
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 }
 
 /* Notify this dentry's parent about a child's events. */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0310b16a7238..6d80ecc7834f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -175,7 +175,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 	struct list_head *p;
 	struct dentry *dentry = NULL;
 
-	spin_lock(&dcache_inode_lock);
+	spin_lock(&inode->i_lock);
 	list_for_each(p, &inode->i_dentry) {
 		dentry = list_entry(p, struct dentry, d_alias);
 
@@ -193,7 +193,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 		dentry = NULL;
 	}
 
-	spin_unlock(&dcache_inode_lock);
+	spin_unlock(&inode->i_lock);
 
 	return dentry;
 }
-- 
cgit v1.2.2


From 4b936885ab04dc6e0bb0ef35e0e23c1a7364d9e5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:07 +1100
Subject: fs: improve scalability of pseudo filesystems

Regardless of how much we possibly try to scale dcache, there is likely
always going to be some fundamental contention when adding or removing children
under the same parent. Pseudo filesystems do not seem need to have connected
dentries because by definition they are disconnected.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/anon_inodes.c |  2 +-
 fs/dcache.c      | 12 ++++++++++++
 fs/pipe.c        |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index aca8806fa206..9d92b33da8a0 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -102,7 +102,7 @@ struct file *anon_inode_getfile(const char *name,
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = 0;
-	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
 	if (!path.dentry)
 		goto err_module;
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 09ec945f3c98..9e6e6db76869 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1330,6 +1330,18 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 }
 EXPORT_SYMBOL(d_alloc);
 
+struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
+{
+	struct dentry *dentry = d_alloc(NULL, name);
+	if (dentry) {
+		dentry->d_sb = sb;
+		dentry->d_parent = dentry;
+		dentry->d_flags |= DCACHE_DISCONNECTED;
+	}
+	return dentry;
+}
+EXPORT_SYMBOL(d_alloc_pseudo);
+
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
 	struct qstr q;
diff --git a/fs/pipe.c b/fs/pipe.c
index 01a786567810..cfe3a7f2ee21 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -999,7 +999,7 @@ struct file *create_write_pipe(int flags)
 		goto err;
 
 	err = -ENOMEM;
-	path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
+	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
 	if (!path.dentry)
 		goto err_inode;
 	path.mnt = mntget(pipe_mnt);
-- 
cgit v1.2.2


From e1bb57826381199cc79fbf44e9dfeee58fc7b339 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:08 +1100
Subject: fs: prefetch inode data in dcache lookup

This makes single threaded git diff -1.25% +/- 0.05% elapsed time on my
2s12c24t Westmere system, and -0.86% +/- 0.05% on my 2s8c Barcelona, by
prefetching the important first cacheline of the inode in while we do the
actual name compare and other operations on the dentry.

There was no measurable slowdown in the single file stat case, or the creat
case (where negative dentries would be common).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9e6e6db76869..2a4ce7dc2306 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1793,6 +1793,9 @@ seqretry:
 		tlen = dentry->d_name.len;
 		tname = dentry->d_name.name;
 		i = dentry->d_inode;
+		prefetch(tname);
+		if (i)
+			prefetch(i);
 		/*
 		 * This seqcount check is required to ensure name and
 		 * len are loaded atomically, so as not to walk off the
-- 
cgit v1.2.2


From 9d55c369bb5e695e629bc35cba2ef607755b3bee Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:09 +1100
Subject: fs: implement faster dentry memcmp

The standard memcmp function on a Westmere system shows up hot in
profiles in the `git diff` workload (both parallel and single threaded),
and it is likely due to the costs associated with trapping into
microcode, and little opportunity to improve memory access (dentry
name is not likely to take up more than a cacheline).

So replace it with an open-coded byte comparison. This increases code
size by 8 bytes in the critical __d_lookup_rcu function, but the
speedup is huge, averaging 10 runs of each:

git diff st   user   sys   elapsed  CPU
before        1.15   2.57  3.82      97.1
after         1.14   2.35  3.61      96.8

git diff mt   user   sys   elapsed  CPU
before        1.27   3.85  1.46     349
after         1.26   3.54  1.43     333

Elapsed time for single threaded git diff at 95.0% confidence:
        -0.21  +/- 0.01
        -5.45% +/- 0.24%

It's -0.66% +/- 0.06% elapsed time on my Opteron, so rep cmp costs on the
fam10h seem to be relatively smaller, but there is still a win.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/dcache.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2a4ce7dc2306..5699d4c027cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1454,9 +1454,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 			continue;
 		if (alias->d_parent != entry->d_parent)
 			continue;
-		if (qstr->len != len)
-			continue;
-		if (memcmp(qstr->name, name, len))
+		if (dentry_cmp(qstr->name, qstr->len, name, len))
 			continue;
 		__dget(alias);
 		return alias;
@@ -1810,9 +1808,7 @@ seqretry:
 						tlen, tname, name))
 				continue;
 		} else {
-			if (tlen != len)
-				continue;
-			if (memcmp(tname, str, tlen))
+			if (dentry_cmp(tname, tlen, str, len))
 				continue;
 		}
 		/*
@@ -1925,9 +1921,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 						tlen, tname, name))
 				goto next;
 		} else {
-			if (tlen != len)
-				goto next;
-			if (memcmp(tname, str, tlen))
+			if (dentry_cmp(tname, tlen, str, len))
 				goto next;
 		}
 
-- 
cgit v1.2.2


From c6653a838b1b2738561aff0b8c0f62a9b714bdd9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:10 +1100
Subject: fs: rename vfsmount counter helpers

Suggested by Andreas, mnt_ prefix is clearer namespace, follows kernel
conventions better, and is easier for tab complete. I introduced these
names so I'll admit they were not good choices.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/namespace.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 39a7d507fba0..03b82350f020 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -216,7 +216,7 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-static inline void inc_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_inc_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
 	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
@@ -225,7 +225,7 @@ static inline void inc_mnt_writers(struct vfsmount *mnt)
 #endif
 }
 
-static inline void dec_mnt_writers(struct vfsmount *mnt)
+static inline void mnt_dec_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
 	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
@@ -234,7 +234,7 @@ static inline void dec_mnt_writers(struct vfsmount *mnt)
 #endif
 }
 
-static unsigned int count_mnt_writers(struct vfsmount *mnt)
+static unsigned int mnt_get_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
 	unsigned int count = 0;
@@ -273,9 +273,9 @@ int mnt_want_write(struct vfsmount *mnt)
 	int ret = 0;
 
 	preempt_disable();
-	inc_mnt_writers(mnt);
+	mnt_inc_writers(mnt);
 	/*
-	 * The store to inc_mnt_writers must be visible before we pass
+	 * The store to mnt_inc_writers must be visible before we pass
 	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
 	 * incremented count after it has set MNT_WRITE_HOLD.
 	 */
@@ -289,7 +289,7 @@ int mnt_want_write(struct vfsmount *mnt)
 	 */
 	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
-		dec_mnt_writers(mnt);
+		mnt_dec_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
@@ -317,7 +317,7 @@ int mnt_clone_write(struct vfsmount *mnt)
 	if (__mnt_is_readonly(mnt))
 		return -EROFS;
 	preempt_disable();
-	inc_mnt_writers(mnt);
+	mnt_inc_writers(mnt);
 	preempt_enable();
 	return 0;
 }
@@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
 void mnt_drop_write(struct vfsmount *mnt)
 {
 	preempt_disable();
-	dec_mnt_writers(mnt);
+	mnt_dec_writers(mnt);
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -384,7 +384,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
 	 * we're counting up here.
 	 */
-	if (count_mnt_writers(mnt) > 0)
+	if (mnt_get_writers(mnt) > 0)
 		ret = -EBUSY;
 	else
 		mnt->mnt_flags |= MNT_READONLY;
@@ -663,9 +663,9 @@ static inline void __mntput(struct vfsmount *mnt)
 	 */
 	/*
 	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
-	 * provides barriers, so count_mnt_writers() below is safe.  AV
+	 * provides barriers, so mnt_get_writers() below is safe.  AV
 	 */
-	WARN_ON(count_mnt_writers(mnt));
+	WARN_ON(mnt_get_writers(mnt));
 	fsnotify_vfsmount_delete(mnt);
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
-- 
cgit v1.2.2


From b3e19d924b6eaf2ca7d22cba99a517c5171007b6 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Fri, 7 Jan 2011 17:50:11 +1100
Subject: fs: scale mntget/mntput

The problem that this patch aims to fix is vfsmount refcounting scalability.
We need to take a reference on the vfsmount for every successful path lookup,
which often go to the same mount point.

The fundamental difficulty is that a "simple" reference count can never be made
scalable, because any time a reference is dropped, we must check whether that
was the last reference. To do that requires communication with all other CPUs
that may have taken a reference count.

We can make refcounts more scalable in a couple of ways, involving keeping
distributed counters, and checking for the global-zero condition less
frequently.

- check the global sum once every interval (this will delay zero detection
  for some interval, so it's probably a showstopper for vfsmounts).

- keep a local count and only taking the global sum when local reaches 0 (this
  is difficult for vfsmounts, because we can't hold preempt off for the life of
  a reference, so a counter would need to be per-thread or tied strongly to a
  particular CPU which requires more locking).

- keep a local difference of increments and decrements, which allows us to sum
  the total difference and hence find the refcount when summing all CPUs. Then,
  keep a single integer "long" refcount for slow and long lasting references,
  and only take the global sum of local counters when the long refcount is 0.

This last scheme is what I implemented here. Attached mounts and process root
and working directory references are "long" references, and everything else is
a short reference.

This allows scalable vfsmount references during path walking over mounted
subtrees and unattached (lazy umounted) mounts with processes still running
in them.

This results in one fewer atomic op in the fastpath: mntget is now just a
per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock
and non-atomic decrement in the common case. However code is otherwise bigger
and heavier, so single threaded performance is basically a wash.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
---
 fs/anon_inodes.c |   2 +-
 fs/fs_struct.c   |  26 +++---
 fs/internal.h    |   1 +
 fs/namei.c       |  24 ++++++
 fs/namespace.c   | 242 +++++++++++++++++++++++++++++++++++++++++++++----------
 fs/pipe.c        |   2 +-
 fs/pnode.c       |   4 +-
 fs/super.c       |   2 +-
 8 files changed, 245 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9d92b33da8a0..5fd38112a6ca 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
-	mntput(anon_inode_mnt);
+	mntput_long(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
 err_exit:
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 60b8531f41c5..68ca487bedb1 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,11 +17,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	path_get(path);
+	path_get_long(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 	if (old_root.dentry)
-		path_put(&old_root);
+		path_put_long(&old_root);
 }
 
 /*
@@ -36,12 +36,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 	write_seqcount_begin(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	path_get(path);
+	path_get_long(path);
 	write_seqcount_end(&fs->seq);
 	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
-		path_put(&old_pwd);
+		path_put_long(&old_pwd);
 }
 
 void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +59,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 			write_seqcount_begin(&fs->seq);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
-				path_get(new_root);
+				path_get_long(new_root);
 				fs->root = *new_root;
 				count++;
 			}
 			if (fs->pwd.dentry == old_root->dentry
 			    && fs->pwd.mnt == old_root->mnt) {
-				path_get(new_root);
+				path_get_long(new_root);
 				fs->pwd = *new_root;
 				count++;
 			}
@@ -76,13 +76,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 	while (count--)
-		path_put(old_root);
+		path_put_long(old_root);
 }
 
 void free_fs_struct(struct fs_struct *fs)
 {
-	path_put(&fs->root);
-	path_put(&fs->pwd);
+	path_put_long(&fs->root);
+	path_put_long(&fs->pwd);
 	kmem_cache_free(fs_cachep, fs);
 }
 
@@ -115,7 +115,13 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 		spin_lock_init(&fs->lock);
 		seqcount_init(&fs->seq);
 		fs->umask = old->umask;
-		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
+
+		spin_lock(&old->lock);
+		fs->root = old->root;
+		path_get_long(&fs->root);
+		fs->pwd = old->pwd;
+		path_get_long(&fs->pwd);
+		spin_unlock(&old->lock);
 	}
 	return fs;
 }
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4e..9687c2ee2735 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
 
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
+extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				struct vfsmount *);
diff --git a/fs/namei.c b/fs/namei.c
index 4e957bf744ae..19433cdba011 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -367,6 +367,18 @@ void path_get(struct path *path)
 }
 EXPORT_SYMBOL(path_get);
 
+/**
+ * path_get_long - get a long reference to a path
+ * @path: path to get the reference to
+ *
+ * Given a path increment the reference count to the dentry and the vfsmount.
+ */
+void path_get_long(struct path *path)
+{
+	mntget_long(path->mnt);
+	dget(path->dentry);
+}
+
 /**
  * path_put - put a reference to a path
  * @path: path to put the reference to
@@ -380,6 +392,18 @@ void path_put(struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
+/**
+ * path_put_long - put a long reference to a path
+ * @path: path to put the reference to
+ *
+ * Given a path decrement the reference count to the dentry and the vfsmount.
+ */
+void path_put_long(struct path *path)
+{
+	dput(path->dentry);
+	mntput_long(path->mnt);
+}
+
 /**
  * nameidata_drop_rcu - drop this nameidata out of rcu-walk
  * @nd: nameidata pathwalk data to drop
diff --git a/fs/namespace.c b/fs/namespace.c
index 03b82350f020..3ddfd9046c44 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -138,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
 	mnt->mnt_group_id = 0;
 }
 
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_add_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
+#else
+	preempt_disable();
+	mnt->mnt_count += n;
+	preempt_enable();
+#endif
+}
+
+static inline void mnt_set_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+	this_cpu_write(mnt->mnt_pcp->mnt_count, n);
+#else
+	mnt->mnt_count = n;
+#endif
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_inc_count(struct vfsmount *mnt)
+{
+	mnt_add_count(mnt, 1);
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void mnt_dec_count(struct vfsmount *mnt)
+{
+	mnt_add_count(mnt, -1);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+unsigned int mnt_get_count(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	unsigned int count = atomic_read(&mnt->mnt_longrefs);
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
+	}
+
+	return count;
+#else
+	return mnt->mnt_count;
+#endif
+}
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -154,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 				goto out_free_id;
 		}
 
-		atomic_set(&mnt->mnt_count, 1);
+#ifdef CONFIG_SMP
+		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+		if (!mnt->mnt_pcp)
+			goto out_free_devname;
+
+		atomic_set(&mnt->mnt_longrefs, 1);
+#else
+		mnt->mnt_count = 1;
+		mnt->mnt_writers = 0;
+#endif
+
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -165,13 +233,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 #ifdef CONFIG_FSNOTIFY
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
-#endif
-#ifdef CONFIG_SMP
-		mnt->mnt_writers = alloc_percpu(int);
-		if (!mnt->mnt_writers)
-			goto out_free_devname;
-#else
-		mnt->mnt_writers = 0;
 #endif
 	}
 	return mnt;
@@ -219,7 +280,7 @@ EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 static inline void mnt_inc_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
 #else
 	mnt->mnt_writers++;
 #endif
@@ -228,7 +289,7 @@ static inline void mnt_inc_writers(struct vfsmount *mnt)
 static inline void mnt_dec_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
-	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
 #else
 	mnt->mnt_writers--;
 #endif
@@ -241,7 +302,7 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
+		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
 	}
 
 	return count;
@@ -418,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_writers);
+	free_percpu(mnt->mnt_pcp);
 #endif
 	kmem_cache_free(mnt_cache, mnt);
 }
@@ -652,9 +713,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 	return NULL;
 }
 
-static inline void __mntput(struct vfsmount *mnt)
+static inline void mntfree(struct vfsmount *mnt)
 {
 	struct super_block *sb = mnt->mnt_sb;
+
 	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
@@ -662,8 +724,8 @@ static inline void __mntput(struct vfsmount *mnt)
 	 * to make r/w->r/o transitions.
 	 */
 	/*
-	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
-	 * provides barriers, so mnt_get_writers() below is safe.  AV
+	 * The locking used to deal with mnt_count decrement provides barriers,
+	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
 	fsnotify_vfsmount_delete(mnt);
@@ -672,28 +734,113 @@ static inline void __mntput(struct vfsmount *mnt)
 	deactivate_super(sb);
 }
 
-void mntput_no_expire(struct vfsmount *mnt)
-{
-repeat:
-	if (atomic_add_unless(&mnt->mnt_count, -1, 1))
-		return;
+#ifdef CONFIG_SMP
+static inline void __mntput(struct vfsmount *mnt, int longrefs)
+{
+	if (!longrefs) {
+put_again:
+		br_read_lock(vfsmount_lock);
+		if (likely(atomic_read(&mnt->mnt_longrefs))) {
+			mnt_dec_count(mnt);
+			br_read_unlock(vfsmount_lock);
+			return;
+		}
+		br_read_unlock(vfsmount_lock);
+	} else {
+		BUG_ON(!atomic_read(&mnt->mnt_longrefs));
+		if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
+			return;
+	}
+
 	br_write_lock(vfsmount_lock);
-	if (!atomic_dec_and_test(&mnt->mnt_count)) {
+	if (!longrefs)
+		mnt_dec_count(mnt);
+	else
+		atomic_dec(&mnt->mnt_longrefs);
+	if (mnt_get_count(mnt)) {
 		br_write_unlock(vfsmount_lock);
 		return;
 	}
-	if (likely(!mnt->mnt_pinned)) {
+	if (unlikely(mnt->mnt_pinned)) {
+		mnt_add_count(mnt, mnt->mnt_pinned + 1);
+		mnt->mnt_pinned = 0;
 		br_write_unlock(vfsmount_lock);
-		__mntput(mnt);
+		acct_auto_close_mnt(mnt);
+		goto put_again;
+	}
+	br_write_unlock(vfsmount_lock);
+	mntfree(mnt);
+}
+#else
+static inline void __mntput(struct vfsmount *mnt, int longrefs)
+{
+put_again:
+	mnt_dec_count(mnt);
+	if (likely(mnt_get_count(mnt)))
 		return;
+	br_write_lock(vfsmount_lock);
+	if (unlikely(mnt->mnt_pinned)) {
+		mnt_add_count(mnt, mnt->mnt_pinned + 1);
+		mnt->mnt_pinned = 0;
+		br_write_unlock(vfsmount_lock);
+		acct_auto_close_mnt(mnt);
+		goto put_again;
 	}
-	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-	mnt->mnt_pinned = 0;
 	br_write_unlock(vfsmount_lock);
-	acct_auto_close_mnt(mnt);
-	goto repeat;
+	mntfree(mnt);
+}
+#endif
+
+static void mntput_no_expire(struct vfsmount *mnt)
+{
+	__mntput(mnt, 0);
+}
+
+void mntput(struct vfsmount *mnt)
+{
+	if (mnt) {
+		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		if (unlikely(mnt->mnt_expiry_mark))
+			mnt->mnt_expiry_mark = 0;
+		__mntput(mnt, 0);
+	}
+}
+EXPORT_SYMBOL(mntput);
+
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+	if (mnt)
+		mnt_inc_count(mnt);
+	return mnt;
+}
+EXPORT_SYMBOL(mntget);
+
+void mntput_long(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	if (mnt) {
+		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		if (unlikely(mnt->mnt_expiry_mark))
+			mnt->mnt_expiry_mark = 0;
+		__mntput(mnt, 1);
+	}
+#else
+	mntput(mnt);
+#endif
 }
-EXPORT_SYMBOL(mntput_no_expire);
+EXPORT_SYMBOL(mntput_long);
+
+struct vfsmount *mntget_long(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	if (mnt)
+		atomic_inc(&mnt->mnt_longrefs);
+	return mnt;
+#else
+	return mntget(mnt);
+#endif
+}
+EXPORT_SYMBOL(mntget_long);
 
 void mnt_pin(struct vfsmount *mnt)
 {
@@ -701,19 +848,17 @@ void mnt_pin(struct vfsmount *mnt)
 	mnt->mnt_pinned++;
 	br_write_unlock(vfsmount_lock);
 }
-
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
 	br_write_lock(vfsmount_lock);
 	if (mnt->mnt_pinned) {
-		atomic_inc(&mnt->mnt_count);
+		mnt_inc_count(mnt);
 		mnt->mnt_pinned--;
 	}
 	br_write_unlock(vfsmount_lock);
 }
-
 EXPORT_SYMBOL(mnt_unpin);
 
 static inline void mangle(struct seq_file *m, const char *s)
@@ -1008,12 +1153,13 @@ int may_umount_tree(struct vfsmount *mnt)
 	int minimum_refs = 0;
 	struct vfsmount *p;
 
-	br_read_lock(vfsmount_lock);
+	/* write lock needed for mnt_get_count */
+	br_write_lock(vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		actual_refs += atomic_read(&p->mnt_count);
+		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1040,10 +1186,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_read_lock(vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (propagate_mount_busy(mnt, 2))
 		ret = 0;
-	br_read_unlock(vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1070,7 +1216,7 @@ void release_mounts(struct list_head *head)
 			dput(dentry);
 			mntput(m);
 		}
-		mntput(mnt);
+		mntput_long(mnt);
 	}
 }
 
@@ -1125,8 +1271,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		    flags & (MNT_FORCE | MNT_DETACH))
 			return -EINVAL;
 
-		if (atomic_read(&mnt->mnt_count) != 2)
+		/*
+		 * probably don't strictly need the lock here if we examined
+		 * all race cases, but it's a slowpath.
+		 */
+		br_write_lock(vfsmount_lock);
+		if (mnt_get_count(mnt) != 2) {
+			br_write_lock(vfsmount_lock);
 			return -EBUSY;
+		}
+		br_write_unlock(vfsmount_lock);
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -1815,7 +1969,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 
 unlock:
 	up_write(&namespace_sem);
-	mntput(newmnt);
+	mntput_long(newmnt);
 	return err;
 }
 
@@ -2148,11 +2302,11 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		if (fs) {
 			if (p == fs->root.mnt) {
 				rootmnt = p;
-				fs->root.mnt = mntget(q);
+				fs->root.mnt = mntget_long(q);
 			}
 			if (p == fs->pwd.mnt) {
 				pwdmnt = p;
-				fs->pwd.mnt = mntget(q);
+				fs->pwd.mnt = mntget_long(q);
 			}
 		}
 		p = next_mnt(p, mnt_ns->root);
@@ -2161,9 +2315,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	up_write(&namespace_sem);
 
 	if (rootmnt)
-		mntput(rootmnt);
+		mntput_long(rootmnt);
 	if (pwdmnt)
-		mntput(pwdmnt);
+		mntput_long(pwdmnt);
 
 	return new_ns;
 }
@@ -2350,6 +2504,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
+
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
@@ -2376,6 +2531,7 @@ static void __init init_mount_tree(void)
 	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
+
 	ns = create_mnt_ns(mnt);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
diff --git a/fs/pipe.c b/fs/pipe.c
index cfe3a7f2ee21..68f1f8e4e23b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
 static void __exit exit_pipe_fs(void)
 {
 	unregister_filesystem(&pipe_fs_type);
-	mntput(pipe_mnt);
+	mntput_long(pipe_mnt);
 }
 
 fs_initcall(init_pipe_fs);
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748f..d42514e32380 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
  */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-	int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
+	int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
 	return (mycount > count);
 }
 
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
  *
- * vfsmount lock must be held for read or write
+ * vfsmount lock must be held for write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
diff --git a/fs/super.c b/fs/super.c
index 968ba013011a..823e061faa87 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1140,7 +1140,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 	return mnt;
 
  err:
-	mntput(mnt);
+	mntput_long(mnt);
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.2