Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 5
-rw-r--r--  fs/9p/acl.h | 2
-rw-r--r--  fs/9p/vfs_dentry.c | 4
-rw-r--r--  fs/9p/vfs_inode.c | 39
-rw-r--r--  fs/adfs/dir.c | 13
-rw-r--r--  fs/adfs/super.c | 11
-rw-r--r--  fs/affs/amigaffs.c | 4
-rw-r--r--  fs/affs/namei.c | 68
-rw-r--r--  fs/affs/super.c | 11
-rw-r--r--  fs/afs/dir.c | 10
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/afs/security.c | 7
-rw-r--r--  fs/afs/super.c | 10
-rw-r--r--  fs/anon_inodes.c | 6
-rw-r--r--  fs/autofs4/autofs_i.h | 21
-rw-r--r--  fs/autofs4/expire.c | 141
-rw-r--r--  fs/autofs4/inode.c | 2
-rw-r--r--  fs/autofs4/root.c | 103
-rw-r--r--  fs/autofs4/waitq.c | 23
-rw-r--r--  fs/bad_inode.c | 5
-rw-r--r--  fs/befs/linuxvfs.c | 10
-rw-r--r--  fs/bfs/inode.c | 9
-rw-r--r--  fs/bio.c | 23
-rw-r--r--  fs/block_dev.c | 10
-rw-r--r--  fs/btrfs/acl.c | 21
-rw-r--r--  fs/btrfs/compression.c | 15
-rw-r--r--  fs/btrfs/ctree.h | 8
-rw-r--r--  fs/btrfs/disk-io.c | 41
-rw-r--r--  fs/btrfs/export.c | 82
-rw-r--r--  fs/btrfs/extent-tree.c | 77
-rw-r--r--  fs/btrfs/extent_io.c | 77
-rw-r--r--  fs/btrfs/extent_io.h | 3
-rw-r--r--  fs/btrfs/file.c | 99
-rw-r--r--  fs/btrfs/free-space-cache.c | 12
-rw-r--r--  fs/btrfs/inode.c | 316
-rw-r--r--  fs/btrfs/ioctl.c | 87
-rw-r--r--  fs/btrfs/ioctl.h | 14
-rw-r--r--  fs/btrfs/ordered-data.c | 67
-rw-r--r--  fs/btrfs/ordered-data.h | 3
-rw-r--r--  fs/btrfs/orphan.c | 6
-rw-r--r--  fs/btrfs/super.c | 43
-rw-r--r--  fs/btrfs/transaction.c | 5
-rw-r--r--  fs/btrfs/tree-log.c | 21
-rw-r--r--  fs/btrfs/volumes.c | 20
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/buffer.c | 37
-rw-r--r--  fs/ceph/addr.c | 6
-rw-r--r--  fs/ceph/caps.c | 17
-rw-r--r--  fs/ceph/dir.c | 51
-rw-r--r--  fs/ceph/file.c | 65
-rw-r--r--  fs/ceph/inode.c | 88
-rw-r--r--  fs/ceph/ioctl.h | 2
-rw-r--r--  fs/ceph/locks.c | 94
-rw-r--r--  fs/ceph/mds_client.c | 51
-rw-r--r--  fs/ceph/mds_client.h | 33
-rw-r--r--  fs/ceph/super.h | 6
-rw-r--r--  fs/cifs/Kconfig | 8
-rw-r--r--  fs/cifs/Makefile | 4
-rw-r--r--  fs/cifs/README | 9
-rw-r--r--  fs/cifs/TODO | 2
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 7
-rw-r--r--  fs/cifs/cifsacl.c | 51
-rw-r--r--  fs/cifs/cifsacl.h | 4
-rw-r--r--  fs/cifs/cifsfs.c | 26
-rw-r--r--  fs/cifs/cifsglob.h | 15
-rw-r--r--  fs/cifs/cifsproto.h | 12
-rw-r--r--  fs/cifs/cifssmb.c | 183
-rw-r--r--  fs/cifs/connect.c | 242
-rw-r--r--  fs/cifs/dir.c | 77
-rw-r--r--  fs/cifs/dns_resolve.c | 2
-rw-r--r--  fs/cifs/file.c | 78
-rw-r--r--  fs/cifs/fscache.c | 12
-rw-r--r--  fs/cifs/inode.c | 72
-rw-r--r--  fs/cifs/ioctl.c | 16
-rw-r--r--  fs/cifs/link.c | 4
-rw-r--r--  fs/cifs/misc.c | 25
-rw-r--r--  fs/cifs/readdir.c | 47
-rw-r--r--  fs/cifs/xattr.c | 55
-rw-r--r--  fs/coda/cache.c | 4
-rw-r--r--  fs/coda/dir.c | 20
-rw-r--r--  fs/coda/inode.c | 9
-rw-r--r--  fs/coda/pioctl.c | 6
-rw-r--r--  fs/compat.c | 28
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 4
-rw-r--r--  fs/configfs/dir.c | 24
-rw-r--r--  fs/configfs/inode.c | 8
-rw-r--r--  fs/dcache.c | 1375
-rw-r--r--  fs/ecryptfs/dentry.c | 9
-rw-r--r--  fs/ecryptfs/inode.c | 12
-rw-r--r--  fs/ecryptfs/main.c | 4
-rw-r--r--  fs/ecryptfs/super.c | 13
-rw-r--r--  fs/efs/super.c | 9
-rw-r--r--  fs/exec.c | 41
-rw-r--r--  fs/exofs/super.c | 9
-rw-r--r--  fs/exportfs/expfs.c | 14
-rw-r--r--  fs/ext2/acl.c | 11
-rw-r--r--  fs/ext2/acl.h | 2
-rw-r--r--  fs/ext2/super.c | 9
-rw-r--r--  fs/ext3/acl.c | 11
-rw-r--r--  fs/ext3/acl.h | 2
-rw-r--r--  fs/ext3/super.c | 10
-rw-r--r--  fs/ext4/acl.c | 11
-rw-r--r--  fs/ext4/acl.h | 2
-rw-r--r--  fs/ext4/ext4.h | 5
-rw-r--r--  fs/ext4/inode.c | 8
-rw-r--r--  fs/ext4/ioctl.c | 24
-rw-r--r--  fs/ext4/mballoc.c | 2
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/page-io.c | 99
-rw-r--r--  fs/ext4/resize.c | 5
-rw-r--r--  fs/ext4/super.c | 134
-rw-r--r--  fs/fat/inode.c | 13
-rw-r--r--  fs/fat/namei_msdos.c | 23
-rw-r--r--  fs/fat/namei_vfat.c | 55
-rw-r--r--  fs/filesystems.c | 3
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 9
-rw-r--r--  fs/fs_struct.c | 36
-rw-r--r--  fs/fuse/dir.c | 18
-rw-r--r--  fs/fuse/file.c | 82
-rw-r--r--  fs/fuse/inode.c | 13
-rw-r--r--  fs/generic_acl.c | 20
-rw-r--r--  fs/gfs2/acl.c | 5
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/bmap.c | 11
-rw-r--r--  fs/gfs2/dentry.c | 22
-rw-r--r--  fs/gfs2/export.c | 50
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/gfs2/glock.c | 92
-rw-r--r--  fs/gfs2/glock.h | 28
-rw-r--r--  fs/gfs2/glops.c | 1
-rw-r--r--  fs/gfs2/incore.h | 12
-rw-r--r--  fs/gfs2/inode.c | 165
-rw-r--r--  fs/gfs2/inode.h | 6
-rw-r--r--  fs/gfs2/lock_dlm.c | 15
-rw-r--r--  fs/gfs2/ops_fstype.c | 2
-rw-r--r--  fs/gfs2/ops_inode.c | 38
-rw-r--r--  fs/gfs2/quota.c | 28
-rw-r--r--  fs/gfs2/rgrp.c | 146
-rw-r--r--  fs/gfs2/rgrp.h | 1
-rw-r--r--  fs/gfs2/super.c | 9
-rw-r--r--  fs/gfs2/xattr.c | 23
-rw-r--r--  fs/hfs/dir.c | 2
-rw-r--r--  fs/hfs/hfs_fs.h | 8
-rw-r--r--  fs/hfs/string.c | 17
-rw-r--r--  fs/hfs/super.c | 11
-rw-r--r--  fs/hfs/sysdep.c | 7
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 9
-rw-r--r--  fs/hfsplus/super.c | 12
-rw-r--r--  fs/hfsplus/unicode.c | 19
-rw-r--r--  fs/hostfs/hostfs_kern.c | 44
-rw-r--r--  fs/hpfs/dentry.c | 27
-rw-r--r--  fs/hpfs/namei.c | 2
-rw-r--r--  fs/hpfs/super.c | 9
-rw-r--r--  fs/hppfs/hppfs.c | 9
-rw-r--r--  fs/hugetlbfs/inode.c | 12
-rw-r--r--  fs/inode.c | 50
-rw-r--r--  fs/internal.h | 1
-rw-r--r--  fs/ioctl.c | 40
-rw-r--r--  fs/ioprio.c | 13
-rw-r--r--  fs/isofs/inode.c | 131
-rw-r--r--  fs/isofs/namei.c | 5
-rw-r--r--  fs/jbd2/journal.c | 16
-rw-r--r--  fs/jffs2/acl.c | 5
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/super.c | 9
-rw-r--r--  fs/jfs/acl.c | 8
-rw-r--r--  fs/jfs/jfs_acl.h | 2
-rw-r--r--  fs/jfs/namei.c | 63
-rw-r--r--  fs/jfs/super.c | 12
-rw-r--r--  fs/libfs.c | 63
-rw-r--r--  fs/lockd/clntlock.c | 1
-rw-r--r--  fs/lockd/clntproc.c | 1
-rw-r--r--  fs/lockd/host.c | 11
-rw-r--r--  fs/lockd/svc4proc.c | 1
-rw-r--r--  fs/lockd/svclock.c | 1
-rw-r--r--  fs/lockd/svcproc.c | 1
-rw-r--r--  fs/locks.c | 22
-rw-r--r--  fs/logfs/dir.c | 6
-rw-r--r--  fs/logfs/inode.c | 9
-rw-r--r--  fs/logfs/journal.c | 2
-rw-r--r--  fs/logfs/readwrite.c | 3
-rw-r--r--  fs/minix/inode.c | 9
-rw-r--r--  fs/minix/namei.c | 2
-rw-r--r--  fs/namei.c | 860
-rw-r--r--  fs/namespace.c | 292
-rw-r--r--  fs/ncpfs/dir.c | 89
-rw-r--r--  fs/ncpfs/file.c | 1
-rw-r--r--  fs/ncpfs/inode.c | 20
-rw-r--r--  fs/ncpfs/ioctl.c | 1
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 16
-rw-r--r--  fs/nfs/callback.c | 1
-rw-r--r--  fs/nfs/delegation.c | 1
-rw-r--r--  fs/nfs/dir.c | 252
-rw-r--r--  fs/nfs/direct.c | 2
-rw-r--r--  fs/nfs/file.c | 2
-rw-r--r--  fs/nfs/getroot.c | 10
-rw-r--r--  fs/nfs/inode.c | 10
-rw-r--r--  fs/nfs/internal.h | 9
-rw-r--r--  fs/nfs/mount_clnt.c | 4
-rw-r--r--  fs/nfs/namespace.c | 17
-rw-r--r--  fs/nfs/nfs2xdr.c | 8
-rw-r--r--  fs/nfs/nfs3xdr.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 13
-rw-r--r--  fs/nfs/nfs4xdr.c | 8
-rw-r--r--  fs/nfs/pagelist.c | 4
-rw-r--r--  fs/nfs/read.c | 1
-rw-r--r--  fs/nfs/super.c | 13
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nfs/write.c | 3
-rw-r--r--  fs/nfsd/nfs3xdr.c | 6
-rw-r--r--  fs/nfsd/nfs4state.c | 26
-rw-r--r--  fs/nfsd/vfs.c | 5
-rw-r--r--  fs/nfsd/xdr4.h | 21
-rw-r--r--  fs/nilfs2/dat.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 9
-rw-r--r--  fs/nilfs2/inode.c | 10
-rw-r--r--  fs/nilfs2/ioctl.c | 16
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/super.c | 12
-rw-r--r--  fs/notify/fanotify/fanotify.c | 6
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 81
-rw-r--r--  fs/notify/fsnotify.c | 8
-rw-r--r--  fs/notify/inotify/inotify_user.c | 1
-rw-r--r--  fs/ntfs/inode.c | 9
-rw-r--r--  fs/ocfs2/acl.c | 8
-rw-r--r--  fs/ocfs2/acl.h | 2
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/aops.h | 23
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 17
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 15
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 4
-rw-r--r--  fs/ocfs2/dcache.c | 21
-rw-r--r--  fs/ocfs2/dir.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 40
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 9
-rw-r--r--  fs/ocfs2/export.c | 4
-rw-r--r--  fs/ocfs2/file.c | 22
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/namei.c | 10
-rw-r--r--  fs/ocfs2/ocfs2.h | 6
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 2
-rw-r--r--  fs/ocfs2/super.c | 10
-rw-r--r--  fs/openpromfs/inode.c | 11
-rw-r--r--  fs/pipe.c | 26
-rw-r--r--  fs/pnode.c | 4
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/base.c | 134
-rw-r--r--  fs/proc/generic.c | 4
-rw-r--r--  fs/proc/inode.c | 10
-rw-r--r--  fs/proc/proc_console.c | 114
-rw-r--r--  fs/proc/proc_sysctl.c | 31
-rw-r--r--  fs/proc/task_mmu.c | 3
-rw-r--r--  fs/proc/vmcore.c | 2
-rw-r--r--  fs/qnx4/inode.c | 9
-rw-r--r--  fs/read_write.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/ioctl.c | 8
-rw-r--r--  fs/reiserfs/journal.c | 1
-rw-r--r--  fs/reiserfs/super.c | 10
-rw-r--r--  fs/reiserfs/xattr.c | 18
-rw-r--r--  fs/reiserfs/xattr_acl.c | 6
-rw-r--r--  fs/romfs/super.c | 9
-rw-r--r--  fs/splice.c | 24
-rw-r--r--  fs/squashfs/super.c | 9
-rw-r--r--  fs/super.c | 5
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/sysfs/inode.c | 11
-rw-r--r--  fs/sysfs/sysfs.h | 2
-rw-r--r--  fs/sysv/inode.c | 9
-rw-r--r--  fs/sysv/namei.c | 5
-rw-r--r--  fs/sysv/super.c | 2
-rw-r--r--  fs/ubifs/super.c | 10
-rw-r--r--  fs/udf/super.c | 9
-rw-r--r--  fs/ufs/super.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 101
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 37
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 1
-rw-r--r--  fs/xfs/xfs_acl.h | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 85
-rw-r--r--  fs/xfs/xfs_bmap.h | 5
-rw-r--r--  fs/xfs/xfs_dfrag.c | 13
-rw-r--r--  fs/xfs/xfs_error.c | 3
-rw-r--r--  fs/xfs/xfs_error.h | 5
-rw-r--r--  fs/xfs/xfs_filestream.c | 8
-rw-r--r--  fs/xfs/xfs_iget.c | 13
-rw-r--r--  fs/xfs/xfs_inode_item.c | 31
-rw-r--r--  fs/xfs/xfs_mount.c | 1
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_quota.h | 20
-rw-r--r--  fs/xfs/xfs_rename.c | 1
299 files changed, 6288 insertions, 3139 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 12d602351dbe..6e58c4ca1e6e 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -91,11 +91,14 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
 	return acl;
 }
 
-int v9fs_check_acl(struct inode *inode, int mask)
+int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	v9ses = v9fs_inode2v9ses(inode);
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
 		/*
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 59e18c2e8c7e..7ef3ac9f6d95 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
-extern int v9fs_check_acl(struct inode *inode, int mask);
+extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index cbf4e50f3933..466d2a4fc5cb 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -51,7 +51,7 @@
  *
  */
 
-static int v9fs_dentry_delete(struct dentry *dentry)
+static int v9fs_dentry_delete(const struct dentry *dentry)
 {
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
 		   dentry);
@@ -68,7 +68,7 @@ static int v9fs_dentry_delete(struct dentry *dentry)
  *
  */
 
-static int v9fs_cached_dentry_delete(struct dentry *dentry)
+static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 34bf71b56542..59782981b225 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -237,10 +237,17 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
  *
  */
 
-void v9fs_destroy_inode(struct inode *inode)
+static void v9fs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
 }
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, v9fs_i_callback);
+}
 #endif
 
 /**
@@ -270,11 +277,11 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 {
 	struct dentry *dentry;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&inode->i_lock);
 	/* Directory should have only one entry. */
 	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
 	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
 	return dentry;
 }
 
@@ -628,9 +635,9 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	if (v9ses->cache)
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	else
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
@@ -742,7 +749,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err);
 		goto error;
 	}
-	dentry->d_op = &v9fs_cached_dentry_operations;
+	d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -760,7 +767,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
@@ -949,7 +956,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			err);
 		goto error;
 	}
-	dentry->d_op = &v9fs_cached_dentry_operations;
+	d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -966,7 +973,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
@@ -1034,9 +1041,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 inst_out:
 	if (v9ses->cache)
-		dentry->d_op = &v9fs_cached_dentry_operations;
+		d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	else
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 
 	d_add(dentry, inode);
 	return NULL;
@@ -1702,7 +1709,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 			err);
 		goto error;
 	}
-	dentry->d_op = &v9fs_cached_dentry_operations;
+	d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -1715,7 +1722,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 
@@ -1849,7 +1856,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		ihold(old_dentry->d_inode);
 	}
 
-	dentry->d_op = old_dentry->d_op;
+	d_set_d_op(dentry, old_dentry->d_op);
 	d_instantiate(dentry, old_dentry->d_inode);
 
 	return err;
@@ -1973,7 +1980,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err);
 		goto error;
 	}
-	dentry->d_op = &v9fs_cached_dentry_operations;
+	d_set_d_op(dentry, &v9fs_cached_dentry_operations);
 	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
@@ -1989,7 +1996,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err = PTR_ERR(inode);
 			goto error;
 		}
-		dentry->d_op = &v9fs_dentry_operations;
+		d_set_d_op(dentry, &v9fs_dentry_operations);
 		d_instantiate(dentry, inode);
 	}
 	/* Now set the ACL based on the default value */
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index f4287e4de744..bf7693c384f9 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -201,7 +201,8 @@ const struct file_operations adfs_dir_operations = {
 };
 
 static int
-adfs_hash(struct dentry *parent, struct qstr *qstr)
+adfs_hash(const struct dentry *parent, const struct inode *inode,
+		struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -237,17 +238,19 @@ adfs_hash(struct dentry *parent, struct qstr *qstr)
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
+adfs_compare(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
 
-	if (entry->len != name->len)
+	if (len != name->len)
 		return 1;
 
 	for (i = 0; i < name->len; i++) {
 		char a, b;
 
-		a = entry->name[i];
+		a = str[i];
 		b = name->name[i];
 
 		if (a >= 'A' && a <= 'Z')
@@ -273,7 +276,7 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct object_info obj;
 	int error;
 
-	dentry->d_op = &adfs_dentry_operations;
+	d_set_d_op(dentry, &adfs_dentry_operations);
 	lock_kernel();
 	error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
 	if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 959dbff2d42d..a4041b52fbca 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -240,11 +240,18 @@ static struct inode *adfs_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
-static void adfs_destroy_inode(struct inode *inode)
+static void adfs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
+static void adfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, adfs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -477,7 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		adfs_error(sb, "get root inode failed\n");
 		goto error;
 	} else
-		sb->s_root->d_op = &adfs_dentry_operations;
+		d_set_d_op(sb->s_root, &adfs_dentry_operations);
 	unlock_kernel();
 	return 0;
 
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7d0f0a30f7a3..3a4557e8325c 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -128,7 +128,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 	void *data = dentry->d_fsdata;
 	struct list_head *head, *next;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&inode->i_lock);
 	head = &inode->i_dentry;
 	next = head->next;
 	while (next != head) {
@@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u32 entry_ino)
 		}
 		next = next->next;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
 }
 
 
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 914d1c0bc07a..944a4042fb65 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,11 +13,19 @@
 typedef int (*toupper_t)(int);
 
 static int affs_toupper(int ch);
-static int affs_hash_dentry(struct dentry *, struct qstr *);
-static int affs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+static int affs_hash_dentry(const struct dentry *,
+		const struct inode *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(struct dentry *, struct qstr *);
-static int affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
+static int affs_intl_hash_dentry(const struct dentry *,
+		const struct inode *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent,
+		const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name);
 
 const struct dentry_operations affs_dentry_operations = {
 	.d_hash		= affs_hash_dentry,
@@ -58,13 +66,13 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
 	int i;
 
-	i = affs_check_name(qstr->name,qstr->len);
+	i = affs_check_name(qstr->name, qstr->len);
 	if (i)
 		return i;
 
@@ -78,39 +86,41 @@ __affs_hash_dentry(struct dentry *dentry, struct qstr *qstr, toupper_t toupper)
 }
 
 static int
-affs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
-	return __affs_hash_dentry(dentry, qstr, affs_toupper);
+	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(struct dentry *dentry, struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
 {
-	return __affs_hash_dentry(dentry, qstr, affs_intl_toupper);
+	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
 
-static inline int
-__affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, toupper_t toupper)
+static inline int __affs_compare_dentry(unsigned int len,
+		const char *str, const struct qstr *name, toupper_t toupper)
 {
-	const u8 *aname = a->name;
-	const u8 *bname = b->name;
-	int len;
+	const u8 *aname = str;
+	const u8 *bname = name->name;
 
-	/* 'a' is the qstr of an already existing dentry, so the name
-	 * must be valid. 'b' must be validated first.
+	/*
+	 * 'str' is the name of an already existing dentry, so the name
+	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (affs_check_name(b->name,b->len))
+	if (affs_check_name(name->name, name->len))
 		return 1;
 
-	/* If the names are longer than the allowed 30 chars,
+	/*
+	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	len = a->len;
 	if (len >= 30) {
-		if (b->len < 30)
+		if (name->len < 30)
 			return 1;
 		len = 30;
-	} else if (len != b->len)
+	} else if (len != name->len)
 		return 1;
 
 	for (; len > 0; len--)
@@ -121,14 +131,18 @@ __affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b, tou
 }
 
 static int
-affs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(dentry, a, b, affs_toupper);
+	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
+affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(dentry, a, b, affs_intl_toupper);
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
 }
 
 /*
@@ -226,7 +240,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
-	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
+	d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
 	d_add(dentry, inode);
 	return NULL;
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 0cf7f4384cbd..d39081bbe7ce 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -95,11 +95,18 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
 	return &i->vfs_inode;
 }
 
-static void affs_destroy_inode(struct inode *inode)
+static void affs_i_callback(struct rcu_head *head)
 {
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	INIT_LIST_HEAD(&inode->i_dentry);
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
+static void affs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, affs_i_callback);
+}
+
 static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -475,7 +482,7 @@ got_root:
 		printk(KERN_ERR "AFFS: Get root inode failed\n");
 		goto out_error;
 	}
-	sb->s_root->d_op = &affs_dentry_operations;
+	d_set_d_op(sb->s_root, &affs_dentry_operations);
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
 	return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5439e1bc9a86..34a3263d60a4 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
@@ -23,7 +24,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 static int afs_dir_open(struct inode *inode, struct file *file);
 static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
-static int afs_d_delete(struct dentry *dentry);
+static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
 static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 			      loff_t fpos, u64 ino, unsigned dtype);
@@ -581,7 +582,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 success:
-	dentry->d_op = &afs_fs_dentry_operations;
+	d_set_d_op(dentry, &afs_fs_dentry_operations);
 
 	d_add(dentry, inode);
 	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
@@ -607,6 +608,9 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 	void *dir_version;
 	int ret;
 
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
 	vnode = AFS_FS_I(dentry->d_inode);
 
 	if (dentry->d_inode)
@@ -730,7 +734,7 @@ out_bad:
  * - called from dput() when d_count is going to 0.
  * - return 1 to request dentry be unhashed, 0 otherwise
  */
-static int afs_d_delete(struct dentry *dentry)
+static int afs_d_delete(const struct dentry *dentry)
 {
 	_enter("%s", dentry->d_name.name);
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index cca8eef736fc..6d4bc1c8ff60 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -624,7 +624,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int);
+extern int afs_permission(struct inode *, int, unsigned int);
 
 /*
  * server.c
diff --git a/fs/afs/security.c b/fs/afs/security.c
index bb4ed144d0e4..f44b9d355377 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -285,13 +285,16 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask)
+int afs_permission(struct inode *inode, int mask, unsigned int flags)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
 	struct key *key;
 	int ret;
 
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
 	_enter("{{%x:%u},%lx},%x,",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
 
@@ -347,7 +350,7 @@ int afs_permission(struct inode *inode, int mask)
 	}
 
 	key_put(key);
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, flags, NULL);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 27201cffece4..f901a9d7c111 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -498,6 +498,14 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	return &vnode->vfs_inode;
 }
 
+static void afs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(afs_inode_cachep, vnode);
+}
+
 /*
  * destroy an AFS inode struct
  */
@@ -511,7 +519,7 @@ static void afs_destroy_inode(struct inode *inode)
 
 	ASSERTCMP(vnode->server, ==, NULL);
 
-	kmem_cache_free(afs_inode_cachep, vnode);
+	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
 }
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 57ce55b2564c..5fd38112a6ca 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -102,7 +102,7 @@ struct file *anon_inode_getfile(const char *name,
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = 0;
-	path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
 	if (!path.dentry)
 		goto err_module;
 
@@ -113,7 +113,7 @@ struct file *anon_inode_getfile(const char *name,
 	 */
 	ihold(anon_inode_inode);
 
-	path.dentry->d_op = &anon_inodefs_dentry_operations;
+	d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
 	d_instantiate(path.dentry, anon_inode_inode);
 
 	error = -ENFILE;
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
-	mntput(anon_inode_mnt);
+	mntput_long(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
 err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283abf67d7..0fffe1c24cec 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -16,6 +16,7 @@
 #include <linux/auto_fs4.h>
 #include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/list.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
@@ -60,6 +61,8 @@ do { \
 		current->pid, __func__, ##args); \
 } while (0)
 
+extern spinlock_t autofs4_lock;
+
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
    structure. It holds a reference to the dentry, so dentries are never
@@ -254,17 +257,15 @@ static inline int simple_positive(struct dentry *dentry)
 	return dentry->d_inode && !d_unhashed(dentry);
 }
 
-static inline int __simple_empty(struct dentry *dentry)
+static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
-	struct dentry *child;
-	int ret = 0;
-
-	list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child)
-		if (simple_positive(child))
-			goto out;
-	ret = 1;
-out:
-	return ret;
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino) {
+		if (list_empty(&ino->expiring))
+			list_add(&ino->expiring, &sbi->expiring_list);
+	}
+	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index a796c9417fb1..cc1d01365905 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -91,24 +91,64 @@ done:
 }
 
 /*
- * Calculate next entry in top down tree traversal.
- * From next_mnt in namespace.c - elegant.
+ * Calculate and dget next entry in top down tree traversal.
  */
-static struct dentry *next_dentry(struct dentry *p, struct dentry *root)
+static struct dentry *get_next_positive_dentry(struct dentry *prev,
+						struct dentry *root)
 {
-	struct list_head *next = p->d_subdirs.next;
+	struct list_head *next;
+	struct dentry *p, *ret;
+
+	if (prev == NULL)
+		return dget(prev);
 
+	spin_lock(&autofs4_lock);
+relock:
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_subdirs.next;
 	if (next == &p->d_subdirs) {
 		while (1) {
-			if (p == root)
+			struct dentry *parent;
+
+			if (p == root) {
+				spin_unlock(&p->d_lock);
+				spin_unlock(&autofs4_lock);
+				dput(prev);
 				return NULL;
+			}
+
+			parent = p->d_parent;
+			if (!spin_trylock(&parent->d_lock)) {
+				spin_unlock(&p->d_lock);
+				cpu_relax();
+				goto relock;
+			}
+			spin_unlock(&p->d_lock);
 			next = p->d_u.d_child.next;
-			if (next != &p->d_parent->d_subdirs)
+			p = parent;
+			if (next != &parent->d_subdirs)
 				break;
-			p = p->d_parent;
 		}
 	}
-	return list_entry(next, struct dentry, d_u.d_child);
+	ret = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(ret)) {
+		spin_unlock(&ret->d_lock);
+		p = ret;
+		goto again;
+	}
+	dget_dlock(ret);
+	spin_unlock(&ret->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&autofs4_lock);
+
+	dput(prev);
+
+	return ret;
 }
 
 /*
@@ -158,18 +198,11 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 	if (!simple_positive(top))
 		return 1;
 
-	spin_lock(&dcache_lock);
-	for (p = top; p; p = next_dentry(p, top)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, top))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		/*
 		 * Is someone visiting anywhere in the subtree ?
 		 * If there's no mount we need to check the usage
@@ -198,16 +231,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			else
 				ino_count++;
 
-			if (atomic_read(&p->d_count) > ino_count) {
+			if (p->d_count > ino_count) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
 			}
 		}
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs4_can_expire(top, timeout, do_now))
@@ -226,32 +256,21 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
 	DPRINTK("parent %p %.*s",
 		parent, (int)parent->d_name.len, parent->d_name.name);
 
-	spin_lock(&dcache_lock);
-	for (p = parent; p; p = next_dentry(p, parent)) {
-		/* Negative dentry - give up */
-		if (!simple_positive(p))
-			continue;
-
+	p = NULL;
+	while ((p = get_next_positive_dentry(p, parent))) {
 		DPRINTK("dentry %p %.*s",
 			p, (int) p->d_name.len, p->d_name.name);
 
-		p = dget(p);
-		spin_unlock(&dcache_lock);
-
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
 			if (autofs4_mount_busy(mnt, p))
-				goto cont;
+				continue;
 
 			/* Can we expire this guy */
 			if (autofs4_can_expire(p, timeout, do_now))
 				return p;
 		}
-cont:
-		dput(p);
-		spin_lock(&dcache_lock);
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 }
 
@@ -276,7 +295,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	struct autofs_info *ino = autofs4_dentry_ino(root);
 	if (d_mountpoint(root)) {
 		ino->flags |= AUTOFS_INF_MOUNTPOINT;
-		root->d_mounted--;
+		spin_lock(&root->d_lock);
+		root->d_flags &= ~DCACHE_MOUNTED;
+		spin_unlock(&root->d_lock);
 	}
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
@@ -302,8 +323,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
+	struct dentry *dentry;
 	struct dentry *expired = NULL;
-	struct list_head *next;
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 	struct autofs_info *ino;
@@ -315,23 +336,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	spin_lock(&dcache_lock);
-	next = root->d_subdirs.next;
-
-	/* On exit from the loop expire is set to a dgot dentry
-	 * to expire or it's NULL */
-	while ( next != &root->d_subdirs ) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - give up */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		dentry = dget(dentry);
-		spin_unlock(&dcache_lock);
-
+	dentry = NULL;
+	while ((dentry = get_next_positive_dentry(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 
@@ -347,7 +353,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 		/* Path walk currently on this dentry? */
 		ino_count = atomic_read(&ino->count) + 2;
-		if (atomic_read(&dentry->d_count) > ino_count)
+		if (dentry->d_count > ino_count)
 			goto next;
 
 		/* Can we umount this guy */
@@ -369,7 +375,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		if (!exp_leaves) {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (atomic_read(&dentry->d_count) > ino_count)
+			if (dentry->d_count > ino_count)
 				goto next;
 
 			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -383,7 +389,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		} else {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (atomic_read(&dentry->d_count) > ino_count)
+			if (dentry->d_count > ino_count)
 				goto next;
 
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
@@ -394,11 +400,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		}
 next:
 		spin_unlock(&sbi->fs_lock);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		next = next->next;
 	}
-	spin_unlock(&dcache_lock);
 	return NULL;
 
 found:
@@ -408,9 +410,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&expired->d_parent->d_lock);
+	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&expired->d_lock);
+	spin_unlock(&expired->d_parent->d_lock);
+	spin_unlock(&autofs4_lock);
 	return expired;
 }
 
@@ -499,7 +505,14 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 	spin_lock(&sbi->fs_lock);
 	if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-		sb->s_root->d_mounted++;
+		spin_lock(&sb->s_root->d_lock);
+		/*
+		 * If we haven't been expired away, then reset
+		 * mounted status.
+		 */
+		if (mnt->mnt_parent != mnt)
+			sb->s_root->d_flags |= DCACHE_MOUNTED;
+		spin_unlock(&sb->s_root->d_lock);
 		ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 	}
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ac87e49fa706..a7bdb9dcac84 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -309,7 +309,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		goto fail_iput;
 	pipe = NULL;
 
-	root->d_op = &autofs4_sb_dentry_operations;
+	d_set_d_op(root, &autofs4_sb_dentry_operations);
 	root->d_fsdata = ino;
 
 	/* Can this call block? */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d5c1401f0031..651e4ef563b1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,6 +23,8 @@
 
 #include "autofs_i.h"
 
+DEFINE_SPINLOCK(autofs4_lock);
+
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -142,12 +144,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&autofs4_lock);
 		return -ENOENT;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&autofs4_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -252,9 +257,11 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	/* We trigger a mount for almost all flags */
 	lookup_type = autofs4_need_mount(nd->flags);
 	spin_lock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&dentry->d_lock);
 	if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
-		spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&autofs4_lock);
 		spin_unlock(&sbi->fs_lock);
 		goto follow;
 	}
@@ -266,7 +273,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 */
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&autofs4_lock);
 		spin_unlock(&sbi->fs_lock);
 
 		status = try_to_fill_dentry(dentry, nd->flags);
@@ -275,7 +283,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 		goto follow;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&autofs4_lock);
 	spin_unlock(&sbi->fs_lock);
 follow:
 	/*
@@ -306,12 +315,19 @@ out_error:
  */
 static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
-	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	int oz_mode = autofs4_oz_mode(sbi);
+	struct inode *dir;
+	struct autofs_sb_info *sbi;
+	int oz_mode;
 	int flags = nd ? nd->flags : 0;
 	int status = 1;
 
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	dir = dentry->d_parent->d_inode;
+	sbi = autofs4_sbi(dir->i_sb);
+	oz_mode = autofs4_oz_mode(sbi);
+
 	/* Pending dentry */
 	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
@@ -346,12 +362,14 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&dentry->d_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&autofs4_lock);
 
 		/* The daemon never causes a mount to trigger */
 		if (oz_mode)
@@ -367,7 +385,8 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 		return status;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&autofs4_lock);
 
 	return 1;
 }
@@ -422,7 +441,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -436,7 +455,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (atomic_read(&active->d_count) == 0)
+		if (active->d_count == 0)
 			goto next;
 
 		qstr = &active->d_name;
@@ -452,17 +471,17 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			goto next;
 
 		if (d_unhashed(active)) {
-			dget(active);
+			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -477,7 +496,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -507,17 +526,17 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			goto next;
 
 		if (d_unhashed(expiring)) {
-			dget(expiring);
+			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -559,7 +578,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	 * we check for the hashed dentry and return the newly
 	 * hashed dentry.
	 */
-	dentry->d_op = &autofs4_root_dentry_operations;
+	d_set_d_op(dentry, &autofs4_root_dentry_operations);
 
 	/*
 	 * And we need to ensure that the same dentry is used for
@@ -698,9 +717,9 @@ static int autofs4_dir_symlink(struct inode *dir,
 	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
-		dentry->d_op = &autofs4_root_dentry_operations;
+		d_set_d_op(dentry, &autofs4_root_dentry_operations);
 	else
-		dentry->d_op = &autofs4_dentry_operations;
+		d_set_d_op(dentry, &autofs4_dentry_operations);
 
 	dentry->d_fsdata = ino;
 	ino->dentry = dget(dentry);
@@ -753,12 +772,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
 	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	return 0;
 }
@@ -775,16 +794,20 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
+	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dcache_lock);
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&sbi->lookup_lock);
+		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
-	autofs4_add_expiring(dentry);
-	spin_lock(&dentry->d_lock);
+	__autofs4_add_expiring(dentry);
+	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&autofs4_lock);
 
 	if (atomic_dec_and_test(&ino->count)) {
790 p_ino = autofs4_dentry_ino(dentry->d_parent); 813 p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -829,9 +852,9 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
829 d_add(dentry, inode); 852 d_add(dentry, inode);
830 853
831 if (dir == dir->i_sb->s_root->d_inode) 854 if (dir == dir->i_sb->s_root->d_inode)
832 dentry->d_op = &autofs4_root_dentry_operations; 855 d_set_d_op(dentry, &autofs4_root_dentry_operations);
833 else 856 else
834 dentry->d_op = &autofs4_dentry_operations; 857 d_set_d_op(dentry, &autofs4_dentry_operations);
835 858
836 dentry->d_fsdata = ino; 859 dentry->d_fsdata = ino;
837 ino->dentry = dget(dentry); 860 ino->dentry = dget(dentry);
@@ -980,19 +1003,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
980 } 1003 }
981} 1004}
982 1005
983static DEFINE_MUTEX(autofs4_ioctl_mutex);
984
985static long autofs4_root_ioctl(struct file *filp, 1006static long autofs4_root_ioctl(struct file *filp,
986 unsigned int cmd, unsigned long arg) 1007 unsigned int cmd, unsigned long arg)
987{ 1008{
988 long ret;
989 struct inode *inode = filp->f_dentry->d_inode; 1009 struct inode *inode = filp->f_dentry->d_inode;
990 1010 return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
991 mutex_lock(&autofs4_ioctl_mutex);
992 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
993 mutex_unlock(&autofs4_ioctl_mutex);
994
995 return ret;
996} 1011}
997 1012
998#ifdef CONFIG_COMPAT 1013#ifdef CONFIG_COMPAT
@@ -1002,13 +1017,11 @@ static long autofs4_root_compat_ioctl(struct file *filp,
1002 struct inode *inode = filp->f_path.dentry->d_inode; 1017 struct inode *inode = filp->f_path.dentry->d_inode;
1003 int ret; 1018 int ret;
1004 1019
1005 mutex_lock(&autofs4_ioctl_mutex);
1006 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) 1020 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1007 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); 1021 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1008 else 1022 else
1009 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, 1023 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1010 (unsigned long)compat_ptr(arg)); 1024 (unsigned long)compat_ptr(arg));
1011 mutex_unlock(&autofs4_ioctl_mutex);
1012 1025
1013 return ret; 1026 return ret;
1014} 1027}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375386f8..c5f8459c905e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
186{ 186{
187 struct dentry *root = sbi->sb->s_root; 187 struct dentry *root = sbi->sb->s_root;
188 struct dentry *tmp; 188 struct dentry *tmp;
189 char *buf = *name; 189 char *buf;
190 char *p; 190 char *p;
191 int len = 0; 191 int len;
192 unsigned seq;
192 193
193 spin_lock(&dcache_lock); 194rename_retry:
195 buf = *name;
196 len = 0;
197
198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock();
200 spin_lock(&autofs4_lock);
194 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
195 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
196 203
197 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
198 spin_unlock(&dcache_lock); 205 spin_unlock(&autofs4_lock);
206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry;
199 return 0; 209 return 0;
200 } 210 }
201 211
@@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
208 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
209 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
210 } 220 }
211 spin_unlock(&dcache_lock); 221 spin_unlock(&autofs4_lock);
222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry;
212 225
213 return len; 226 return len;
214} 227}
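
The hunk above replaces dcache_lock in autofs4_getpath() with the rename_lock seqlock plus rcu_read_lock(): the d_parent walk is retried from scratch whenever read_seqretry() reports a concurrent rename, which is why buf and len are reset at the rename_retry label. A minimal sketch of that read-retry shape, using a hypothetical seqlock and data pair rather than anything from this patch:

    /* Illustrative only: generic seqlock read-retry loop. */
    #include <linux/seqlock.h>

    static DEFINE_SEQLOCK(demo_lock);       /* hypothetical lock */
    static int demo_a, demo_b;              /* values updated together */

    static void demo_update_pair(int a, int b)
    {
            write_seqlock(&demo_lock);
            demo_a = a;
            demo_b = b;
            write_sequnlock(&demo_lock);
    }

    static void demo_read_pair(int *a, int *b)
    {
            unsigned seq;

            do {
                    seq = read_seqbegin(&demo_lock);
                    *a = demo_a;            /* snapshot both values */
                    *b = demo_b;
            } while (read_seqretry(&demo_lock, seq));  /* a writer ran: retry */
    }

autofs4_getpath() follows the same shape, except that a retry restarts the whole two-pass name walk rather than a simple re-read.
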
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f024d8aaddef..9ad2369d9e35 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -229,8 +229,11 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
229 return -EIO; 229 return -EIO;
230} 230}
231 231
232static int bad_inode_permission(struct inode *inode, int mask) 232static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags)
233{ 233{
234 if (flags & IPERM_FLAG_RCU)
235 return -ECHILD;
236
234 return -EIO; 237 return -EIO;
235} 238}
236 239
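
This is one of several hunks in the series that add an unsigned int flags argument to permission-checking hooks: when the VFS calls the hook during lockless RCU-walk it passes IPERM_FLAG_RCU, and a hook that cannot complete without sleeping or taking references returns -ECHILD so the lookup is retried in ordinary ref-walk mode. A minimal sketch of that convention, with a hypothetical hook name and the real checks elided:

    #include <linux/fs.h>

    /* Illustrative only: bail out of RCU-walk, do real checks in ref-walk. */
    static int demo_permission(struct inode *inode, int mask, unsigned int flags)
    {
            if (flags & IPERM_FLAG_RCU)
                    return -ECHILD;  /* cannot check locklessly here */

            /* ... blocking or reference-taking checks are safe from here ... */
            return 0;
    }

The btrfs_check_acl() hunk further down follows the same rule, additionally letting the RCU case proceed without -ECHILD when a negative ACL is already cached.
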
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index aa4e7c7ae3c6..de93581b79a2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -284,12 +284,18 @@ befs_alloc_inode(struct super_block *sb)
284 return &bi->vfs_inode; 284 return &bi->vfs_inode;
285} 285}
286 286
287static void 287static void befs_i_callback(struct rcu_head *head)
288befs_destroy_inode(struct inode *inode)
289{ 288{
289 struct inode *inode = container_of(head, struct inode, i_rcu);
290 INIT_LIST_HEAD(&inode->i_dentry);
290 kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); 291 kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
291} 292}
292 293
294static void befs_destroy_inode(struct inode *inode)
295{
296 call_rcu(&inode->i_rcu, befs_i_callback);
297}
298
293static void init_once(void *foo) 299static void init_once(void *foo)
294{ 300{
295 struct befs_inode_info *bi = (struct befs_inode_info *) foo; 301 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 76db6d7d49bb..a8e37f81d097 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -248,11 +248,18 @@ static struct inode *bfs_alloc_inode(struct super_block *sb)
248 return &bi->vfs_inode; 248 return &bi->vfs_inode;
249} 249}
250 250
251static void bfs_destroy_inode(struct inode *inode) 251static void bfs_i_callback(struct rcu_head *head)
252{ 252{
253 struct inode *inode = container_of(head, struct inode, i_rcu);
254 INIT_LIST_HEAD(&inode->i_dentry);
253 kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); 255 kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
254} 256}
255 257
258static void bfs_destroy_inode(struct inode *inode)
259{
260 call_rcu(&inode->i_rcu, bfs_i_callback);
261}
262
256static void init_once(void *foo) 263static void init_once(void *foo)
257{ 264{
258 struct bfs_inode_info *bi = foo; 265 struct bfs_inode_info *bi = foo;
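
The befs and bfs hunks above (and the block_dev.c hunk below) all apply the same conversion: ->destroy_inode() no longer frees the inode directly but queues an i_callback through call_rcu(), so lockless path walkers that may still dereference the inode are guaranteed a full grace period before the memory goes back to the slab; the callback also reinitializes i_dentry. The generic template, with hypothetical "foo" names standing in for the per-filesystem types:

    #include <linux/fs.h>
    #include <linux/slab.h>

    struct foo_inode_info {
            /* ... filesystem-private fields ... */
            struct inode vfs_inode;
    };

    static struct kmem_cache *foo_inode_cachep;     /* hypothetical slab cache */

    static inline struct foo_inode_info *FOO_I(struct inode *inode)
    {
            return container_of(inode, struct foo_inode_info, vfs_inode);
    }

    static void foo_i_callback(struct rcu_head *head)
    {
            struct inode *inode = container_of(head, struct inode, i_rcu);

            INIT_LIST_HEAD(&inode->i_dentry);
            kmem_cache_free(foo_inode_cachep, FOO_I(inode));
    }

    static void foo_destroy_inode(struct inode *inode)
    {
            call_rcu(&inode->i_rcu, foo_i_callback);
    }
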
diff --git a/fs/bio.c b/fs/bio.c
index 8abb2dfb2e7c..4bd454fa844e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
370{ 370{
371 struct bio *bio; 371 struct bio *bio;
372 372
373 if (nr_iovecs > UIO_MAXIOV)
374 return NULL;
375
373 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), 376 bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
374 gfp_mask); 377 gfp_mask);
375 if (unlikely(!bio)) 378 if (unlikely(!bio))
@@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd)
697static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, 700static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
698 gfp_t gfp_mask) 701 gfp_t gfp_mask)
699{ 702{
700 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); 703 struct bio_map_data *bmd;
701 704
705 if (iov_count > UIO_MAXIOV)
706 return NULL;
707
708 bmd = kmalloc(sizeof(*bmd), gfp_mask);
702 if (!bmd) 709 if (!bmd)
703 return NULL; 710 return NULL;
704 711
@@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
827 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 834 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
828 start = uaddr >> PAGE_SHIFT; 835 start = uaddr >> PAGE_SHIFT;
829 836
837 /*
838 * Overflow, abort
839 */
840 if (end < start)
841 return ERR_PTR(-EINVAL);
842
830 nr_pages += end - start; 843 nr_pages += end - start;
831 len += iov[i].iov_len; 844 len += iov[i].iov_len;
832 } 845 }
@@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
955 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 968 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
956 unsigned long start = uaddr >> PAGE_SHIFT; 969 unsigned long start = uaddr >> PAGE_SHIFT;
957 970
971 /*
972 * Overflow, abort
973 */
974 if (end < start)
975 return ERR_PTR(-EINVAL);
976
958 nr_pages += end - start; 977 nr_pages += end - start;
959 /* 978 /*
960 * buffer must be aligned to at least hardsector size for now 979 * buffer must be aligned to at least hardsector size for now
@@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
982 unsigned long start = uaddr >> PAGE_SHIFT; 1001 unsigned long start = uaddr >> PAGE_SHIFT;
983 const int local_nr_pages = end - start; 1002 const int local_nr_pages = end - start;
984 const int page_limit = cur_page + local_nr_pages; 1003 const int page_limit = cur_page + local_nr_pages;
985 1004
986 ret = get_user_pages_fast(uaddr, local_nr_pages, 1005 ret = get_user_pages_fast(uaddr, local_nr_pages,
987 write_to_vm, &pages[cur_page]); 1006 write_to_vm, &pages[cur_page]);
988 if (ret < local_nr_pages) { 1007 if (ret < local_nr_pages) {
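
The bio.c hunks add two defensive checks: iovec counts are capped at UIO_MAXIOV before any allocation is sized from them, and each segment's page range is rejected if rounding uaddr + iov_len up to a page boundary wraps around, which would otherwise make end - start a bogus page count. A standalone illustration of the wraparound condition the new end < start tests catch (plain C, not kernel code):

    /* Illustrative helper: page span of a user buffer, failing on wraparound. */
    static int iov_page_span(unsigned long uaddr, unsigned long len,
                             unsigned long page_size, unsigned long *pages)
    {
            unsigned long start = uaddr / page_size;
            unsigned long end = (uaddr + len + page_size - 1) / page_size;

            if (end < start)   /* uaddr + len wrapped past the top of the address space */
                    return -1;

            *pages = end - start;
            return 0;
    }
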
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 06e8ff12b97c..771f23527010 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -11,7 +11,6 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/major.h> 13#include <linux/major.h>
14#include <linux/smp_lock.h>
15#include <linux/device_cgroup.h> 14#include <linux/device_cgroup.h>
16#include <linux/highmem.h> 15#include <linux/highmem.h>
17#include <linux/blkdev.h> 16#include <linux/blkdev.h>
@@ -410,13 +409,20 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
410 return &ei->vfs_inode; 409 return &ei->vfs_inode;
411} 410}
412 411
413static void bdev_destroy_inode(struct inode *inode) 412static void bdev_i_callback(struct rcu_head *head)
414{ 413{
414 struct inode *inode = container_of(head, struct inode, i_rcu);
415 struct bdev_inode *bdi = BDEV_I(inode); 415 struct bdev_inode *bdi = BDEV_I(inode);
416 416
417 INIT_LIST_HEAD(&inode->i_dentry);
417 kmem_cache_free(bdev_cachep, bdi); 418 kmem_cache_free(bdev_cachep, bdi);
418} 419}
419 420
421static void bdev_destroy_inode(struct inode *inode)
422{
423 call_rcu(&inode->i_rcu, bdev_i_callback);
424}
425
420static void init_once(void *foo) 426static void init_once(void *foo)
421{ 427{
422 struct bdev_inode *ei = (struct bdev_inode *) foo; 428 struct bdev_inode *ei = (struct bdev_inode *) foo;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..6ae2c8cac9d5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -185,18 +185,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
185 return ret; 185 return ret;
186} 186}
187 187
188int btrfs_check_acl(struct inode *inode, int mask) 188int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
189{ 189{
190 struct posix_acl *acl;
191 int error = -EAGAIN; 190 int error = -EAGAIN;
192 191
193 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); 192 if (flags & IPERM_FLAG_RCU) {
193 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
194 error = -ECHILD;
194 195
195 if (IS_ERR(acl)) 196 } else {
196 return PTR_ERR(acl); 197 struct posix_acl *acl;
197 if (acl) { 198 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
198 error = posix_acl_permission(inode, acl, mask); 199 if (IS_ERR(acl))
199 posix_acl_release(acl); 200 return PTR_ERR(acl);
201 if (acl) {
202 error = posix_acl_permission(inode, acl, mask);
203 posix_acl_release(acl);
204 }
200 } 205 }
201 206
202 return error; 207 return error;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7845d1f7d1d9..b50bc4bd5c56 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
91static struct bio *compressed_bio_alloc(struct block_device *bdev, 91static struct bio *compressed_bio_alloc(struct block_device *bdev,
92 u64 first_byte, gfp_t gfp_flags) 92 u64 first_byte, gfp_t gfp_flags)
93{ 93{
94 struct bio *bio;
95 int nr_vecs; 94 int nr_vecs;
96 95
97 nr_vecs = bio_get_nr_vecs(bdev); 96 nr_vecs = bio_get_nr_vecs(bdev);
98 bio = bio_alloc(gfp_flags, nr_vecs); 97 return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
99
100 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
101 while (!bio && (nr_vecs /= 2))
102 bio = bio_alloc(gfp_flags, nr_vecs);
103 }
104
105 if (bio) {
106 bio->bi_size = 0;
107 bio->bi_bdev = bdev;
108 bio->bi_sector = first_byte >> 9;
109 }
110 return bio;
111} 98}
112 99
113static int check_compressed_csum(struct inode *inode, 100static int check_compressed_csum(struct inode *inode,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8db9234f6b41..a142d204b526 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -808,9 +808,9 @@ struct btrfs_block_group_cache {
808 int extents_thresh; 808 int extents_thresh;
809 int free_extents; 809 int free_extents;
810 int total_bitmaps; 810 int total_bitmaps;
811 int ro:1; 811 unsigned int ro:1;
812 int dirty:1; 812 unsigned int dirty:1;
813 int iref:1; 813 unsigned int iref:1;
814 814
815 int disk_cache_state; 815 int disk_cache_state;
816 816
@@ -2544,7 +2544,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
2544 2544
2545/* acl.c */ 2545/* acl.c */
2546#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2546#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2547int btrfs_check_acl(struct inode *inode, int mask); 2547int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2548#else 2548#else
2549#define btrfs_check_acl NULL 2549#define btrfs_check_acl NULL
2550#endif 2550#endif
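
The ctree.h hunk turns the ro/dirty/iref bitfields from int into unsigned int. A signed one-bit bitfield can only represent 0 and -1 on the usual ABIs, so storing 1 typically reads back as -1 and a test such as cache->ro == 1 never matches. A standalone demonstration (the signedness of plain int bitfields is implementation-defined, but this is the default gcc behaviour):

    #include <stdio.h>

    struct flags_signed   { int ro:1; };
    struct flags_unsigned { unsigned int ro:1; };

    int main(void)
    {
            struct flags_signed   s = { .ro = 1 };
            struct flags_unsigned u = { .ro = 1 };

            printf("signed ro = %d, unsigned ro = %d\n", s.ro, (int)u.ro);
            if (s.ro == 1)
                    printf("matched\n");    /* usually not reached */
            return 0;
    }
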
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb827d0d7181..51d2e4de34eb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
31#include "compat.h" 32#include "compat.h"
32#include "ctree.h" 33#include "ctree.h"
33#include "disk-io.h" 34#include "disk-io.h"
@@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
355 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
356 btrfs_header_generation(eb)); 357 btrfs_header_generation(eb));
357 BUG_ON(ret); 358 BUG_ON(ret);
359 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
360
358 found_start = btrfs_header_bytenr(eb); 361 found_start = btrfs_header_bytenr(eb);
359 if (found_start != start) { 362 if (found_start != start) {
360 WARN_ON(1); 363 WARN_ON(1);
@@ -693,6 +696,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
693 __btree_submit_bio_done); 696 __btree_submit_bio_done);
694} 697}
695 698
699#ifdef CONFIG_MIGRATION
700static int btree_migratepage(struct address_space *mapping,
701 struct page *newpage, struct page *page)
702{
703 /*
704 * we can't safely write a btree page from here,
705 * we haven't done the locking hook
706 */
707 if (PageDirty(page))
708 return -EAGAIN;
709 /*
710 * Buffers may be managed in a filesystem specific way.
711 * We must have no buffers or drop them.
712 */
713 if (page_has_private(page) &&
714 !try_to_release_page(page, GFP_KERNEL))
715 return -EAGAIN;
716 return migrate_page(mapping, newpage, page);
717}
718#endif
719
696static int btree_writepage(struct page *page, struct writeback_control *wbc) 720static int btree_writepage(struct page *page, struct writeback_control *wbc)
697{ 721{
698 struct extent_io_tree *tree; 722 struct extent_io_tree *tree;
@@ -707,8 +731,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
707 } 731 }
708 732
709 redirty_page_for_writepage(wbc, page); 733 redirty_page_for_writepage(wbc, page);
710 eb = btrfs_find_tree_block(root, page_offset(page), 734 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
711 PAGE_CACHE_SIZE);
712 WARN_ON(!eb); 735 WARN_ON(!eb);
713 736
714 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 737 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -799,6 +822,9 @@ static const struct address_space_operations btree_aops = {
799 .releasepage = btree_releasepage, 822 .releasepage = btree_releasepage,
800 .invalidatepage = btree_invalidatepage, 823 .invalidatepage = btree_invalidatepage,
801 .sync_page = block_sync_page, 824 .sync_page = block_sync_page,
825#ifdef CONFIG_MIGRATION
826 .migratepage = btree_migratepage,
827#endif
802}; 828};
803 829
804int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 830int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -981,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
981 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1007 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
982 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1008 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
983 blocksize, generation); 1009 blocksize, generation);
984 BUG_ON(!root->node); 1010 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1011 free_extent_buffer(root->node);
1012 return -EIO;
1013 }
985 root->commit_root = btrfs_root_node(root); 1014 root->commit_root = btrfs_root_node(root);
986 return 0; 1015 return 0;
987} 1016}
@@ -1538,10 +1567,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1538 GFP_NOFS); 1567 GFP_NOFS);
1539 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1568 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1540 GFP_NOFS); 1569 GFP_NOFS);
1541 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1570 struct btrfs_root *tree_root = btrfs_sb(sb);
1542 GFP_NOFS); 1571 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1543 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1544 GFP_NOFS);
1545 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1572 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1546 GFP_NOFS); 1573 GFP_NOFS);
1547 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1574 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f4..0ccf9a8afcdf 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -110,7 +110,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
110 110
111 dentry = d_obtain_alias(inode); 111 dentry = d_obtain_alias(inode);
112 if (!IS_ERR(dentry)) 112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations; 113 d_set_d_op(dentry, &btrfs_dentry_operations);
114 return dentry; 114 return dentry;
115fail: 115fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 116 srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 166static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 167{
168 struct inode *dir = child->d_inode; 168 struct inode *dir = child->d_inode;
169 static struct dentry *dentry; 169 struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 170 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 171 struct btrfs_path *path;
172 struct extent_buffer *leaf; 172 struct extent_buffer *leaf;
@@ -225,16 +225,92 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
225 key.offset = 0; 225 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry)) 227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations; 228 d_set_d_op(dentry, &btrfs_dentry_operations);
229 return dentry; 229 return dentry;
230fail: 230fail:
231 btrfs_free_path(path); 231 btrfs_free_path(path);
232 return ERR_PTR(ret); 232 return ERR_PTR(ret);
233} 233}
234 234
235static int btrfs_get_name(struct dentry *parent, char *name,
236 struct dentry *child)
237{
238 struct inode *inode = child->d_inode;
239 struct inode *dir = parent->d_inode;
240 struct btrfs_path *path;
241 struct btrfs_root *root = BTRFS_I(dir)->root;
242 struct btrfs_inode_ref *iref;
243 struct btrfs_root_ref *rref;
244 struct extent_buffer *leaf;
245 unsigned long name_ptr;
246 struct btrfs_key key;
247 int name_len;
248 int ret;
249
250 if (!dir || !inode)
251 return -EINVAL;
252
253 if (!S_ISDIR(dir->i_mode))
254 return -EINVAL;
255
256 path = btrfs_alloc_path();
257 if (!path)
258 return -ENOMEM;
259 path->leave_spinning = 1;
260
261 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
262 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
263 key.type = BTRFS_ROOT_BACKREF_KEY;
264 key.offset = (u64)-1;
265 root = root->fs_info->tree_root;
266 } else {
267 key.objectid = inode->i_ino;
268 key.offset = dir->i_ino;
269 key.type = BTRFS_INODE_REF_KEY;
270 }
271
272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
273 if (ret < 0) {
274 btrfs_free_path(path);
275 return ret;
276 } else if (ret > 0) {
277 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
278 path->slots[0]--;
279 } else {
280 btrfs_free_path(path);
281 return -ENOENT;
282 }
283 }
284 leaf = path->nodes[0];
285
286 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
287 rref = btrfs_item_ptr(leaf, path->slots[0],
288 struct btrfs_root_ref);
289 name_ptr = (unsigned long)(rref + 1);
290 name_len = btrfs_root_ref_name_len(leaf, rref);
291 } else {
292 iref = btrfs_item_ptr(leaf, path->slots[0],
293 struct btrfs_inode_ref);
294 name_ptr = (unsigned long)(iref + 1);
295 name_len = btrfs_inode_ref_name_len(leaf, iref);
296 }
297
298 read_extent_buffer(leaf, name, name_ptr, name_len);
299 btrfs_free_path(path);
300
301 /*
302 * have to add the null termination to make sure that reconnect_path
303 * gets the right len for strlen
304 */
305 name[name_len] = '\0';
306
307 return 0;
308}
309
235const struct export_operations btrfs_export_ops = { 310const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 311 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 312 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 313 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 314 .get_parent = btrfs_get_parent,
315 .get_name = btrfs_get_name,
240}; 316};
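
Besides adding the .get_name export hook, the export.c hunks fix btrfs_get_parent(), whose result pointer was declared static struct dentry *dentry: a function-local static is a single object shared by every invocation, so concurrent export lookups could overwrite each other's result before it was returned. A standalone illustration of why a static local is the wrong place for a per-call result (plain C, not kernel code):

    #include <stdio.h>

    static int *broken_result(int v)
    {
            static int slot;        /* one slot shared by every call */

            slot = v;
            return &slot;           /* every caller gets the same address */
    }

    int main(void)
    {
            int *a = broken_result(1);
            int *b = broken_result(2);

            printf("%d %d\n", *a, *b);   /* prints "2 2": first result overwritten */
            return 0;
    }
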
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c097f3aec41..227e5815d838 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -429,6 +429,7 @@ err:
429 429
430static int cache_block_group(struct btrfs_block_group_cache *cache, 430static int cache_block_group(struct btrfs_block_group_cache *cache,
431 struct btrfs_trans_handle *trans, 431 struct btrfs_trans_handle *trans,
432 struct btrfs_root *root,
432 int load_cache_only) 433 int load_cache_only)
433{ 434{
434 struct btrfs_fs_info *fs_info = cache->fs_info; 435 struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
442 443
443 /* 444 /*
444 * We can't do the read from on-disk cache during a commit since we need 445 * We can't do the read from on-disk cache during a commit since we need
445 * to have the normal tree locking. 446 * to have the normal tree locking. Also if we are currently trying to
447 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks.
446 */ 449 */
447 if (!trans->transaction->in_commit) { 450 if (!trans->transaction->in_commit &&
451 (root && root != root->fs_info->tree_root)) {
448 spin_lock(&cache->lock); 452 spin_lock(&cache->lock);
449 if (cache->cached != BTRFS_CACHE_NO) { 453 if (cache->cached != BTRFS_CACHE_NO) {
450 spin_unlock(&cache->lock); 454 spin_unlock(&cache->lock);
@@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2741 struct btrfs_root *root = block_group->fs_info->tree_root; 2745 struct btrfs_root *root = block_group->fs_info->tree_root;
2742 struct inode *inode = NULL; 2746 struct inode *inode = NULL;
2743 u64 alloc_hint = 0; 2747 u64 alloc_hint = 0;
2748 int dcs = BTRFS_DC_ERROR;
2744 int num_pages = 0; 2749 int num_pages = 0;
2745 int retries = 0; 2750 int retries = 0;
2746 int ret = 0; 2751 int ret = 0;
@@ -2795,6 +2800,8 @@ again:
2795 2800
2796 spin_lock(&block_group->lock); 2801 spin_lock(&block_group->lock);
2797 if (block_group->cached != BTRFS_CACHE_FINISHED) { 2802 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2803 /* We're not cached, don't bother trying to write stuff out */
2804 dcs = BTRFS_DC_WRITTEN;
2798 spin_unlock(&block_group->lock); 2805 spin_unlock(&block_group->lock);
2799 goto out_put; 2806 goto out_put;
2800 } 2807 }
@@ -2821,6 +2828,8 @@ again:
2821 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2828 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2822 num_pages, num_pages, 2829 num_pages, num_pages,
2823 &alloc_hint); 2830 &alloc_hint);
2831 if (!ret)
2832 dcs = BTRFS_DC_SETUP;
2824 btrfs_free_reserved_data_space(inode, num_pages); 2833 btrfs_free_reserved_data_space(inode, num_pages);
2825out_put: 2834out_put:
2826 iput(inode); 2835 iput(inode);
@@ -2828,10 +2837,7 @@ out_free:
2828 btrfs_release_path(root, path); 2837 btrfs_release_path(root, path);
2829out: 2838out:
2830 spin_lock(&block_group->lock); 2839 spin_lock(&block_group->lock);
2831 if (ret) 2840 block_group->disk_cache_state = dcs;
2832 block_group->disk_cache_state = BTRFS_DC_ERROR;
2833 else
2834 block_group->disk_cache_state = BTRFS_DC_SETUP;
2835 spin_unlock(&block_group->lock); 2841 spin_unlock(&block_group->lock);
2836 2842
2837 return ret; 2843 return ret;
@@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3037 3043
3038u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3044u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3039{ 3045{
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3046 /*
3047 * we add in the count of missing devices because we want
3048 * to make sure that any RAID levels on a degraded FS
3049 * continue to be honored.
3050 */
3051 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3052 root->fs_info->fs_devices->missing_devices;
3041 3053
3042 if (num_devices == 1) 3054 if (num_devices == 1)
3043 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3055 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -3412,7 +3424,7 @@ again:
3412 * our reservation. 3424 * our reservation.
3413 */ 3425 */
3414 if (unused <= space_info->total_bytes) { 3426 if (unused <= space_info->total_bytes) {
3415 unused -= space_info->total_bytes; 3427 unused = space_info->total_bytes - unused;
3416 if (unused >= num_bytes) { 3428 if (unused >= num_bytes) {
3417 if (!reserved) 3429 if (!reserved)
3418 space_info->bytes_reserved += orig_bytes; 3430 space_info->bytes_reserved += orig_bytes;
@@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4080 * space back to the block group, otherwise we will leak space. 4092 * space back to the block group, otherwise we will leak space.
4081 */ 4093 */
4082 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4094 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4083 cache_block_group(cache, trans, 1); 4095 cache_block_group(cache, trans, NULL, 1);
4084 4096
4085 byte_in_group = bytenr - cache->key.objectid; 4097 byte_in_group = bytenr - cache->key.objectid;
4086 WARN_ON(byte_in_group > cache->key.offset); 4098 WARN_ON(byte_in_group > cache->key.offset);
@@ -4930,11 +4942,31 @@ search:
4930 btrfs_get_block_group(block_group); 4942 btrfs_get_block_group(block_group);
4931 search_start = block_group->key.objectid; 4943 search_start = block_group->key.objectid;
4932 4944
4945 /*
4946 * this can happen if we end up cycling through all the
4947 * raid types, but we want to make sure we only allocate
4948 * for the proper type.
4949 */
4950 if (!block_group_bits(block_group, data)) {
4951 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4952 BTRFS_BLOCK_GROUP_RAID1 |
4953 BTRFS_BLOCK_GROUP_RAID10;
4954
4955 /*
4956 * if they asked for extra copies and this block group
4957 * doesn't provide them, bail. This does allow us to
4958 * fill raid0 from raid1.
4959 */
4960 if ((data & extra) && !(block_group->flags & extra))
4961 goto loop;
4962 }
4963
4933have_block_group: 4964have_block_group:
4934 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4965 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4935 u64 free_percent; 4966 u64 free_percent;
4936 4967
4937 ret = cache_block_group(block_group, trans, 1); 4968 ret = cache_block_group(block_group, trans,
4969 orig_root, 1);
4938 if (block_group->cached == BTRFS_CACHE_FINISHED) 4970 if (block_group->cached == BTRFS_CACHE_FINISHED)
4939 goto have_block_group; 4971 goto have_block_group;
4940 4972
@@ -4958,7 +4990,8 @@ have_block_group:
4958 if (loop > LOOP_CACHING_NOWAIT || 4990 if (loop > LOOP_CACHING_NOWAIT ||
4959 (loop > LOOP_FIND_IDEAL && 4991 (loop > LOOP_FIND_IDEAL &&
4960 atomic_read(&space_info->caching_threads) < 2)) { 4992 atomic_read(&space_info->caching_threads) < 2)) {
4961 ret = cache_block_group(block_group, trans, 0); 4993 ret = cache_block_group(block_group, trans,
4994 orig_root, 0);
4962 BUG_ON(ret); 4995 BUG_ON(ret);
4963 } 4996 }
4964 found_uncached_bg = true; 4997 found_uncached_bg = true;
@@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5515 u64 num_bytes = ins->offset; 5548 u64 num_bytes = ins->offset;
5516 5549
5517 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5518 cache_block_group(block_group, trans, 0); 5551 cache_block_group(block_group, trans, NULL, 0);
5519 caching_ctl = get_caching_control(block_group); 5552 caching_ctl = get_caching_control(block_group);
5520 5553
5521 if (!caching_ctl) { 5554 if (!caching_ctl) {
@@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6300 NULL, NULL); 6333 NULL, NULL);
6301 BUG_ON(ret < 0); 6334 BUG_ON(ret < 0);
6302 if (ret > 0) { 6335 if (ret > 0) {
6303 ret = btrfs_del_orphan_item(trans, tree_root, 6336 /* if we fail to delete the orphan item this time
6304 root->root_key.objectid); 6337 * around, it'll get picked up the next time.
6305 BUG_ON(ret); 6338 *
6339 * The most common failure here is just -ENOENT.
6340 */
6341 btrfs_del_orphan_item(trans, tree_root,
6342 root->root_key.objectid);
6306 } 6343 }
6307 } 6344 }
6308 6345
@@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7878 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7915 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7879 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7916 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7880 7917
7881 num_devices = root->fs_info->fs_devices->rw_devices; 7918 /*
7919 * we add in the count of missing devices because we want
7920 * to make sure that any RAID levels on a degraded FS
7921 * continue to be honored.
7922 */
7923 num_devices = root->fs_info->fs_devices->rw_devices +
7924 root->fs_info->fs_devices->missing_devices;
7925
7882 if (num_devices == 1) { 7926 if (num_devices == 1) {
7883 stripped |= BTRFS_BLOCK_GROUP_DUP; 7927 stripped |= BTRFS_BLOCK_GROUP_DUP;
7884 stripped = flags & ~stripped; 7928 stripped = flags & ~stripped;
@@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8247 break; 8291 break;
8248 if (ret != 0) 8292 if (ret != 0)
8249 goto error; 8293 goto error;
8250
8251 leaf = path->nodes[0]; 8294 leaf = path->nodes[0];
8252 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8253 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8296 cache = kzalloc(sizeof(*cache), GFP_NOFS);
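
Among the extent-tree.c changes, note the one-line reservation fix: the old code computed unused -= space_info->total_bytes inside a branch that had just established unused <= total_bytes, so the unsigned 64-bit subtraction wrapped to an enormous value and the following unused >= num_bytes test could pass when little or no space was actually available; the new code computes total_bytes - unused instead. A standalone sketch of the wraparound, with illustrative numbers rather than the kernel's real accounting:

    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
            uint64_t total = 1000, accounted = 600, num_bytes = 100000;

            uint64_t wrong = accounted - total;     /* wraps to about 2^64 - 400 */
            uint64_t right = total - accounted;     /* 400 bytes genuinely left */

            printf("wrong=%" PRIu64 " right=%" PRIu64 "\n", wrong, right);
            printf("wrong passes check: %d, right passes check: %d\n",
                   wrong >= num_bytes, right >= num_bytes);
            return 0;
    }
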
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eac10e3260a9..3e86b9f36507 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1828 bio_put(bio); 1828 bio_put(bio);
1829} 1829}
1830 1830
1831static struct bio * 1831struct bio *
1832extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1832btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1833 gfp_t gfp_flags) 1833 gfp_t gfp_flags)
1834{ 1834{
1835 struct bio *bio; 1835 struct bio *bio;
1836 1836
@@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1919 else 1919 else
1920 nr = bio_get_nr_vecs(bdev); 1920 nr = bio_get_nr_vecs(bdev);
1921 1921
1922 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1923 1923
1924 bio_add_page(bio, page, page_size, offset); 1924 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1925 bio->bi_end_io = end_io_func;
@@ -2901,21 +2901,53 @@ out:
2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2901int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2902 __u64 start, __u64 len, get_extent_t *get_extent) 2902 __u64 start, __u64 len, get_extent_t *get_extent)
2903{ 2903{
2904 int ret; 2904 int ret = 0;
2905 u64 off = start; 2905 u64 off = start;
2906 u64 max = start + len; 2906 u64 max = start + len;
2907 u32 flags = 0; 2907 u32 flags = 0;
2908 u32 found_type;
2909 u64 last;
2908 u64 disko = 0; 2910 u64 disko = 0;
2911 struct btrfs_key found_key;
2909 struct extent_map *em = NULL; 2912 struct extent_map *em = NULL;
2910 struct extent_state *cached_state = NULL; 2913 struct extent_state *cached_state = NULL;
2914 struct btrfs_path *path;
2915 struct btrfs_file_extent_item *item;
2911 int end = 0; 2916 int end = 0;
2912 u64 em_start = 0, em_len = 0; 2917 u64 em_start = 0, em_len = 0;
2913 unsigned long emflags; 2918 unsigned long emflags;
2914 ret = 0; 2919 int hole = 0;
2915 2920
2916 if (len == 0) 2921 if (len == 0)
2917 return -EINVAL; 2922 return -EINVAL;
2918 2923
2924 path = btrfs_alloc_path();
2925 if (!path)
2926 return -ENOMEM;
2927 path->leave_spinning = 1;
2928
2929 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2930 path, inode->i_ino, -1, 0);
2931 if (ret < 0) {
2932 btrfs_free_path(path);
2933 return ret;
2934 }
2935 WARN_ON(!ret);
2936 path->slots[0]--;
2937 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2938 struct btrfs_file_extent_item);
2939 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2940 found_type = btrfs_key_type(&found_key);
2941
2942 /* No extents, just return */
2943 if (found_key.objectid != inode->i_ino ||
2944 found_type != BTRFS_EXTENT_DATA_KEY) {
2945 btrfs_free_path(path);
2946 return 0;
2947 }
2948 last = found_key.offset;
2949 btrfs_free_path(path);
2950
2919 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2951 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2920 &cached_state, GFP_NOFS); 2952 &cached_state, GFP_NOFS);
2921 em = get_extent(inode, NULL, 0, off, max - off, 0); 2953 em = get_extent(inode, NULL, 0, off, max - off, 0);
@@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2925 ret = PTR_ERR(em); 2957 ret = PTR_ERR(em);
2926 goto out; 2958 goto out;
2927 } 2959 }
2960
2928 while (!end) { 2961 while (!end) {
2962 hole = 0;
2929 off = em->start + em->len; 2963 off = em->start + em->len;
2930 if (off >= max) 2964 if (off >= max)
2931 end = 1; 2965 end = 1;
2932 2966
2967 if (em->block_start == EXTENT_MAP_HOLE) {
2968 hole = 1;
2969 goto next;
2970 }
2971
2933 em_start = em->start; 2972 em_start = em->start;
2934 em_len = em->len; 2973 em_len = em->len;
2935 2974
@@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2939 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2978 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2940 end = 1; 2979 end = 1;
2941 flags |= FIEMAP_EXTENT_LAST; 2980 flags |= FIEMAP_EXTENT_LAST;
2942 } else if (em->block_start == EXTENT_MAP_HOLE) {
2943 flags |= FIEMAP_EXTENT_UNWRITTEN;
2944 } else if (em->block_start == EXTENT_MAP_INLINE) { 2981 } else if (em->block_start == EXTENT_MAP_INLINE) {
2945 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2982 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2946 FIEMAP_EXTENT_NOT_ALIGNED); 2983 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2990 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2954 flags |= FIEMAP_EXTENT_ENCODED; 2991 flags |= FIEMAP_EXTENT_ENCODED;
2955 2992
2993next:
2956 emflags = em->flags; 2994 emflags = em->flags;
2957 free_extent_map(em); 2995 free_extent_map(em);
2958 em = NULL; 2996 em = NULL;
2959
2960 if (!end) { 2997 if (!end) {
2961 em = get_extent(inode, NULL, 0, off, max - off, 0); 2998 em = get_extent(inode, NULL, 0, off, max - off, 0);
2962 if (!em) 2999 if (!em)
@@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2967 } 3004 }
2968 emflags = em->flags; 3005 emflags = em->flags;
2969 } 3006 }
3007
2970 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3008 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2971 flags |= FIEMAP_EXTENT_LAST; 3009 flags |= FIEMAP_EXTENT_LAST;
2972 end = 1; 3010 end = 1;
2973 } 3011 }
2974 3012
2975 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3013 if (em_start == last) {
2976 em_len, flags); 3014 flags |= FIEMAP_EXTENT_LAST;
2977 if (ret) 3015 end = 1;
2978 goto out_free; 3016 }
3017
3018 if (!hole) {
3019 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3020 em_len, flags);
3021 if (ret)
3022 goto out_free;
3023 }
2979 } 3024 }
2980out_free: 3025out_free:
2981 free_extent_map(em); 3026 free_extent_map(em);
@@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3836 3881
3837 spin_lock(&tree->buffer_lock); 3882 spin_lock(&tree->buffer_lock);
3838 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3883 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3839 if (!eb) 3884 if (!eb) {
3840 goto out; 3885 spin_unlock(&tree->buffer_lock);
3886 return ret;
3887 }
3841 3888
3842 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3889 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3843 ret = 0; 3890 ret = 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1c6d4f342ef7..4183c8178f01 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 310 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 311 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 312 unsigned long op);
313struct bio *
314btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
315 gfp_t gfp_flags);
313#endif 316#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..66836d85763b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 struct iov_iter *i) 49 struct iov_iter *i)
50{ 50{
51 size_t copied; 51 size_t copied = 0;
52 int pg = 0; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 int total_copied = 0;
54 55
55 while (write_bytes > 0) { 56 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 57 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 58 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 59 struct page *page = prepared_pages[pg];
59again: 60 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 61 * Copy data from userspace to the current page
61 return -EFAULT; 62 *
62 63 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 64 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 65 */
66 pagefault_disable();
67 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
68 pagefault_enable();
65 69
66 /* Flush processor's dcache for this page */ 70 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 71 flush_dcache_page(page);
68 iov_iter_advance(i, copied); 72 iov_iter_advance(i, copied);
69 write_bytes -= copied; 73 write_bytes -= copied;
74 total_copied += copied;
70 75
76 /* Return to btrfs_file_aio_write to fault page */
71 if (unlikely(copied == 0)) { 77 if (unlikely(copied == 0)) {
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 78 break;
73 iov_iter_single_seg_count(i));
74 goto again;
75 } 79 }
76 80
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 81 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
@@ -81,7 +85,7 @@ again:
81 offset = 0; 85 offset = 0;
82 } 86 }
83 } 87 }
84 return 0; 88 return total_copied;
85} 89}
86 90
87/* 91/*
@@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
854 unsigned long last_index; 858 unsigned long last_index;
855 int will_write; 859 int will_write;
856 int buffered = 0; 860 int buffered = 0;
861 int copied = 0;
862 int dirty_pages = 0;
857 863
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 864 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
859 (file->f_flags & O_DIRECT)); 865 (file->f_flags & O_DIRECT));
@@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
970 WARN_ON(num_pages > nrptrs); 976 WARN_ON(num_pages > nrptrs);
971 memset(pages, 0, sizeof(struct page *) * nrptrs); 977 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 978
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 979 /*
980 * Fault pages before locking them in prepare_pages
981 * to avoid recursive lock
982 */
983 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
984 ret = -EFAULT;
985 goto out;
986 }
987
988 ret = btrfs_delalloc_reserve_space(inode,
989 num_pages << PAGE_CACHE_SHIFT);
974 if (ret) 990 if (ret)
975 goto out; 991 goto out;
976 992
@@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
978 pos, first_index, last_index, 994 pos, first_index, last_index,
979 write_bytes); 995 write_bytes);
980 if (ret) { 996 if (ret) {
981 btrfs_delalloc_release_space(inode, write_bytes); 997 btrfs_delalloc_release_space(inode,
998 num_pages << PAGE_CACHE_SHIFT);
982 goto out; 999 goto out;
983 } 1000 }
984 1001
985 ret = btrfs_copy_from_user(pos, num_pages, 1002 copied = btrfs_copy_from_user(pos, num_pages,
986 write_bytes, pages, &i); 1003 write_bytes, pages, &i);
987 if (ret == 0) { 1004 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
1005 PAGE_CACHE_SHIFT;
1006
1007 if (num_pages > dirty_pages) {
1008 if (copied > 0)
1009 atomic_inc(
1010 &BTRFS_I(inode)->outstanding_extents);
1011 btrfs_delalloc_release_space(inode,
1012 (num_pages - dirty_pages) <<
1013 PAGE_CACHE_SHIFT);
1014 }
1015
1016 if (copied > 0) {
988 dirty_and_release_pages(NULL, root, file, pages, 1017 dirty_and_release_pages(NULL, root, file, pages,
989 num_pages, pos, write_bytes); 1018 dirty_pages, pos, copied);
990 } 1019 }
991 1020
992 btrfs_drop_pages(pages, num_pages); 1021 btrfs_drop_pages(pages, num_pages);
993 if (ret) {
994 btrfs_delalloc_release_space(inode, write_bytes);
995 goto out;
996 }
997 1022
998 if (will_write) { 1023 if (copied > 0) {
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1024 if (will_write) {
1000 pos + write_bytes - 1); 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1001 } else { 1026 pos + copied - 1);
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1027 } else {
1003 num_pages); 1028 balance_dirty_pages_ratelimited_nr(
1004 if (num_pages < 1029 inode->i_mapping,
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1030 dirty_pages);
1006 btrfs_btree_balance_dirty(root, 1); 1031 if (dirty_pages <
1007 btrfs_throttle(root); 1032 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1033 btrfs_btree_balance_dirty(root, 1);
1034 btrfs_throttle(root);
1035 }
1008 } 1036 }
1009 1037
1010 pos += write_bytes; 1038 pos += copied;
1011 num_written += write_bytes; 1039 num_written += copied;
1012 1040
1013 cond_resched(); 1041 cond_resched();
1014 } 1042 }
@@ -1047,8 +1075,14 @@ out:
1047 1075
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1076 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0); 1077 trans = btrfs_start_transaction(root, 0);
1078 if (IS_ERR(trans)) {
1079 num_written = PTR_ERR(trans);
1080 goto done;
1081 }
1082 mutex_lock(&inode->i_mutex);
1050 ret = btrfs_log_dentry_safe(trans, root, 1083 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry); 1084 file->f_dentry);
1085 mutex_unlock(&inode->i_mutex);
1052 if (ret == 0) { 1086 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root); 1087 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0) 1088 if (ret == 0)
@@ -1067,6 +1101,7 @@ out:
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1101 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 } 1102 }
1069 } 1103 }
1104done:
1070 current->backing_dev_info = NULL; 1105 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1106 return num_written ? num_written : err;
1072} 1107}
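
The btrfs_copy_from_user() rework above moves the fault-in step out of the copy loop: btrfs_file_aio_write() now calls iov_iter_fault_in_readable() before prepare_pages() locks the destination pages, and the copy itself runs with page faults disabled via iov_iter_copy_from_user_atomic(), returning a short count instead of retrying in place. That avoids recursing into the filesystem on a fault while the target pages are already locked. A trimmed sketch of the copy side of that pattern, using only the helpers visible in the hunks (error handling and delalloc accounting omitted):

    #include <linux/fs.h>
    #include <linux/pagemap.h>
    #include <linux/highmem.h>
    #include <linux/uaccess.h>

    /* Illustrative only: copy into a page that the caller has already locked. */
    static size_t copy_into_locked_page(struct page *page, struct iov_iter *i,
                                        unsigned long offset, size_t count)
    {
            size_t copied;

            /*
             * A fault here could recurse into the filesystem and block on the
             * locked page, so copy atomically; the caller is expected to have
             * faulted the source pages in before taking any page locks.
             */
            pagefault_disable();
            copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
            pagefault_enable();

            flush_dcache_page(page);
            iov_iter_advance(i, copied);

            return copied;  /* may be short; the caller drops locks and retries */
    }
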
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 22ee0dc2e6b8..60d684266959 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
290 (unsigned long long)BTRFS_I(inode)->generation, 290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation, 291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid); 292 (unsigned long long)block_group->key.objectid);
293 goto out; 293 goto free_cache;
294 } 294 }
295 295
296 if (!num_entries) 296 if (!num_entries)
@@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
524 return 0; 524 return 0;
525 } 525 }
526 526
527 node = rb_first(&block_group->free_space_offset);
528 if (!node) {
529 iput(inode);
530 return 0;
531 }
532
527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
528 filemap_write_and_wait(inode->i_mapping); 534 filemap_write_and_wait(inode->i_mapping);
529 btrfs_wait_ordered_range(inode, inode->i_size & 535 btrfs_wait_ordered_range(inode, inode->i_size &
@@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
543 */ 549 */
544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
545 551
546 node = rb_first(&block_group->free_space_offset);
547 if (!node)
548 goto out_free;
549
550 /* 552 /*
551 * Lock all pages first so we can lock the extent safely. 553 * Lock all pages first so we can lock the extent safely.
552 * 554 *
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 558cac2dfa54..a0ff46a47895 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -495,7 +495,7 @@ again:
495 add_async_extent(async_cow, start, num_bytes, 495 add_async_extent(async_cow, start, num_bytes,
496 total_compressed, pages, nr_pages_ret); 496 total_compressed, pages, nr_pages_ret);
497 497
498 if (start + num_bytes < end && start + num_bytes < actual_end) { 498 if (start + num_bytes < end) {
499 start += num_bytes; 499 start += num_bytes;
500 pages = NULL; 500 pages = NULL;
501 cond_resched(); 501 cond_resched();
@@ -4084,7 +4084,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4084 int index; 4084 int index;
4085 int ret; 4085 int ret;
4086 4086
4087 dentry->d_op = &btrfs_dentry_operations; 4087 d_set_d_op(dentry, &btrfs_dentry_operations);
4088 4088
4089 if (dentry->d_name.len > BTRFS_NAME_LEN) 4089 if (dentry->d_name.len > BTRFS_NAME_LEN)
4090 return ERR_PTR(-ENAMETOOLONG); 4090 return ERR_PTR(-ENAMETOOLONG);
@@ -4127,7 +4127,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4127 return inode; 4127 return inode;
4128} 4128}
4129 4129
4130static int btrfs_dentry_delete(struct dentry *dentry) 4130static int btrfs_dentry_delete(const struct dentry *dentry)
4131{ 4131{
4132 struct btrfs_root *root; 4132 struct btrfs_root *root;
4133 4133
@@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4501 BTRFS_I(inode)->index_cnt = 2; 4501 BTRFS_I(inode)->index_cnt = 2;
4502 BTRFS_I(inode)->root = root; 4502 BTRFS_I(inode)->root = root;
4503 BTRFS_I(inode)->generation = trans->transid; 4503 BTRFS_I(inode)->generation = trans->transid;
4504 inode->i_generation = BTRFS_I(inode)->generation;
4504 btrfs_set_inode_space_info(root, inode); 4505 btrfs_set_inode_space_info(root, inode);
4505 4506
4506 if (mode & S_IFDIR) 4507 if (mode & S_IFDIR)
@@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4622} 4623}
4623 4624
4624static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4625static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4625 struct dentry *dentry, struct inode *inode, 4626 struct inode *dir, struct dentry *dentry,
4626 int backref, u64 index) 4627 struct inode *inode, int backref, u64 index)
4627{ 4628{
4628 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4629 int err = btrfs_add_link(trans, dir, inode,
4629 inode, dentry->d_name.name, 4630 dentry->d_name.name, dentry->d_name.len,
4630 dentry->d_name.len, backref, index); 4631 backref, index);
4631 if (!err) { 4632 if (!err) {
4632 d_instantiate(dentry, inode); 4633 d_instantiate(dentry, inode);
4633 return 0; 4634 return 0;
@@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4668 btrfs_set_trans_block_group(trans, dir); 4669 btrfs_set_trans_block_group(trans, dir);
4669 4670
4670 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4671 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4671 dentry->d_name.len, 4672 dentry->d_name.len, dir->i_ino, objectid,
4672 dentry->d_parent->d_inode->i_ino, objectid,
4673 BTRFS_I(dir)->block_group, mode, &index); 4673 BTRFS_I(dir)->block_group, mode, &index);
4674 err = PTR_ERR(inode); 4674 err = PTR_ERR(inode);
4675 if (IS_ERR(inode)) 4675 if (IS_ERR(inode))
@@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4682 } 4682 }
4683 4683
4684 btrfs_set_trans_block_group(trans, inode); 4684 btrfs_set_trans_block_group(trans, inode);
4685 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4685 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4686 if (err) 4686 if (err)
4687 drop_inode = 1; 4687 drop_inode = 1;
4688 else { 4688 else {
@@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4730 btrfs_set_trans_block_group(trans, dir); 4730 btrfs_set_trans_block_group(trans, dir);
4731 4731
4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4732 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4733 dentry->d_name.len, 4733 dentry->d_name.len, dir->i_ino, objectid,
4734 dentry->d_parent->d_inode->i_ino, 4734 BTRFS_I(dir)->block_group, mode, &index);
4735 objectid, BTRFS_I(dir)->block_group, mode,
4736 &index);
4737 err = PTR_ERR(inode); 4735 err = PTR_ERR(inode);
4738 if (IS_ERR(inode)) 4736 if (IS_ERR(inode))
4739 goto out_unlock; 4737 goto out_unlock;
@@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4745 } 4743 }
4746 4744
4747 btrfs_set_trans_block_group(trans, inode); 4745 btrfs_set_trans_block_group(trans, inode);
4748 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4746 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4749 if (err) 4747 if (err)
4750 drop_inode = 1; 4748 drop_inode = 1;
4751 else { 4749 else {
@@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4787 return -EPERM; 4785 return -EPERM;
4788 4786
4789 btrfs_inc_nlink(inode); 4787 btrfs_inc_nlink(inode);
4788 inode->i_ctime = CURRENT_TIME;
4790 4789
4791 err = btrfs_set_inode_index(dir, &index); 4790 err = btrfs_set_inode_index(dir, &index);
4792 if (err) 4791 if (err)
@@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4805 btrfs_set_trans_block_group(trans, dir); 4804 btrfs_set_trans_block_group(trans, dir);
4806 ihold(inode); 4805 ihold(inode);
4807 4806
4808 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4807 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4809 4808
4810 if (err) { 4809 if (err) {
4811 drop_inode = 1; 4810 drop_inode = 1;
4812 } else { 4811 } else {
4812 struct dentry *parent = dget_parent(dentry);
4813 btrfs_update_inode_block_group(trans, dir); 4813 btrfs_update_inode_block_group(trans, dir);
4814 err = btrfs_update_inode(trans, root, inode); 4814 err = btrfs_update_inode(trans, root, inode);
4815 BUG_ON(err); 4815 BUG_ON(err);
4816 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4816 btrfs_log_new_name(trans, inode, NULL, parent);
4817 dput(parent);
4817 } 4818 }
4818 4819
4819 nr = trans->blocks_used; 4820 nr = trans->blocks_used;
@@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4853 btrfs_set_trans_block_group(trans, dir); 4854 btrfs_set_trans_block_group(trans, dir);
4854 4855
4855 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4856 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4856 dentry->d_name.len, 4857 dentry->d_name.len, dir->i_ino, objectid,
4857 dentry->d_parent->d_inode->i_ino, objectid,
4858 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4858 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4859 &index); 4859 &index);
4860 if (IS_ERR(inode)) { 4860 if (IS_ERR(inode)) {
@@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4877 if (err) 4877 if (err)
4878 goto out_fail; 4878 goto out_fail;
4879 4879
4880 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4880 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4881 inode, dentry->d_name.name, 4881 dentry->d_name.len, 0, index);
4882 dentry->d_name.len, 0, index);
4883 if (err) 4882 if (err)
4884 goto out_fail; 4883 goto out_fail;
4885 4884
@@ -5535,13 +5534,21 @@ struct btrfs_dio_private {
5535 u64 bytes; 5534 u64 bytes;
5536 u32 *csums; 5535 u32 *csums;
5537 void *private; 5536 void *private;
5537
5538 /* number of bios pending for this dio */
5539 atomic_t pending_bios;
5540
5541 /* IO errors */
5542 int errors;
5543
5544 struct bio *orig_bio;
5538}; 5545};
5539 5546
5540static void btrfs_endio_direct_read(struct bio *bio, int err) 5547static void btrfs_endio_direct_read(struct bio *bio, int err)
5541{ 5548{
5549 struct btrfs_dio_private *dip = bio->bi_private;
5542 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5550 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5543 struct bio_vec *bvec = bio->bi_io_vec; 5551 struct bio_vec *bvec = bio->bi_io_vec;
5544 struct btrfs_dio_private *dip = bio->bi_private;
5545 struct inode *inode = dip->inode; 5552 struct inode *inode = dip->inode;
5546 struct btrfs_root *root = BTRFS_I(inode)->root; 5553 struct btrfs_root *root = BTRFS_I(inode)->root;
5547 u64 start; 5554 u64 start;
@@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5595 struct btrfs_trans_handle *trans; 5602 struct btrfs_trans_handle *trans;
5596 struct btrfs_ordered_extent *ordered = NULL; 5603 struct btrfs_ordered_extent *ordered = NULL;
5597 struct extent_state *cached_state = NULL; 5604 struct extent_state *cached_state = NULL;
5605 u64 ordered_offset = dip->logical_offset;
5606 u64 ordered_bytes = dip->bytes;
5598 int ret; 5607 int ret;
5599 5608
5600 if (err) 5609 if (err)
5601 goto out_done; 5610 goto out_done;
5602 5611again:
5603 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5612 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5604 dip->logical_offset, dip->bytes); 5613 &ordered_offset,
5614 ordered_bytes);
5605 if (!ret) 5615 if (!ret)
5606 goto out_done; 5616 goto out_test;
5607 5617
5608 BUG_ON(!ordered); 5618 BUG_ON(!ordered);
5609 5619
@@ -5663,8 +5673,20 @@ out_unlock:
5663out: 5673out:
5664 btrfs_delalloc_release_metadata(inode, ordered->len); 5674 btrfs_delalloc_release_metadata(inode, ordered->len);
5665 btrfs_end_transaction(trans, root); 5675 btrfs_end_transaction(trans, root);
5676 ordered_offset = ordered->file_offset + ordered->len;
5666 btrfs_put_ordered_extent(ordered); 5677 btrfs_put_ordered_extent(ordered);
5667 btrfs_put_ordered_extent(ordered); 5678 btrfs_put_ordered_extent(ordered);
5679
5680out_test:
5681 /*
5682 * our bio might span multiple ordered extents. If we haven't
5683 * completed the accounting for the whole dio, go back and try again
5684 */
5685 if (ordered_offset < dip->logical_offset + dip->bytes) {
5686 ordered_bytes = dip->logical_offset + dip->bytes -
5687 ordered_offset;
5688 goto again;
5689 }
5668out_done: 5690out_done:
5669 bio->bi_private = dip->private; 5691 bio->bi_private = dip->private;
5670 5692
@@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5684 return 0; 5706 return 0;
5685} 5707}
5686 5708
5709static void btrfs_end_dio_bio(struct bio *bio, int err)
5710{
5711 struct btrfs_dio_private *dip = bio->bi_private;
5712
5713 if (err) {
5714 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
5715 "sector %#Lx len %u err no %d\n",
5716 dip->inode->i_ino, bio->bi_rw,
5717 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5718 dip->errors = 1;
5719
5720 /*
5721 * before the atomic variable goes to zero, we must make sure
5722 * dip->errors is perceived to be set.
5723 */
5724 smp_mb__before_atomic_dec();
5725 }
5726
5727 /* if there are more bios still pending for this dio, just exit */
5728 if (!atomic_dec_and_test(&dip->pending_bios))
5729 goto out;
5730
5731 if (dip->errors)
5732 bio_io_error(dip->orig_bio);
5733 else {
5734 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5735 bio_endio(dip->orig_bio, 0);
5736 }
5737out:
5738 bio_put(bio);
5739}
5740
5741static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5742 u64 first_sector, gfp_t gfp_flags)
5743{
5744 int nr_vecs = bio_get_nr_vecs(bdev);
5745 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5746}
5747
5748static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5749 int rw, u64 file_offset, int skip_sum,
5750 u32 *csums)
5751{
5752 int write = rw & REQ_WRITE;
5753 struct btrfs_root *root = BTRFS_I(inode)->root;
5754 int ret;
5755
5756 bio_get(bio);
5757 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5758 if (ret)
5759 goto err;
5760
5761 if (write && !skip_sum) {
5762 ret = btrfs_wq_submit_bio(root->fs_info,
5763 inode, rw, bio, 0, 0,
5764 file_offset,
5765 __btrfs_submit_bio_start_direct_io,
5766 __btrfs_submit_bio_done);
5767 goto err;
5768 } else if (!skip_sum)
5769 btrfs_lookup_bio_sums_dio(root, inode, bio,
5770 file_offset, csums);
5771
5772 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5773err:
5774 bio_put(bio);
5775 return ret;
5776}
5777
5778static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5779 int skip_sum)
5780{
5781 struct inode *inode = dip->inode;
5782 struct btrfs_root *root = BTRFS_I(inode)->root;
5783 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5784 struct bio *bio;
5785 struct bio *orig_bio = dip->orig_bio;
5786 struct bio_vec *bvec = orig_bio->bi_io_vec;
5787 u64 start_sector = orig_bio->bi_sector;
5788 u64 file_offset = dip->logical_offset;
5789 u64 submit_len = 0;
5790 u64 map_length;
5791 int nr_pages = 0;
5792 u32 *csums = dip->csums;
5793 int ret = 0;
5794
5795 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5796 if (!bio)
5797 return -ENOMEM;
5798 bio->bi_private = dip;
5799 bio->bi_end_io = btrfs_end_dio_bio;
5800 atomic_inc(&dip->pending_bios);
5801
5802 map_length = orig_bio->bi_size;
5803 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5804 &map_length, NULL, 0);
5805 if (ret) {
5806 bio_put(bio);
5807 return -EIO;
5808 }
5809
5810 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5811 if (unlikely(map_length < submit_len + bvec->bv_len ||
5812 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5813 bvec->bv_offset) < bvec->bv_len)) {
5814 /*
5815 * inc the count before we submit the bio so the
5816 * end IO handler can't bring it to zero and free
5817 * the dip while we're still splitting up and
5818 * submitting the remaining bios
5819 */
5820 atomic_inc(&dip->pending_bios);
5821 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5822 file_offset, skip_sum,
5823 csums);
5824 if (ret) {
5825 bio_put(bio);
5826 atomic_dec(&dip->pending_bios);
5827 goto out_err;
5828 }
5829
5830 if (!skip_sum)
5831 csums = csums + nr_pages;
5832 start_sector += submit_len >> 9;
5833 file_offset += submit_len;
5834
5835 submit_len = 0;
5836 nr_pages = 0;
5837
5838 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5839 start_sector, GFP_NOFS);
5840 if (!bio)
5841 goto out_err;
5842 bio->bi_private = dip;
5843 bio->bi_end_io = btrfs_end_dio_bio;
5844
5845 map_length = orig_bio->bi_size;
5846 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5847 &map_length, NULL, 0);
5848 if (ret) {
5849 bio_put(bio);
5850 goto out_err;
5851 }
5852 } else {
5853 submit_len += bvec->bv_len;
5854 nr_pages ++;
5855 bvec++;
5856 }
5857 }
5858
5859 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5860 csums);
5861 if (!ret)
5862 return 0;
5863
5864 bio_put(bio);
5865out_err:
5866 dip->errors = 1;
5867 /*
5868 * before the atomic variable goes to zero, we must
5869 * make sure dip->errors is perceived to be set.
5870 */
5871 smp_mb__before_atomic_dec();
5872 if (atomic_dec_and_test(&dip->pending_bios))
5873 bio_io_error(dip->orig_bio);
5874
5875 /* bio_end_io() will handle error, so we needn't return it */
5876 return 0;
5877}
5878
5687static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 5879static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5688 loff_t file_offset) 5880 loff_t file_offset)
5689{ 5881{
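
The split/submit helper added above keeps the btrfs_dio_private alive through pending_bios: the count already covers a bio before that bio is submitted, so the end_io callback of an already-completed piece can never drop it to zero while the loop is still building the next bio, and only the final decrement completes (or fails) dip->orig_bio. The same guarantee is often written with an explicit bias reference; the stand-alone C11 sketch below shows that shape (the names and the bias arrangement are illustrative, not the patch's exact bookkeeping):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct work {
        atomic_int pending;   /* plays the role of dip->pending_bios */
        bool error;           /* plays the role of dip->errors */
    };

    static void finish_whole_job(struct work *w) { (void)w; }                /* stand-in finalizer */
    static void submit_part(struct work *w, int i) { (void)w; (void)i; }     /* stand-in async submit */

    /* Runs once per finished piece; the final drop completes the whole job. */
    static void complete_one(struct work *w, bool failed)
    {
        if (failed)
            w->error = true;
        if (atomic_fetch_sub(&w->pending, 1) == 1)
            finish_whole_job(w);
    }

    static void submit_all(struct work *w, int nr_parts)
    {
        atomic_store(&w->pending, 1);            /* bias: hold the job open while splitting */
        for (int i = 0; i < nr_parts; i++) {
            atomic_fetch_add(&w->pending, 1);
            submit_part(w, i);                   /* each part later calls complete_one() */
        }
        complete_one(w, false);                  /* drop the bias; may complete right here */
    }
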
@@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5723 5915
5724 dip->disk_bytenr = (u64)bio->bi_sector << 9; 5916 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5725 bio->bi_private = dip; 5917 bio->bi_private = dip;
5918 dip->errors = 0;
5919 dip->orig_bio = bio;
5920 atomic_set(&dip->pending_bios, 0);
5726 5921
5727 if (write) 5922 if (write)
5728 bio->bi_end_io = btrfs_endio_direct_write; 5923 bio->bi_end_io = btrfs_endio_direct_write;
5729 else 5924 else
5730 bio->bi_end_io = btrfs_endio_direct_read; 5925 bio->bi_end_io = btrfs_endio_direct_read;
5731 5926
5732 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 5927 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5733 if (ret) 5928 if (!ret)
5734 goto out_err;
5735
5736 if (write && !skip_sum) {
5737 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5738 inode, rw, bio, 0, 0,
5739 dip->logical_offset,
5740 __btrfs_submit_bio_start_direct_io,
5741 __btrfs_submit_bio_done);
5742 if (ret)
5743 goto out_err;
5744 return; 5929 return;
5745 } else if (!skip_sum)
5746 btrfs_lookup_bio_sums_dio(root, inode, bio,
5747 dip->logical_offset, dip->csums);
5748
5749 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5750 if (ret)
5751 goto out_err;
5752 return;
5753out_err:
5754 kfree(dip->csums);
5755 kfree(dip);
5756free_ordered: 5930free_ordered:
5757 /* 5931 /*
5758 * If this is a write, we need to clean up the reserved space and kill 5932 * If this is a write, we need to clean up the reserved space and kill
@@ -5760,8 +5934,7 @@ free_ordered:
5760 */ 5934 */
5761 if (write) { 5935 if (write) {
5762 struct btrfs_ordered_extent *ordered; 5936 struct btrfs_ordered_extent *ordered;
5763 ordered = btrfs_lookup_ordered_extent(inode, 5937 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5764 dip->logical_offset);
5765 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 5938 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5766 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 5939 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5767 btrfs_free_reserved_extent(root, ordered->start, 5940 btrfs_free_reserved_extent(root, ordered->start,
@@ -6322,6 +6495,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6322 return inode; 6495 return inode;
6323} 6496}
6324 6497
6498static void btrfs_i_callback(struct rcu_head *head)
6499{
6500 struct inode *inode = container_of(head, struct inode, i_rcu);
6501 INIT_LIST_HEAD(&inode->i_dentry);
6502 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6503}
6504
6325void btrfs_destroy_inode(struct inode *inode) 6505void btrfs_destroy_inode(struct inode *inode)
6326{ 6506{
6327 struct btrfs_ordered_extent *ordered; 6507 struct btrfs_ordered_extent *ordered;
@@ -6391,7 +6571,7 @@ void btrfs_destroy_inode(struct inode *inode)
6391 inode_tree_del(inode); 6571 inode_tree_del(inode);
6392 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6572 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6393free: 6573free:
6394 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6574 call_rcu(&inode->i_rcu, btrfs_i_callback);
6395} 6575}
6396 6576
6397int btrfs_drop_inode(struct inode *inode) 6577int btrfs_drop_inode(struct inode *inode)
@@ -6607,8 +6787,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6607 BUG_ON(ret); 6787 BUG_ON(ret);
6608 6788
6609 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 6789 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
6610 btrfs_log_new_name(trans, old_inode, old_dir, 6790 struct dentry *parent = dget_parent(new_dentry);
6611 new_dentry->d_parent); 6791 btrfs_log_new_name(trans, old_inode, old_dir, parent);
6792 dput(parent);
6612 btrfs_end_log_trans(root); 6793 btrfs_end_log_trans(root);
6613 } 6794 }
6614out_fail: 6795out_fail:
@@ -6758,8 +6939,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6758 btrfs_set_trans_block_group(trans, dir); 6939 btrfs_set_trans_block_group(trans, dir);
6759 6940
6760 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6941 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6761 dentry->d_name.len, 6942 dentry->d_name.len, dir->i_ino, objectid,
6762 dentry->d_parent->d_inode->i_ino, objectid,
6763 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 6943 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
6764 &index); 6944 &index);
6765 err = PTR_ERR(inode); 6945 err = PTR_ERR(inode);
@@ -6773,7 +6953,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6773 } 6953 }
6774 6954
6775 btrfs_set_trans_block_group(trans, inode); 6955 btrfs_set_trans_block_group(trans, inode);
6776 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 6956 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6777 if (err) 6957 if (err)
6778 drop_inode = 1; 6958 drop_inode = 1;
6779 else { 6959 else {
@@ -6844,6 +7024,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6844 struct btrfs_root *root = BTRFS_I(inode)->root; 7024 struct btrfs_root *root = BTRFS_I(inode)->root;
6845 struct btrfs_key ins; 7025 struct btrfs_key ins;
6846 u64 cur_offset = start; 7026 u64 cur_offset = start;
7027 u64 i_size;
6847 int ret = 0; 7028 int ret = 0;
6848 bool own_trans = true; 7029 bool own_trans = true;
6849 7030
@@ -6885,11 +7066,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6885 (actual_len > inode->i_size) && 7066 (actual_len > inode->i_size) &&
6886 (cur_offset > inode->i_size)) { 7067 (cur_offset > inode->i_size)) {
6887 if (cur_offset > actual_len) 7068 if (cur_offset > actual_len)
6888 i_size_write(inode, actual_len); 7069 i_size = actual_len;
6889 else 7070 else
6890 i_size_write(inode, cur_offset); 7071 i_size = cur_offset;
6891 i_size_write(inode, cur_offset); 7072 i_size_write(inode, i_size);
6892 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7073 btrfs_ordered_update_i_size(inode, i_size, NULL);
6893 } 7074 }
6894 7075
6895 ret = btrfs_update_inode(trans, root, inode); 7076 ret = btrfs_update_inode(trans, root, inode);
@@ -6943,6 +7124,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
6943 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 7124 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6944 7125
6945 mutex_lock(&inode->i_mutex); 7126 mutex_lock(&inode->i_mutex);
7127 ret = inode_newsize_ok(inode, alloc_end);
7128 if (ret)
7129 goto out;
7130
6946 if (alloc_start > inode->i_size) { 7131 if (alloc_start > inode->i_size) {
6947 ret = btrfs_cont_expand(inode, alloc_start); 7132 ret = btrfs_cont_expand(inode, alloc_start);
6948 if (ret) 7133 if (ret)
@@ -7026,11 +7211,11 @@ static int btrfs_set_page_dirty(struct page *page)
7026 return __set_page_dirty_nobuffers(page); 7211 return __set_page_dirty_nobuffers(page);
7027} 7212}
7028 7213
7029static int btrfs_permission(struct inode *inode, int mask) 7214static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
7030{ 7215{
7031 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7216 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
7032 return -EACCES; 7217 return -EACCES;
7033 return generic_permission(inode, mask, btrfs_check_acl); 7218 return generic_permission(inode, mask, flags, btrfs_check_acl);
7034} 7219}
7035 7220
7036static const struct inode_operations btrfs_dir_inode_operations = { 7221static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7139,6 +7324,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7139 .readlink = generic_readlink, 7324 .readlink = generic_readlink,
7140 .follow_link = page_follow_link_light, 7325 .follow_link = page_follow_link_light,
7141 .put_link = page_put_link, 7326 .put_link = page_put_link,
7327 .getattr = btrfs_getattr,
7142 .permission = btrfs_permission, 7328 .permission = btrfs_permission,
7143 .setxattr = btrfs_setxattr, 7329 .setxattr = btrfs_setxattr,
7144 .getxattr = btrfs_getxattr, 7330 .getxattr = btrfs_getxattr,
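
The inode.c hunks above (and the ioctl.c, transaction.c and tree-log.c hunks below) all make the same change: a bare dentry->d_parent->d_inode dereference becomes a dget_parent()/dput() pair, so the parent stays pinned even if a concurrent rename moves the dentry. A minimal sketch of the pattern; btrfs_do_something() is a placeholder, not a function from the patch:

    static int example_use_parent(struct dentry *dentry)
    {
        struct dentry *parent = dget_parent(dentry);    /* takes a reference on the parent */
        struct inode *dir = parent->d_inode;
        int ret;

        ret = btrfs_do_something(dir, dentry);

        dput(parent);                                   /* drop the reference when done */
        return ret;
    }
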
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 463d91b4dd3a..f87552a1d7ea 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root,
233 struct btrfs_inode_item *inode_item; 233 struct btrfs_inode_item *inode_item;
234 struct extent_buffer *leaf; 234 struct extent_buffer *leaf;
235 struct btrfs_root *new_root; 235 struct btrfs_root *new_root;
236 struct inode *dir = dentry->d_parent->d_inode; 236 struct dentry *parent = dget_parent(dentry);
237 struct inode *dir;
237 int ret; 238 int ret;
238 int err; 239 int err;
239 u64 objectid; 240 u64 objectid;
@@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root,
242 243
243 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 244 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
244 0, &objectid); 245 0, &objectid);
245 if (ret) 246 if (ret) {
247 dput(parent);
246 return ret; 248 return ret;
249 }
250
251 dir = parent->d_inode;
252
247 /* 253 /*
248 * 1 - inode item 254 * 1 - inode item
249 * 2 - refs 255 * 2 - refs
@@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root,
251 * 2 - dir items 257 * 2 - dir items
252 */ 258 */
253 trans = btrfs_start_transaction(root, 6); 259 trans = btrfs_start_transaction(root, 6);
254 if (IS_ERR(trans)) 260 if (IS_ERR(trans)) {
261 dput(parent);
255 return PTR_ERR(trans); 262 return PTR_ERR(trans);
263 }
256 264
257 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 265 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
258 0, objectid, NULL, 0, 0, 0); 266 0, objectid, NULL, 0, 0, 0);
@@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
339 347
340 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 348 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
341fail: 349fail:
350 dput(parent);
342 if (async_transid) { 351 if (async_transid) {
343 *async_transid = trans->transid; 352 *async_transid = trans->transid;
344 err = btrfs_commit_transaction_async(trans, root, 1); 353 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
354 char *name, int namelen, u64 *async_transid) 363 char *name, int namelen, u64 *async_transid)
355{ 364{
356 struct inode *inode; 365 struct inode *inode;
366 struct dentry *parent;
357 struct btrfs_pending_snapshot *pending_snapshot; 367 struct btrfs_pending_snapshot *pending_snapshot;
358 struct btrfs_trans_handle *trans; 368 struct btrfs_trans_handle *trans;
359 int ret; 369 int ret;
@@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
396 406
397 btrfs_orphan_cleanup(pending_snapshot->snap); 407 btrfs_orphan_cleanup(pending_snapshot->snap);
398 408
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 409 parent = dget_parent(dentry);
410 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
411 dput(parent);
400 if (IS_ERR(inode)) { 412 if (IS_ERR(inode)) {
401 ret = PTR_ERR(inode); 413 ret = PTR_ERR(inode);
402 goto fail; 414 goto fail;
@@ -935,23 +947,42 @@ out:
935 947
936static noinline int btrfs_ioctl_snap_create(struct file *file, 948static noinline int btrfs_ioctl_snap_create(struct file *file,
937 void __user *arg, int subvol, 949 void __user *arg, int subvol,
938 int async) 950 int v2)
939{ 951{
940 struct btrfs_ioctl_vol_args *vol_args = NULL; 952 struct btrfs_ioctl_vol_args *vol_args = NULL;
941 struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; 953 struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
942 char *name; 954 char *name;
943 u64 fd; 955 u64 fd;
944 u64 transid = 0;
945 int ret; 956 int ret;
946 957
947 if (async) { 958 if (v2) {
948 async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); 959 u64 transid = 0;
949 if (IS_ERR(async_vol_args)) 960 u64 *ptr = NULL;
950 return PTR_ERR(async_vol_args); 961
962 vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
963 if (IS_ERR(vol_args_v2))
964 return PTR_ERR(vol_args_v2);
965
966 if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
967 ret = -EINVAL;
968 goto out;
969 }
970
971 name = vol_args_v2->name;
972 fd = vol_args_v2->fd;
973 vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
974
975 if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
976 ptr = &transid;
977
978 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
979 subvol, ptr);
951 980
952 name = async_vol_args->name; 981 if (ret == 0 && ptr &&
953 fd = async_vol_args->fd; 982 copy_to_user(arg +
954 async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; 983 offsetof(struct btrfs_ioctl_vol_args_v2,
984 transid), ptr, sizeof(*ptr)))
985 ret = -EFAULT;
955 } else { 986 } else {
956 vol_args = memdup_user(arg, sizeof(*vol_args)); 987 vol_args = memdup_user(arg, sizeof(*vol_args));
957 if (IS_ERR(vol_args)) 988 if (IS_ERR(vol_args))
@@ -959,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
959 name = vol_args->name; 990 name = vol_args->name;
960 fd = vol_args->fd; 991 fd = vol_args->fd;
961 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 992 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
962 }
963
964 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
965 subvol, &transid);
966 993
967 if (!ret && async) { 994 ret = btrfs_ioctl_snap_create_transid(file, name, fd,
968 if (copy_to_user(arg + 995 subvol, NULL);
969 offsetof(struct btrfs_ioctl_async_vol_args,
970 transid), &transid, sizeof(transid)))
971 return -EFAULT;
972 } 996 }
973 997out:
974 kfree(vol_args); 998 kfree(vol_args);
975 kfree(async_vol_args); 999 kfree(vol_args_v2);
976 1000
977 return ret; 1001 return ret;
978} 1002}
@@ -1669,12 +1693,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1669 olen = len = src->i_size - off; 1693 olen = len = src->i_size - off;
1670 /* if we extend to eof, continue to block boundary */ 1694 /* if we extend to eof, continue to block boundary */
1671 if (off + len == src->i_size) 1695 if (off + len == src->i_size)
1672 len = ((src->i_size + bs-1) & ~(bs-1)) 1696 len = ALIGN(src->i_size, bs) - off;
1673 - off;
1674 1697
1675 /* verify the end result is block aligned */ 1698 /* verify the end result is block aligned */
1676 if ((off & (bs-1)) || 1699 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1677 ((off + len) & (bs-1))) 1700 !IS_ALIGNED(destoff, bs))
1678 goto out_unlock; 1701 goto out_unlock;
1679 1702
1680 /* do any pending delalloc/csum calc on src, one way or 1703 /* do any pending delalloc/csum calc on src, one way or
@@ -1874,8 +1897,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1874 * but shouldn't round up the file size 1897 * but shouldn't round up the file size
1875 */ 1898 */
1876 endoff = new_key.offset + datal; 1899 endoff = new_key.offset + datal;
1877 if (endoff > off+olen) 1900 if (endoff > destoff+olen)
1878 endoff = off+olen; 1901 endoff = destoff+olen;
1879 if (endoff > inode->i_size) 1902 if (endoff > inode->i_size)
1880 btrfs_i_size_write(inode, endoff); 1903 btrfs_i_size_write(inode, endoff);
1881 1904
@@ -2235,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int
2235 return btrfs_ioctl_getversion(file, argp); 2258 return btrfs_ioctl_getversion(file, argp);
2236 case BTRFS_IOC_SNAP_CREATE: 2259 case BTRFS_IOC_SNAP_CREATE:
2237 return btrfs_ioctl_snap_create(file, argp, 0, 0); 2260 return btrfs_ioctl_snap_create(file, argp, 0, 0);
2238 case BTRFS_IOC_SNAP_CREATE_ASYNC: 2261 case BTRFS_IOC_SNAP_CREATE_V2:
2239 return btrfs_ioctl_snap_create(file, argp, 0, 1); 2262 return btrfs_ioctl_snap_create(file, argp, 0, 1);
2240 case BTRFS_IOC_SUBVOL_CREATE: 2263 case BTRFS_IOC_SUBVOL_CREATE:
2241 return btrfs_ioctl_snap_create(file, argp, 1, 0); 2264 return btrfs_ioctl_snap_create(file, argp, 1, 0);
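
One note on the clone hunk above: for a power-of-two block size bs, ALIGN() and IS_ALIGNED() expand to the same masks the old open-coded version used, and the rewritten check additionally requires destoff to be block aligned, which the old check did not. Illustration only, assuming bs is a power of two:

    u64 end_old = (src->i_size + bs - 1) & ~(bs - 1);   /* old open-coded rounding */
    u64 end_new = ALIGN(src->i_size, bs);               /* new helper, same value */

    /* IS_ALIGNED(x, bs) is ((x & (bs - 1)) == 0), so the rewritten check keeps
     * the old off and off+len tests and adds the missing destoff alignment test. */
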
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 17c99ebdf960..c344d12c646b 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SNAPSHOT_NAME_MAX 4079 33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34struct btrfs_ioctl_async_vol_args { 34
35#define BTRFS_SUBVOL_NAME_MAX 4039
36struct btrfs_ioctl_vol_args_v2 {
35 __s64 fd; 37 __s64 fd;
36 __u64 transid; 38 __u64 transid;
37 char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; 39 __u64 flags;
40 __u64 unused[4];
41 char name[BTRFS_SUBVOL_NAME_MAX + 1];
38}; 42};
39 43
40#define BTRFS_INO_LOOKUP_PATH_MAX 4080 44#define BTRFS_INO_LOOKUP_PATH_MAX 4080
@@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args {
187 struct btrfs_ioctl_space_args) 191 struct btrfs_ioctl_space_args)
188#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) 192#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
189#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) 193#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
190#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ 194#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
191 struct btrfs_ioctl_async_vol_args) 195 struct btrfs_ioctl_vol_args_v2)
192#endif 196#endif
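
A hedged sketch of how userspace might drive the new BTRFS_IOC_SNAP_CREATE_V2 interface defined above, assuming the same calling convention as the v1 ioctl (issued on an fd for the destination directory, with args.fd naming the source subvolume); the helper name and fds are illustrative:

    #include <linux/types.h>
    #include <string.h>
    #include <sys/ioctl.h>
    /* plus the btrfs ioctl.h shown above for the struct and the ioctl number */

    static int snapshot_async(int destdir_fd, int src_fd, const char *name, __u64 *transid)
    {
        struct btrfs_ioctl_vol_args_v2 args;

        memset(&args, 0, sizeof(args));
        args.fd = src_fd;
        args.flags = BTRFS_SUBVOL_CREATE_ASYNC;         /* ask the kernel to return transid */
        strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);

        if (ioctl(destdir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
            return -1;
        *transid = args.transid;                        /* filled in only for the async case */
        return 0;
    }
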
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f4621f6deca1..ae7737e352c9 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 250
251/* 251/*
252 * this is used to account for finished IO across a given range 252 * this is used to account for finished IO across a given range
253 * of the file. The IO may span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0.
256 *
257 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
258 * to make sure this function only returns 1 once for a given ordered extent.
259 *
260 * file_offset is updated to one byte past the range that is recorded as
261 * complete. This allows you to walk forward in the file.
262 */
263int btrfs_dec_test_first_ordered_pending(struct inode *inode,
264 struct btrfs_ordered_extent **cached,
265 u64 *file_offset, u64 io_size)
266{
267 struct btrfs_ordered_inode_tree *tree;
268 struct rb_node *node;
269 struct btrfs_ordered_extent *entry = NULL;
270 int ret;
271 u64 dec_end;
272 u64 dec_start;
273 u64 to_dec;
274
275 tree = &BTRFS_I(inode)->ordered_tree;
276 spin_lock(&tree->lock);
277 node = tree_search(tree, *file_offset);
278 if (!node) {
279 ret = 1;
280 goto out;
281 }
282
283 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
284 if (!offset_in_entry(entry, *file_offset)) {
285 ret = 1;
286 goto out;
287 }
288
289 dec_start = max(*file_offset, entry->file_offset);
290 dec_end = min(*file_offset + io_size, entry->file_offset +
291 entry->len);
292 *file_offset = dec_end;
293 if (dec_start > dec_end) {
294 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
295 (unsigned long long)dec_start,
296 (unsigned long long)dec_end);
297 }
298 to_dec = dec_end - dec_start;
299 if (to_dec > entry->bytes_left) {
300 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
301 (unsigned long long)entry->bytes_left,
302 (unsigned long long)to_dec);
303 }
304 entry->bytes_left -= to_dec;
305 if (entry->bytes_left == 0)
306 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
307 else
308 ret = 1;
309out:
310 if (!ret && cached && entry) {
311 *cached = entry;
312 atomic_inc(&entry->refs);
313 }
314 spin_unlock(&tree->lock);
315 return ret == 0;
316}
317
318/*
319 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 320 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 321 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 322 * 0.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3f..61dca83119dd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 141int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
144int btrfs_dec_test_first_ordered_pending(struct inode *inode,
145 struct btrfs_ordered_extent **cached,
146 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 147int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 148 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 149int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
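
The orphan.c change relies on btrfs_search_slot()'s return convention: a negative value is an error, 0 means the key was found, and a positive value means the key is absent (the path points at the insertion position). The old code jumped out on any non-zero return, so a missing item leaked a positive value back to the caller; the new code reports it as -ENOENT. In caller form:

    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    if (ret < 0)
        goto out;               /* real error from the search */
    if (ret > 0) {
        ret = -ENOENT;          /* key not in the tree: nothing to delete */
        goto out;
    }
    ret = btrfs_del_item(trans, root, path);
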
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8299a25ffc8f..883c6fa1367e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
244 case Opt_space_cache: 244 case Opt_space_cache:
245 printk(KERN_INFO "btrfs: enabling disk space caching\n"); 245 printk(KERN_INFO "btrfs: enabling disk space caching\n");
246 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 246 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
247 break;
247 case Opt_clear_cache: 248 case Opt_clear_cache:
248 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 249 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
249 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 250 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
@@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
562 563
563static int btrfs_test_super(struct super_block *s, void *data) 564static int btrfs_test_super(struct super_block *s, void *data)
564{ 565{
565 struct btrfs_fs_devices *test_fs_devices = data; 566 struct btrfs_root *test_root = data;
566 struct btrfs_root *root = btrfs_sb(s); 567 struct btrfs_root *root = btrfs_sb(s);
567 568
568 return root->fs_info->fs_devices == test_fs_devices; 569 /*
570 * If this super block is going away, return false as it
571 * can't match as an existing super block.
572 */
573 if (!atomic_read(&s->s_active))
574 return 0;
575 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
576}
577
578static int btrfs_set_super(struct super_block *s, void *data)
579{
580 s->s_fs_info = data;
581
582 return set_anon_super(s, data);
569} 583}
570 584
585
571/* 586/*
572 * Find a superblock for the given device / mount point. 587 * Find a superblock for the given device / mount point.
573 * 588 *
@@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
581 struct super_block *s; 596 struct super_block *s;
582 struct dentry *root; 597 struct dentry *root;
583 struct btrfs_fs_devices *fs_devices = NULL; 598 struct btrfs_fs_devices *fs_devices = NULL;
599 struct btrfs_root *tree_root = NULL;
600 struct btrfs_fs_info *fs_info = NULL;
584 fmode_t mode = FMODE_READ; 601 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL; 602 char *subvol_name = NULL;
586 u64 subvol_objectid = 0; 603 u64 subvol_objectid = 0;
@@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
608 goto error_close_devices; 625 goto error_close_devices;
609 } 626 }
610 627
628 /*
629 * Set up a dummy root and fs_info for the test/set super callbacks. We
630 * don't actually fill these in until open_ctree, but we need them for
631 * searching for existing supers, so this lets us do that; open_ctree
632 * will properly initialize everything later.
633 */
634 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
635 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
636 if (!fs_info || !tree_root) {
637 error = -ENOMEM;
638 goto error_close_devices;
639 }
640 fs_info->tree_root = tree_root;
641 fs_info->fs_devices = fs_devices;
642 tree_root->fs_info = fs_info;
643
611 bdev = fs_devices->latest_bdev; 644 bdev = fs_devices->latest_bdev;
612 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 645 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
613 if (IS_ERR(s)) 646 if (IS_ERR(s))
614 goto error_s; 647 goto error_s;
615 648
@@ -652,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
652 mutex_unlock(&root->d_inode->i_mutex); 685 mutex_unlock(&root->d_inode->i_mutex);
653 686
654 if (IS_ERR(new_root)) { 687 if (IS_ERR(new_root)) {
688 dput(root);
655 deactivate_locked_super(s); 689 deactivate_locked_super(s);
656 error = PTR_ERR(new_root); 690 error = PTR_ERR(new_root);
657 dput(root);
658 goto error_free_subvol_name; 691 goto error_free_subvol_name;
659 } 692 }
660 if (!new_root->d_inode) { 693 if (!new_root->d_inode) {
@@ -675,6 +708,8 @@ error_s:
675 error = PTR_ERR(s); 708 error = PTR_ERR(s);
676error_close_devices: 709error_close_devices:
677 btrfs_close_devices(fs_devices); 710 btrfs_close_devices(fs_devices);
711 kfree(fs_info);
712 kfree(tree_root);
678error_free_subvol_name: 713error_free_subvol_name:
679 kfree(subvol_name); 714 kfree(subvol_name);
680 return ERR_PTR(error); 715 return ERR_PTR(error);
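
For context on why btrfs_set_super() stores s_fs_info: sget() runs the test callback against every existing super block and only calls the set callback when nothing matched, so a second mount of the same fs_devices can find the dummy fs_info through btrfs_test_super() before open_ctree has filled anything else in. Roughly how the callbacks plug together in the hunk above (error handling trimmed):

    s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
    if (IS_ERR(s)) {
        /* allocation or test/set failure */
    } else if (s->s_root) {
        /* matched an existing btrfs super block for these fs_devices */
    } else {
        /* new super block: btrfs_set_super() set s->s_fs_info = tree_root,
         * and open_ctree() will initialize the rest */
    }
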
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1fffbc017bdf..f50e931fc217 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
902 struct btrfs_root *root = pending->root; 902 struct btrfs_root *root = pending->root;
903 struct btrfs_root *parent_root; 903 struct btrfs_root *parent_root;
904 struct inode *parent_inode; 904 struct inode *parent_inode;
905 struct dentry *parent;
905 struct dentry *dentry; 906 struct dentry *dentry;
906 struct extent_buffer *tmp; 907 struct extent_buffer *tmp;
907 struct extent_buffer *old; 908 struct extent_buffer *old;
@@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
941 trans->block_rsv = &pending->block_rsv; 942 trans->block_rsv = &pending->block_rsv;
942 943
943 dentry = pending->dentry; 944 dentry = pending->dentry;
944 parent_inode = dentry->d_parent->d_inode; 945 parent = dget_parent(dentry);
946 parent_inode = parent->d_inode;
945 parent_root = BTRFS_I(parent_inode)->root; 947 parent_root = BTRFS_I(parent_inode)->root;
946 record_root_in_trans(trans, parent_root); 948 record_root_in_trans(trans, parent_root);
947 949
@@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
989 parent_inode->i_ino, index, 991 parent_inode->i_ino, index,
990 dentry->d_name.name, dentry->d_name.len); 992 dentry->d_name.name, dentry->d_name.len);
991 BUG_ON(ret); 993 BUG_ON(ret);
994 dput(parent);
992 995
993 key.offset = (u64)-1; 996 key.offset = (u64)-1;
994 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 997 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a29f19384a27..054744ac5719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2869{ 2869{
2870 int ret = 0; 2870 int ret = 0;
2871 struct btrfs_root *root; 2871 struct btrfs_root *root;
2872 struct dentry *old_parent = NULL;
2872 2873
2873 /* 2874 /*
2874 * for regular files, if its inode is already on disk, we don't 2875 * for regular files, if its inode is already on disk, we don't
@@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2910 if (IS_ROOT(parent)) 2911 if (IS_ROOT(parent))
2911 break; 2912 break;
2912 2913
2913 parent = parent->d_parent; 2914 parent = dget_parent(parent);
2915 dput(old_parent);
2916 old_parent = parent;
2914 inode = parent->d_inode; 2917 inode = parent->d_inode;
2915 2918
2916 } 2919 }
2920 dput(old_parent);
2917out: 2921out:
2918 return ret; 2922 return ret;
2919} 2923}
@@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2945{ 2949{
2946 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2950 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2947 struct super_block *sb; 2951 struct super_block *sb;
2952 struct dentry *old_parent = NULL;
2948 int ret = 0; 2953 int ret = 0;
2949 u64 last_committed = root->fs_info->last_trans_committed; 2954 u64 last_committed = root->fs_info->last_trans_committed;
2950 2955
@@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3016 if (IS_ROOT(parent)) 3021 if (IS_ROOT(parent))
3017 break; 3022 break;
3018 3023
3019 parent = parent->d_parent; 3024 parent = dget_parent(parent);
3025 dput(old_parent);
3026 old_parent = parent;
3020 } 3027 }
3021 ret = 0; 3028 ret = 0;
3022end_trans: 3029end_trans:
3030 dput(old_parent);
3023 if (ret < 0) { 3031 if (ret < 0) {
3024 BUG_ON(ret != -ENOSPC); 3032 BUG_ON(ret != -ENOSPC);
3025 root->fs_info->last_trans_log_full_commit = trans->transid; 3033 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3039,8 +3047,13 @@ end_no_trans:
3039int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3047int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3040 struct btrfs_root *root, struct dentry *dentry) 3048 struct btrfs_root *root, struct dentry *dentry)
3041{ 3049{
3042 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3050 struct dentry *parent = dget_parent(dentry);
3043 dentry->d_parent, 0); 3051 int ret;
3052
3053 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3054 dput(parent);
3055
3056 return ret;
3044} 3057}
3045 3058
3046/* 3059/*
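
Both tree-log.c loops above adopt the same idiom for walking up the directory tree without a global dcache lock: pin each parent with dget_parent() and drop the previously pinned one as the walk advances. Reduced to a skeleton (do_one_level() is a placeholder):

    static void walk_up(struct dentry *dentry)
    {
        struct dentry *old_parent;
        struct dentry *parent = dget_parent(dentry);

        while (!IS_ROOT(parent)) {
            do_one_level(parent->d_inode);

            old_parent = parent;
            parent = dget_parent(old_parent);    /* pin the next level up */
            dput(old_parent);                    /* release the previous one */
        }
        dput(parent);
    }
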
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cc04dc1445d6..6b9884507837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path,
412 412
413 device->fs_devices = fs_devices; 413 device->fs_devices = fs_devices;
414 fs_devices->num_devices++; 414 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) { 415 } else if (!device->name || strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS); 416 name = kstrdup(path, GFP_NOFS);
417 if (!name) 417 if (!name)
418 return -ENOMEM; 418 return -ENOMEM;
419 kfree(device->name); 419 kfree(device->name);
420 device->name = name; 420 device->name = name;
421 if (device->missing) {
422 fs_devices->missing_devices--;
423 device->missing = 0;
424 }
421 } 425 }
422 426
423 if (found_transid > fs_devices->latest_trans) { 427 if (found_transid > fs_devices->latest_trans) {
@@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1236 1240
1237 device->fs_devices->num_devices--; 1241 device->fs_devices->num_devices--;
1238 1242
1243 if (device->missing)
1244 root->fs_info->fs_devices->missing_devices--;
1245
1239 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1246 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1240 struct btrfs_device, dev_list); 1247 struct btrfs_device, dev_list);
1241 if (device->bdev == root->fs_info->sb->s_bdev) 1248 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3080 device->devid = devid; 3087 device->devid = devid;
3081 device->work.func = pending_bios_fn; 3088 device->work.func = pending_bios_fn;
3082 device->fs_devices = fs_devices; 3089 device->fs_devices = fs_devices;
3090 device->missing = 1;
3083 fs_devices->num_devices++; 3091 fs_devices->num_devices++;
3092 fs_devices->missing_devices++;
3084 spin_lock_init(&device->io_lock); 3093 spin_lock_init(&device->io_lock);
3085 INIT_LIST_HEAD(&device->dev_alloc_list); 3094 INIT_LIST_HEAD(&device->dev_alloc_list);
3086 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3095 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root,
3278 device = add_missing_dev(root, devid, dev_uuid); 3287 device = add_missing_dev(root, devid, dev_uuid);
3279 if (!device) 3288 if (!device)
3280 return -ENOMEM; 3289 return -ENOMEM;
3290 } else if (!device->missing) {
3291 /*
3292 * this happens when a device that was properly set up
3293 * in the device info lists suddenly goes bad.
3294 * device->bdev is NULL, and so we have to set
3295 * device->missing to one here
3296 */
3297 root->fs_info->fs_devices->missing_devices++;
3298 device->missing = 1;
3281 } 3299 }
3282 } 3300 }
3283 3301
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4eea..2740db49eb04 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -44,6 +44,7 @@ struct btrfs_device {
44 44
45 int writeable; 45 int writeable;
46 int in_fs_metadata; 46 int in_fs_metadata;
47 int missing;
47 48
48 spinlock_t io_lock; 49 spinlock_t io_lock;
49 50
@@ -93,6 +94,7 @@ struct btrfs_fs_devices {
93 u64 num_devices; 94 u64 num_devices;
94 u64 open_devices; 95 u64 open_devices;
95 u64 rw_devices; 96 u64 rw_devices;
97 u64 missing_devices;
96 u64 total_rw_bytes; 98 u64 total_rw_bytes;
97 struct block_device *latest_bdev; 99 struct block_device *latest_bdev;
98 100
diff --git a/fs/buffer.c b/fs/buffer.c
index 5930e382959b..2219a76e2caf 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1270,12 +1270,10 @@ static inline void check_irqs_on(void)
1270static void bh_lru_install(struct buffer_head *bh) 1270static void bh_lru_install(struct buffer_head *bh)
1271{ 1271{
1272 struct buffer_head *evictee = NULL; 1272 struct buffer_head *evictee = NULL;
1273 struct bh_lru *lru;
1274 1273
1275 check_irqs_on(); 1274 check_irqs_on();
1276 bh_lru_lock(); 1275 bh_lru_lock();
1277 lru = &__get_cpu_var(bh_lrus); 1276 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1278 if (lru->bhs[0] != bh) {
1279 struct buffer_head *bhs[BH_LRU_SIZE]; 1277 struct buffer_head *bhs[BH_LRU_SIZE];
1280 int in; 1278 int in;
1281 int out = 0; 1279 int out = 0;
@@ -1283,7 +1281,8 @@ static void bh_lru_install(struct buffer_head *bh)
1283 get_bh(bh); 1281 get_bh(bh);
1284 bhs[out++] = bh; 1282 bhs[out++] = bh;
1285 for (in = 0; in < BH_LRU_SIZE; in++) { 1283 for (in = 0; in < BH_LRU_SIZE; in++) {
1286 struct buffer_head *bh2 = lru->bhs[in]; 1284 struct buffer_head *bh2 =
1285 __this_cpu_read(bh_lrus.bhs[in]);
1287 1286
1288 if (bh2 == bh) { 1287 if (bh2 == bh) {
1289 __brelse(bh2); 1288 __brelse(bh2);
@@ -1298,7 +1297,7 @@ static void bh_lru_install(struct buffer_head *bh)
1298 } 1297 }
1299 while (out < BH_LRU_SIZE) 1298 while (out < BH_LRU_SIZE)
1300 bhs[out++] = NULL; 1299 bhs[out++] = NULL;
1301 memcpy(lru->bhs, bhs, sizeof(bhs)); 1300 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1302 } 1301 }
1303 bh_lru_unlock(); 1302 bh_lru_unlock();
1304 1303
@@ -1313,23 +1312,22 @@ static struct buffer_head *
1313lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1312lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1314{ 1313{
1315 struct buffer_head *ret = NULL; 1314 struct buffer_head *ret = NULL;
1316 struct bh_lru *lru;
1317 unsigned int i; 1315 unsigned int i;
1318 1316
1319 check_irqs_on(); 1317 check_irqs_on();
1320 bh_lru_lock(); 1318 bh_lru_lock();
1321 lru = &__get_cpu_var(bh_lrus);
1322 for (i = 0; i < BH_LRU_SIZE; i++) { 1319 for (i = 0; i < BH_LRU_SIZE; i++) {
1323 struct buffer_head *bh = lru->bhs[i]; 1320 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1324 1321
1325 if (bh && bh->b_bdev == bdev && 1322 if (bh && bh->b_bdev == bdev &&
1326 bh->b_blocknr == block && bh->b_size == size) { 1323 bh->b_blocknr == block && bh->b_size == size) {
1327 if (i) { 1324 if (i) {
1328 while (i) { 1325 while (i) {
1329 lru->bhs[i] = lru->bhs[i - 1]; 1326 __this_cpu_write(bh_lrus.bhs[i],
1327 __this_cpu_read(bh_lrus.bhs[i - 1]));
1330 i--; 1328 i--;
1331 } 1329 }
1332 lru->bhs[0] = bh; 1330 __this_cpu_write(bh_lrus.bhs[0], bh);
1333 } 1331 }
1334 get_bh(bh); 1332 get_bh(bh);
1335 ret = bh; 1333 ret = bh;
@@ -3203,22 +3201,23 @@ static void recalc_bh_state(void)
3203 int i; 3201 int i;
3204 int tot = 0; 3202 int tot = 0;
3205 3203
3206 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3204 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3207 return; 3205 return;
3208 __get_cpu_var(bh_accounting).ratelimit = 0; 3206 __this_cpu_write(bh_accounting.ratelimit, 0);
3209 for_each_online_cpu(i) 3207 for_each_online_cpu(i)
3210 tot += per_cpu(bh_accounting, i).nr; 3208 tot += per_cpu(bh_accounting, i).nr;
3211 buffer_heads_over_limit = (tot > max_buffer_heads); 3209 buffer_heads_over_limit = (tot > max_buffer_heads);
3212} 3210}
3213 3211
3214struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3212struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3215{ 3213{
3216 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3214 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3217 if (ret) { 3215 if (ret) {
3218 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3216 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3219 get_cpu_var(bh_accounting).nr++; 3217 preempt_disable();
3218 __this_cpu_inc(bh_accounting.nr);
3220 recalc_bh_state(); 3219 recalc_bh_state();
3221 put_cpu_var(bh_accounting); 3220 preempt_enable();
3222 } 3221 }
3223 return ret; 3222 return ret;
3224} 3223}
@@ -3228,9 +3227,10 @@ void free_buffer_head(struct buffer_head *bh)
3228{ 3227{
3229 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3228 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3230 kmem_cache_free(bh_cachep, bh); 3229 kmem_cache_free(bh_cachep, bh);
3231 get_cpu_var(bh_accounting).nr--; 3230 preempt_disable();
3231 __this_cpu_dec(bh_accounting.nr);
3232 recalc_bh_state(); 3232 recalc_bh_state();
3233 put_cpu_var(bh_accounting); 3233 preempt_enable();
3234} 3234}
3235EXPORT_SYMBOL(free_buffer_head); 3235EXPORT_SYMBOL(free_buffer_head);
3236 3236
@@ -3243,9 +3243,8 @@ static void buffer_exit_cpu(int cpu)
3243 brelse(b->bhs[i]); 3243 brelse(b->bhs[i]);
3244 b->bhs[i] = NULL; 3244 b->bhs[i] = NULL;
3245 } 3245 }
3246 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3246 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3247 per_cpu(bh_accounting, cpu).nr = 0; 3247 per_cpu(bh_accounting, cpu).nr = 0;
3248 put_cpu_var(bh_accounting);
3249} 3248}
3250 3249
3251static int buffer_cpu_notify(struct notifier_block *self, 3250static int buffer_cpu_notify(struct notifier_block *self,
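
The buffer.c hunks are a mechanical conversion from taking a per-CPU lvalue with __get_cpu_var()/get_cpu_var() to the this_cpu accessor family, which reads or updates a single field of a per-CPU variable in one step; the __this_cpu_* forms leave preemption handling to the caller, which is why the alloc/free paths grow explicit preempt_disable()/preempt_enable() pairs. Side by side, using the bh_accounting counter from the hunk:

    /* old style: pin the CPU, take the per-CPU lvalue, release */
    get_cpu_var(bh_accounting).nr++;
    recalc_bh_state();
    put_cpu_var(bh_accounting);

    /* new style: caller handles preemption, accessor touches one field */
    preempt_disable();
    __this_cpu_inc(bh_accounting.nr);
    recalc_bh_state();
    preempt_enable();

    /* single-field reads and writes follow the same shape */
    struct buffer_head *first = __this_cpu_read(bh_lrus.bhs[0]);
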
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e1..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
206 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
207 &page, 1); 207 &page, 1, 0);
208 if (err == -ENOENT) 208 if (err == -ENOENT)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
288 offset, &len, 288 offset, &len,
289 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
290 pages, nr_pages); 290 pages, nr_pages, 0);
291 if (rc == -ENOENT) 291 if (rc == -ENOENT)
292 rc = 0; 292 rc = 0;
293 if (rc < 0) 293 if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
774 snapc, do_sync, 774 snapc, do_sync,
775 ci->i_truncate_seq, 775 ci->i_truncate_seq,
776 ci->i_truncate_size, 776 ci->i_truncate_size,
777 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
778 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
779 779
780 alloc_page_vec(fsc, req); 780 alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71d..60d27bc9eb83 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1430 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1431 /* success. */ 1431 /* success. */
1432 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1433 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1434 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1435 return 0; 1435 return 0;
1436 } 1436 }
1437 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2273{ 2273{
2274 struct ceph_inode_info *ci = ceph_inode(inode); 2274 struct ceph_inode_info *ci = ceph_inode(inode);
2275 int mds = session->s_mds; 2275 int mds = session->s_mds;
2276 unsigned seq = le32_to_cpu(grant->seq); 2276 int seq = le32_to_cpu(grant->seq);
2277 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2278 int newcaps = le32_to_cpu(grant->caps); 2277 int newcaps = le32_to_cpu(grant->caps);
2279 int issued, implemented, used, wanted, dirty; 2278 int issued, implemented, used, wanted, dirty;
2280 u64 size = le64_to_cpu(grant->size); 2279 u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2286 int revoked_rdcache = 0; 2285 int revoked_rdcache = 0;
2287 int queue_invalidate = 0; 2286 int queue_invalidate = 0;
2288 2287
2289 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2288 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2290 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2289 inode, cap, mds, seq, ceph_cap_string(newcaps));
2291 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2290 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2292 inode->i_size); 2291 inode->i_size);
2293 2292
@@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2383 } 2382 }
2384 2383
2385 cap->seq = seq; 2384 cap->seq = seq;
2386 cap->issue_seq = issue_seq;
2387 2385
2388 /* file layout may have changed */ 2386 /* file layout may have changed */
2389 ci->i_layout = grant->layout; 2387 ci->i_layout = grant->layout;
@@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2691 NULL /* no caps context */); 2689 NULL /* no caps context */);
2692 try_flush_caps(inode, session, NULL); 2690 try_flush_caps(inode, session, NULL);
2693 up_read(&mdsc->snap_rwsem); 2691 up_read(&mdsc->snap_rwsem);
2692
2693 /* make sure we re-request max_size, if necessary */
2694 spin_lock(&inode->i_lock);
2695 ci->i_requested_max_size = 0;
2696 spin_unlock(&inode->i_lock);
2694} 2697}
2695 2698
2696/* 2699/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcafc..fa7ca04ee816 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
40 if (dentry->d_fsdata) 40 if (dentry->d_fsdata)
41 return 0; 41 return 0;
42 42
43 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) 43 if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
44 dentry->d_op = &ceph_dentry_ops; 44 ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
45 d_set_d_op(dentry, &ceph_dentry_ops);
45 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) 46 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
46 dentry->d_op = &ceph_snapdir_dentry_ops; 47 d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
47 else 48 else
48 dentry->d_op = &ceph_snap_dentry_ops; 49 d_set_d_op(dentry, &ceph_snap_dentry_ops);
49 50
50 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); 51 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
51 if (!di) 52 if (!di)
@@ -111,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
111 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 112 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
112 last); 113 last);
113 114
114 spin_lock(&dcache_lock); 115 spin_lock(&parent->d_lock);
115 116
116 /* start at beginning? */ 117 /* start at beginning? */
117 if (filp->f_pos == 2 || (last && 118 if (filp->f_pos == 2 || last == NULL ||
118 filp->f_pos < ceph_dentry(last)->offset)) { 119 filp->f_pos < ceph_dentry(last)->offset) {
119 if (list_empty(&parent->d_subdirs)) 120 if (list_empty(&parent->d_subdirs))
120 goto out_unlock; 121 goto out_unlock;
121 p = parent->d_subdirs.prev; 122 p = parent->d_subdirs.prev;
@@ -135,6 +136,7 @@ more:
135 fi->at_end = 1; 136 fi->at_end = 1;
136 goto out_unlock; 137 goto out_unlock;
137 } 138 }
139 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
138 if (!d_unhashed(dentry) && dentry->d_inode && 140 if (!d_unhashed(dentry) && dentry->d_inode &&
139 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 141 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
140 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 142 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -144,13 +146,15 @@ more:
144 dentry->d_name.len, dentry->d_name.name, di->offset, 146 dentry->d_name.len, dentry->d_name.name, di->offset,
145 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 147 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
146 !dentry->d_inode ? " null" : ""); 148 !dentry->d_inode ? " null" : "");
149 spin_unlock(&dentry->d_lock);
147 p = p->prev; 150 p = p->prev;
148 dentry = list_entry(p, struct dentry, d_u.d_child); 151 dentry = list_entry(p, struct dentry, d_u.d_child);
149 di = ceph_dentry(dentry); 152 di = ceph_dentry(dentry);
150 } 153 }
151 154
152 atomic_inc(&dentry->d_count); 155 dget_dlock(dentry);
153 spin_unlock(&dcache_lock); 156 spin_unlock(&dentry->d_lock);
157 spin_unlock(&parent->d_lock);
154 158
155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 159 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 160 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -176,19 +180,19 @@ more:
176 180
177 filp->f_pos++; 181 filp->f_pos++;
178 182
179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 183 /* make sure a dentry wasn't dropped while we didn't have parent lock */
180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 184 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
182 err = -EAGAIN; 186 err = -EAGAIN;
183 goto out; 187 goto out;
184 } 188 }
185 189
186 spin_lock(&dcache_lock); 190 spin_lock(&parent->d_lock);
187 p = p->prev; /* advance to next dentry */ 191 p = p->prev; /* advance to next dentry */
188 goto more; 192 goto more;
189 193
190out_unlock: 194out_unlock:
191 spin_unlock(&dcache_lock); 195 spin_unlock(&parent->d_lock);
192out: 196out:
193 if (last) 197 if (last)
194 dput(last); 198 dput(last);
@@ -336,7 +340,10 @@ more:
336 if (req->r_reply_info.dir_end) { 340 if (req->r_reply_info.dir_end) {
337 kfree(fi->last_name); 341 kfree(fi->last_name);
338 fi->last_name = NULL; 342 fi->last_name = NULL;
339 fi->next_offset = 2; 343 if (ceph_frag_is_rightmost(frag))
344 fi->next_offset = 2;
345 else
346 fi->next_offset = 0;
340 } else { 347 } else {
341 rinfo = &req->r_reply_info; 348 rinfo = &req->r_reply_info;
342 err = note_last_dentry(fi, 349 err = note_last_dentry(fi,
@@ -355,18 +362,22 @@ more:
355 u64 pos = ceph_make_fpos(frag, off); 362 u64 pos = ceph_make_fpos(frag, off);
356 struct ceph_mds_reply_inode *in = 363 struct ceph_mds_reply_inode *in =
357 rinfo->dir_in[off - fi->offset].in; 364 rinfo->dir_in[off - fi->offset].in;
365 struct ceph_vino vino;
366 ino_t ino;
367
358 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 368 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
359 off, off - fi->offset, rinfo->dir_nr, pos, 369 off, off - fi->offset, rinfo->dir_nr, pos,
360 rinfo->dir_dname_len[off - fi->offset], 370 rinfo->dir_dname_len[off - fi->offset],
361 rinfo->dir_dname[off - fi->offset], in); 371 rinfo->dir_dname[off - fi->offset], in);
362 BUG_ON(!in); 372 BUG_ON(!in);
363 ftype = le32_to_cpu(in->mode) >> 12; 373 ftype = le32_to_cpu(in->mode) >> 12;
374 vino.ino = le64_to_cpu(in->ino);
375 vino.snap = le64_to_cpu(in->snapid);
376 ino = ceph_vino_to_ino(vino);
364 if (filldir(dirent, 377 if (filldir(dirent,
365 rinfo->dir_dname[off - fi->offset], 378 rinfo->dir_dname[off - fi->offset],
366 rinfo->dir_dname_len[off - fi->offset], 379 rinfo->dir_dname_len[off - fi->offset],
367 pos, 380 pos, ino, ftype) < 0) {
368 le64_to_cpu(in->ino),
369 ftype) < 0) {
370 dout("filldir stopping us...\n"); 381 dout("filldir stopping us...\n");
371 return 0; 382 return 0;
372 } 383 }
@@ -414,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
414 fi->last_readdir = NULL; 425 fi->last_readdir = NULL;
415 } 426 }
416 kfree(fi->last_name); 427 kfree(fi->last_name);
428 fi->last_name = NULL;
417 fi->next_offset = 2; /* compensate for . and .. */ 429 fi->next_offset = 2; /* compensate for . and .. */
418 if (fi->dentry) { 430 if (fi->dentry) {
419 dput(fi->dentry); 431 dput(fi->dentry);
@@ -978,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
978 */ 990 */
979static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) 991static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 992{
981 struct inode *dir = dentry->d_parent->d_inode; 993 struct inode *dir;
994
995 if (nd->flags & LOOKUP_RCU)
996 return -ECHILD;
997
998 dir = dentry->d_parent->d_inode;
982 999
983 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, 1000 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
984 dentry->d_name.len, dentry->d_name.name, dentry->d_inode, 1001 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf3690..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
154 } 154 }
155 155
156 /* 156 /*
157 * No need to block if we have any caps. Update wanted set 157 * No need to block if we have caps on the auth MDS (for
158 * write) or any MDS (for read). Update wanted set
158 * asynchronously. 159 * asynchronously.
159 */ 160 */
160 spin_lock(&inode->i_lock); 161 spin_lock(&inode->i_lock);
161 if (__ceph_is_any_real_caps(ci)) { 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
162 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
163 int issued = __ceph_caps_issued(ci, NULL); 165 int issued = __ceph_caps_issued(ci, NULL);
164 166
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
280static int striped_read(struct inode *inode, 282static int striped_read(struct inode *inode,
281 u64 off, u64 len, 283 u64 off, u64 len,
282 struct page **pages, int num_pages, 284 struct page **pages, int num_pages,
283 int *checkeof) 285 int *checkeof, bool align_to_pages,
286 unsigned long buf_align)
284{ 287{
285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
286 struct ceph_inode_info *ci = ceph_inode(inode); 289 struct ceph_inode_info *ci = ceph_inode(inode);
287 u64 pos, this_len; 290 u64 pos, this_len;
291 int io_align, page_align;
288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 292 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
289 int left, pages_left; 293 int left, pages_left;
290 int read; 294 int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
300 page_pos = pages; 304 page_pos = pages;
301 pages_left = num_pages; 305 pages_left = num_pages;
302 read = 0; 306 read = 0;
307 io_align = off & ~PAGE_MASK;
303 308
304more: 309more:
310 if (align_to_pages)
311 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
312 else
313 page_align = pos & ~PAGE_MASK;
305 this_len = left; 314 this_len = left;
306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 315 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
307 &ci->i_layout, pos, &this_len, 316 &ci->i_layout, pos, &this_len,
308 ci->i_truncate_seq, 317 ci->i_truncate_seq,
309 ci->i_truncate_size, 318 ci->i_truncate_size,
310 page_pos, pages_left); 319 page_pos, pages_left, page_align);
311 hit_stripe = this_len < left; 320 hit_stripe = this_len < left;
312 was_short = ret >= 0 && ret < this_len; 321 was_short = ret >= 0 && ret < this_len;
313 if (ret == -ENOENT) 322 if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
368 struct inode *inode = file->f_dentry->d_inode; 377 struct inode *inode = file->f_dentry->d_inode;
369 struct page **pages; 378 struct page **pages;
370 u64 off = *poff; 379 u64 off = *poff;
371 int num_pages = calc_pages_for(off, len); 380 int num_pages, ret;
372 int ret;
373 381
374 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 382 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 383 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
376 384
377 if (file->f_flags & O_DIRECT) { 385 if (file->f_flags & O_DIRECT) {
378 pages = ceph_get_direct_page_vector(data, num_pages, off, len); 386 num_pages = calc_pages_for((unsigned long)data, len);
379 387 pages = ceph_get_direct_page_vector(data, num_pages, true);
380 /*
381 * flush any page cache pages in this range. this
382 * will make concurrent normal and O_DIRECT io slow,
383 * but it will at least behave sensibly when they are
384 * in sequence.
385 */
386 } else { 388 } else {
389 num_pages = calc_pages_for(off, len);
387 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 390 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
388 } 391 }
389 if (IS_ERR(pages)) 392 if (IS_ERR(pages))
390 return PTR_ERR(pages); 393 return PTR_ERR(pages);
391 394
395 /*
396 * flush any page cache pages in this range. this
397 * will make concurrent normal and sync io slow,
398 * but it will at least behave sensibly when they are
399 * in sequence.
400 */
392 ret = filemap_write_and_wait(inode->i_mapping); 401 ret = filemap_write_and_wait(inode->i_mapping);
393 if (ret < 0) 402 if (ret < 0)
394 goto done; 403 goto done;
395 404
396 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 405 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
406 file->f_flags & O_DIRECT,
407 (unsigned long)data & ~PAGE_MASK);
397 408
398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 409 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 410 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
402 413
403done: 414done:
404 if (file->f_flags & O_DIRECT) 415 if (file->f_flags & O_DIRECT)
405 ceph_put_page_vector(pages, num_pages); 416 ceph_put_page_vector(pages, num_pages, true);
406 else 417 else
407 ceph_release_page_vector(pages, num_pages); 418 ceph_release_page_vector(pages, num_pages);
408 dout("sync_read result %d\n", ret); 419 dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
448 int flags; 459 int flags;
449 int do_sync = 0; 460 int do_sync = 0;
450 int check_caps = 0; 461 int check_caps = 0;
462 int page_align, io_align;
463 unsigned long buf_align;
451 int ret; 464 int ret;
452 struct timespec mtime = CURRENT_TIME; 465 struct timespec mtime = CURRENT_TIME;
453 466
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
462 else 475 else
463 pos = *offset; 476 pos = *offset;
464 477
478 io_align = pos & ~PAGE_MASK;
479 buf_align = (unsigned long)data & ~PAGE_MASK;
480
465 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 481 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
466 if (ret < 0) 482 if (ret < 0)
467 return ret; 483 return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
486 */ 502 */
487more: 503more:
488 len = left; 504 len = left;
505 if (file->f_flags & O_DIRECT) {
506 /* write from beginning of first page, regardless of
507 io alignment */
508 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
509 num_pages = calc_pages_for((unsigned long)data, len);
510 } else {
511 page_align = pos & ~PAGE_MASK;
512 num_pages = calc_pages_for(pos, len);
513 }
489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 514 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
490 ceph_vino(inode), pos, &len, 515 ceph_vino(inode), pos, &len,
491 CEPH_OSD_OP_WRITE, flags, 516 CEPH_OSD_OP_WRITE, flags,
492 ci->i_snap_realm->cached_context, 517 ci->i_snap_realm->cached_context,
493 do_sync, 518 do_sync,
494 ci->i_truncate_seq, ci->i_truncate_size, 519 ci->i_truncate_seq, ci->i_truncate_size,
495 &mtime, false, 2); 520 &mtime, false, 2, page_align);
496 if (!req) 521 if (!req)
497 return -ENOMEM; 522 return -ENOMEM;
498 523
499 num_pages = calc_pages_for(pos, len);
500
501 if (file->f_flags & O_DIRECT) { 524 if (file->f_flags & O_DIRECT) {
502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 525 pages = ceph_get_direct_page_vector(data, num_pages, false);
503 if (IS_ERR(pages)) { 526 if (IS_ERR(pages)) {
504 ret = PTR_ERR(pages); 527 ret = PTR_ERR(pages);
505 goto out; 528 goto out;
@@ -549,7 +572,7 @@ more:
549 } 572 }
550 573
551 if (file->f_flags & O_DIRECT) 574 if (file->f_flags & O_DIRECT)
552 ceph_put_page_vector(pages, num_pages); 575 ceph_put_page_vector(pages, num_pages, false);
553 else if (file->f_flags & O_SYNC) 576 else if (file->f_flags & O_SYNC)
554 ceph_release_page_vector(pages, num_pages); 577 ceph_release_page_vector(pages, num_pages);
555 578
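The O_DIRECT changes to striped_read() and ceph_sync_write() above align the page vector to the user buffer rather than to the file offset: io_align is the file position's offset within its page, buf_align the buffer's, and page_align tells the OSD layer where in the first page the data starts. A minimal user-space sketch of that arithmetic, assuming 4 KiB pages; PAGE_SIZE/PAGE_MASK and pages_spanned() are local stand-ins, not the kernel macros or ceph's calc_pages_for().

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* pages spanned by [off, off+len), analogous in spirit to calc_pages_for() */
static unsigned long pages_spanned(unsigned long off, unsigned long len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - (off >> PAGE_SHIFT);
}

int main(void)
{
	unsigned long pos  = 5000;          /* file offset of the I/O   */
	unsigned long data = 0x7f0000ab30;  /* user buffer address      */
	unsigned long len  = 10000;

	unsigned long io_align  = pos & ~PAGE_MASK;   /* 5000 % 4096 = 904 */
	unsigned long buf_align = data & ~PAGE_MASK;  /* 0xb30 = 2864      */

	/* O_DIRECT: data lands in pages at the buffer's alignment */
	unsigned long page_align = (pos - io_align + buf_align) & ~PAGE_MASK;

	printf("io_align=%lu buf_align=%lu page_align=%lu pages=%lu\n",
	       io_align, buf_align, page_align, pages_spanned(data, len));
	return 0;
}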
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04c..e61de4f7b99d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/uaccess.h> 7#include <linux/uaccess.h>
@@ -369,6 +368,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
369 return &ci->vfs_inode; 368 return &ci->vfs_inode;
370} 369}
371 370
371static void ceph_i_callback(struct rcu_head *head)
372{
373 struct inode *inode = container_of(head, struct inode, i_rcu);
374 struct ceph_inode_info *ci = ceph_inode(inode);
375
376 INIT_LIST_HEAD(&inode->i_dentry);
377 kmem_cache_free(ceph_inode_cachep, ci);
378}
379
372void ceph_destroy_inode(struct inode *inode) 380void ceph_destroy_inode(struct inode *inode)
373{ 381{
374 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -408,7 +416,7 @@ void ceph_destroy_inode(struct inode *inode)
408 if (ci->i_xattrs.prealloc_blob) 416 if (ci->i_xattrs.prealloc_blob)
409 ceph_buffer_put(ci->i_xattrs.prealloc_blob); 417 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
410 418
411 kmem_cache_free(ceph_inode_cachep, ci); 419 call_rcu(&inode->i_rcu, ceph_i_callback);
412} 420}
413 421
414 422
@@ -471,7 +479,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
471 479
472 if (issued & (CEPH_CAP_FILE_EXCL| 480 if (issued & (CEPH_CAP_FILE_EXCL|
473 CEPH_CAP_FILE_WR| 481 CEPH_CAP_FILE_WR|
474 CEPH_CAP_FILE_BUFFER)) { 482 CEPH_CAP_FILE_BUFFER|
483 CEPH_CAP_AUTH_EXCL|
484 CEPH_CAP_XATTR_EXCL)) {
475 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 485 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
476 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 486 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
477 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 487 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +521,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
511 warn = 1; 521 warn = 1;
512 } 522 }
513 } else { 523 } else {
514 /* we have no write caps; whatever the MDS says is true */ 524 /* we have no write|excl caps; whatever the MDS says is true */
515 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 525 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
516 inode->i_ctime = *ctime; 526 inode->i_ctime = *ctime;
517 inode->i_mtime = *mtime; 527 inode->i_mtime = *mtime;
@@ -567,12 +577,17 @@ static int fill_inode(struct inode *inode,
567 577
568 /* 578 /*
569 * provided version will be odd if inode value is projected, 579 * provided version will be odd if inode value is projected,
570 * even if stable. skip the update if we have a newer info 580 * even if stable. skip the update if we have newer stable
571 * (e.g., due to inode info racing form multiple MDSs), or if 581 * info (ours>=theirs, e.g. due to racing mds replies), unless
572 * we are getting projected (unstable) inode info. 582 * we are getting projected (unstable) info (in which case the
583 * version is odd, and we want ours>theirs).
584 * us them
585 * 2 2 skip
586 * 3 2 skip
587 * 3 3 update
573 */ 588 */
574 if (le64_to_cpu(info->version) > 0 && 589 if (le64_to_cpu(info->version) > 0 &&
575 (ci->i_version & ~1) > le64_to_cpu(info->version)) 590 (ci->i_version & ~1) >= le64_to_cpu(info->version))
576 goto no_change; 591 goto no_change;
577 592
578 issued = __ceph_caps_issued(ci, &implemented); 593 issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +621,14 @@ static int fill_inode(struct inode *inode,
606 le32_to_cpu(info->time_warp_seq), 621 le32_to_cpu(info->time_warp_seq),
607 &ctime, &mtime, &atime); 622 &ctime, &mtime, &atime);
608 623
609 ci->i_max_size = le64_to_cpu(info->max_size); 624 /* only update max_size on auth cap */
625 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
626 ci->i_max_size != le64_to_cpu(info->max_size)) {
627 dout("max_size %lld -> %llu\n", ci->i_max_size,
628 le64_to_cpu(info->max_size));
629 ci->i_max_size = le64_to_cpu(info->max_size);
630 }
631
610 ci->i_layout = info->layout; 632 ci->i_layout = info->layout;
611 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 633 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
612 634
@@ -828,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
828 di->offset = ceph_inode(inode)->i_max_offset++; 850 di->offset = ceph_inode(inode)->i_max_offset++;
829 spin_unlock(&inode->i_lock); 851 spin_unlock(&inode->i_lock);
830 852
831 spin_lock(&dcache_lock); 853 spin_lock(&dir->d_lock);
832 spin_lock(&dn->d_lock); 854 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
833 list_move(&dn->d_u.d_child, &dir->d_subdirs); 855 list_move(&dn->d_u.d_child, &dir->d_subdirs);
834 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 856 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
835 dn->d_u.d_child.prev, dn->d_u.d_child.next); 857 dn->d_u.d_child.prev, dn->d_u.d_child.next);
836 spin_unlock(&dn->d_lock); 858 spin_unlock(&dn->d_lock);
837 spin_unlock(&dcache_lock); 859 spin_unlock(&dir->d_lock);
838} 860}
839 861
840/* 862/*
@@ -866,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
866 } else if (realdn) { 888 } else if (realdn) {
867 dout("dn %p (%d) spliced with %p (%d) " 889 dout("dn %p (%d) spliced with %p (%d) "
868 "inode %p ino %llx.%llx\n", 890 "inode %p ino %llx.%llx\n",
869 dn, atomic_read(&dn->d_count), 891 dn, dn->d_count,
870 realdn, atomic_read(&realdn->d_count), 892 realdn, realdn->d_count,
871 realdn->d_inode, ceph_vinop(realdn->d_inode)); 893 realdn->d_inode, ceph_vinop(realdn->d_inode));
872 dput(dn); 894 dput(dn);
873 dn = realdn; 895 dn = realdn;
@@ -1055,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1055 ininfo = rinfo->targeti.in; 1077 ininfo = rinfo->targeti.in;
1056 vino.ino = le64_to_cpu(ininfo->ino); 1078 vino.ino = le64_to_cpu(ininfo->ino);
1057 vino.snap = le64_to_cpu(ininfo->snapid); 1079 vino.snap = le64_to_cpu(ininfo->snapid);
1058 if (!dn->d_inode) { 1080 in = dn->d_inode;
1081 if (!in) {
1059 in = ceph_get_inode(sb, vino); 1082 in = ceph_get_inode(sb, vino);
1060 if (IS_ERR(in)) { 1083 if (IS_ERR(in)) {
1061 pr_err("fill_trace bad get_inode " 1084 pr_err("fill_trace bad get_inode "
@@ -1217,11 +1240,11 @@ retry_lookup:
1217 goto retry_lookup; 1240 goto retry_lookup;
1218 } else { 1241 } else {
1219 /* reorder parent's d_subdirs */ 1242 /* reorder parent's d_subdirs */
1220 spin_lock(&dcache_lock); 1243 spin_lock(&parent->d_lock);
1221 spin_lock(&dn->d_lock); 1244 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
1222 list_move(&dn->d_u.d_child, &parent->d_subdirs); 1245 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1223 spin_unlock(&dn->d_lock); 1246 spin_unlock(&dn->d_lock);
1224 spin_unlock(&dcache_lock); 1247 spin_unlock(&parent->d_lock);
1225 } 1248 }
1226 1249
1227 di = dn->d_fsdata; 1250 di = dn->d_fsdata;
@@ -1386,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1386 spin_lock(&inode->i_lock); 1409 spin_lock(&inode->i_lock);
1387 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1410 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1388 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1411 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1389 if (ci->i_rdcache_gen == 0 || 1412 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1391 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1392 /* nevermind! */ 1413 /* nevermind! */
1393 ci->i_rdcache_revoking = 0;
1394 spin_unlock(&inode->i_lock); 1414 spin_unlock(&inode->i_lock);
1395 goto out; 1415 goto out;
1396 } 1416 }
@@ -1400,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1400 ceph_invalidate_nondirty_pages(inode->i_mapping); 1420 ceph_invalidate_nondirty_pages(inode->i_mapping);
1401 1421
1402 spin_lock(&inode->i_lock); 1422 spin_lock(&inode->i_lock);
1403 if (orig_gen == ci->i_rdcache_gen) { 1423 if (orig_gen == ci->i_rdcache_gen &&
1424 orig_gen == ci->i_rdcache_revoking) {
1404 dout("invalidate_pages %p gen %d successful\n", inode, 1425 dout("invalidate_pages %p gen %d successful\n", inode,
1405 ci->i_rdcache_gen); 1426 ci->i_rdcache_gen);
1406 ci->i_rdcache_gen = 0; 1427 ci->i_rdcache_revoking--;
1407 ci->i_rdcache_revoking = 0;
1408 check = 1; 1428 check = 1;
1409 } else { 1429 } else {
1410 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1430 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1411 inode, orig_gen, ci->i_rdcache_gen); 1431 inode, orig_gen, ci->i_rdcache_gen,
1432 ci->i_rdcache_revoking);
1412 } 1433 }
1413 spin_unlock(&inode->i_lock); 1434 spin_unlock(&inode->i_lock);
1414 1435
@@ -1739,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1739 return 0; 1760 return 0;
1740 } 1761 }
1741 1762
1742 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1763 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1743 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1764 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1744 return 0; 1765 return 0;
1745 1766
@@ -1760,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
1760 * Check inode permissions. We verify we have a valid value for 1781 * Check inode permissions. We verify we have a valid value for
1761 * the AUTH cap, then call the generic handler. 1782 * the AUTH cap, then call the generic handler.
1762 */ 1783 */
1763int ceph_permission(struct inode *inode, int mask) 1784int ceph_permission(struct inode *inode, int mask, unsigned int flags)
1764{ 1785{
1765 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); 1786 int err;
1787
1788 if (flags & IPERM_FLAG_RCU)
1789 return -ECHILD;
1790
1791 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1766 1792
1767 if (!err) 1793 if (!err)
1768 err = generic_permission(inode, mask, NULL); 1794 err = generic_permission(inode, mask, flags, NULL);
1769 return err; 1795 return err;
1770} 1796}
1771 1797
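The fill_inode() comment above documents the new version check: a projected (unstable) version from the MDS is odd, and incoming info is skipped whenever our stable version (ours with the low bit cleared) is at least as new. A tiny sketch of the predicate and the us/them table it encodes:

#include <stdio.h>
#include <stdint.h>

/* skip the update when (ours & ~1) >= theirs, as in fill_inode() above */
static int skip_update(uint64_t ours, uint64_t theirs)
{
	return theirs > 0 && (ours & ~1ULL) >= theirs;
}

int main(void)
{
	printf("us=2 them=2 -> %s\n", skip_update(2, 2) ? "skip" : "update");
	printf("us=3 them=2 -> %s\n", skip_update(3, 2) ? "skip" : "update");
	printf("us=3 them=3 -> %s\n", skip_update(3, 3) ? "skip" : "update");
	return 0;
}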
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb5..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x98 7#define CEPH_IOCTL_MAGIC 0x97
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
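The CEPH_IOCTL_MAGIC change above matters because the magic byte is baked into every ioctl command number, so 0x97 and 0x98 produce different values and old and new binaries will not agree. A small user-space illustration; the command index and argument struct here are hypothetical, not the real ceph ioctl definitions.

#include <stdio.h>
#include <sys/ioctl.h>

struct ceph_ioctl_layout_demo { unsigned long long args[6]; }; /* hypothetical */

int main(void)
{
	unsigned long old_cmd = _IOW(0x98, 1, struct ceph_ioctl_layout_demo);
	unsigned long new_cmd = _IOW(0x97, 1, struct ceph_ioctl_layout_demo);

	printf("magic 0x98 -> cmd 0x%lx\nmagic 0x97 -> cmd 0x%lx\n",
	       old_cmd, new_cmd);
	return 0;
}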
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c345..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
12 */ 12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, 13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns, 14 int cmd, u8 wait, struct file_lock *fl)
15 int cmd, u64 start, u64 length, u8 wait)
16{ 15{
17 struct inode *inode = file->f_dentry->d_inode; 16 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 17 struct ceph_mds_client *mdsc =
19 ceph_sb_to_client(inode->i_sb)->mdsc; 18 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 19 struct ceph_mds_request *req;
21 int err; 20 int err;
21 u64 length = 0;
22 22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 24 if (IS_ERR(req))
25 return PTR_ERR(req); 25 return PTR_ERR(req);
26 req->r_inode = igrab(inode); 26 req->r_inode = igrab(inode);
27 27
28 /* mds requires start and length rather than start and end */
29 if (LLONG_MAX == fl->fl_end)
30 length = 0;
31 else
32 length = fl->fl_end - fl->fl_start + 1;
33
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 34 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type, 35 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd); 36 (int)operation, (u64)fl->fl_pid, fl->fl_start,
37 length, wait, fl->fl_type);
38
31 39
32 req->r_args.filelock_change.rule = lock_type; 40 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd; 41 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid); 42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
35 /* This should be adjusted, but I'm not sure if 43 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers*/ 44 namespaces actually get id numbers*/
37 req->r_args.filelock_change.pid_namespace = 45 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns); 46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
39 req->r_args.filelock_change.start = cpu_to_le64(start); 47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
40 req->r_args.filelock_change.length = cpu_to_le64(length); 48 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait; 49 req->r_args.filelock_change.wait = wait;
42 50
43 err = ceph_mdsc_do_request(mdsc, inode, req); 51 err = ceph_mdsc_do_request(mdsc, inode, req);
52
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK;
57 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
58 fl->fl_type = F_WRLCK;
59 else
60 fl->fl_type = F_UNLCK;
61
62 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
63 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
64 le64_to_cpu(req->r_reply_info.filelock_reply->length);
65 if (length >= 1)
66 fl->fl_end = length -1;
67 else
68 fl->fl_end = 0;
69
70 }
44 ceph_mdsc_put_request(req); 71 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 72 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, 73 "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err); 74 (int)operation, (u64)fl->fl_pid, fl->fl_start,
75 length, wait, fl->fl_type, err);
48 return err; 76 return err;
49} 77}
50 78
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
54 */ 82 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl) 83int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{ 84{
57 u64 length;
58 u8 lock_cmd; 85 u8 lock_cmd;
59 int err; 86 int err;
60 u8 wait = 0; 87 u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
76 else 103 else
77 lock_cmd = CEPH_LOCK_UNLOCK; 104 lock_cmd = CEPH_LOCK_UNLOCK;
78 105
79 if (LLONG_MAX == fl->fl_end) 106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) { 107 if (!err) {
90 dout("mds locked, locking locally"); 108 if ( op != CEPH_MDS_OP_GETFILELOCK ){
91 err = posix_lock_file(file, fl, NULL); 109 dout("mds locked, locking locally");
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 110 err = posix_lock_file(file, fl, NULL);
93 /* undo! This should only happen if the kernel detects 111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
94 * local deadlock. */ 112 /* undo! This should only happen if the kernel detects
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 113 * local deadlock. */
96 (u64)fl->fl_pid, 114 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
97 (u64)(unsigned long)fl->fl_nspid, 115 CEPH_LOCK_UNLOCK, 0, fl);
98 CEPH_LOCK_UNLOCK, fl->fl_start, 116 dout("got %d on posix_lock_file, undid lock", err);
99 length, 0); 117 }
100 dout("got %d on posix_lock_file, undid lock", err);
101 } 118 }
119
102 } else { 120 } else {
103 dout("mds returned error code %d", err); 121 dout("mds returned error code %d", err);
104 } 122 }
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
107 125
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl) 126int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{ 127{
110 u64 length;
111 u8 lock_cmd; 128 u8 lock_cmd;
112 int err; 129 int err;
113 u8 wait = 1; 130 u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
127 lock_cmd = CEPH_LOCK_EXCL; 144 lock_cmd = CEPH_LOCK_EXCL;
128 else 145 else
129 lock_cmd = CEPH_LOCK_UNLOCK; 146 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135 147
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, 148 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid, 149 file, lock_cmd, wait, fl);
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) { 150 if (!err) {
142 err = flock_lock_file_wait(file, fl); 151 err = flock_lock_file_wait(file, fl);
143 if (err) { 152 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK, 153 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK, 154 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid, 155 file, CEPH_LOCK_UNLOCK, 0, fl);
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err); 156 dout("got %d on flock_lock_file_wait, undid lock", err);
151 } 157 }
152 } else { 158 } else {
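The locks.c refactor above moves the "mds requires start and length rather than start and end" conversion into ceph_lock_message(): a struct file_lock carries (fl_start, fl_end) with fl_end == LLONG_MAX for an unbounded lock, while the wire format uses (start, length) with length 0 meaning "to the end of the file". A minimal user-space sketch of the forward conversion, assuming nothing beyond what the hunk shows:

#include <stdio.h>
#include <limits.h>

/* (start, end) -> length, as done before building the MDS request;
 * the GETFILELOCK reply path above performs the inverse,
 * end = start + length - 1, when length is at least 1. */
static unsigned long long range_to_length(long long start, long long end)
{
	if (end == LLONG_MAX)		/* unbounded lock */
		return 0;
	return (unsigned long long)(end - start + 1);
}

int main(void)
{
	printf("lock [100, 199] -> length %llu\n", range_to_length(100, 199));
	printf("lock [100, EOF] -> length %llu\n",
	       range_to_length(100, LLONG_MAX));
	return 0;
}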
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c2..a50fca1e03be 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9#include <linux/smp_lock.h>
10 9
11#include "super.h" 10#include "super.h"
12#include "mds_client.h" 11#include "mds_client.h"
@@ -203,6 +202,38 @@ out_bad:
203} 202}
204 203
205/* 204/*
205 * parse fcntl F_GETLK results
206 */
207static int parse_reply_info_filelock(void **p, void *end,
208 struct ceph_mds_reply_info_parsed *info)
209{
210 if (*p + sizeof(*info->filelock_reply) > end)
211 goto bad;
212
213 info->filelock_reply = *p;
214 *p += sizeof(*info->filelock_reply);
215
216 if (unlikely(*p != end))
217 goto bad;
218 return 0;
219
220bad:
221 return -EIO;
222}
223
224/*
225 * parse extra results
226 */
227static int parse_reply_info_extra(void **p, void *end,
228 struct ceph_mds_reply_info_parsed *info)
229{
230 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
231 return parse_reply_info_filelock(p, end, info);
232 else
233 return parse_reply_info_dir(p, end, info);
234}
235
236/*
206 * parse entire mds reply 237 * parse entire mds reply
207 */ 238 */
208static int parse_reply_info(struct ceph_msg *msg, 239static int parse_reply_info(struct ceph_msg *msg,
@@ -224,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg,
224 goto out_bad; 255 goto out_bad;
225 } 256 }
226 257
227 /* dir content */ 258 /* extra */
228 ceph_decode_32_safe(&p, end, len, bad); 259 ceph_decode_32_safe(&p, end, len, bad);
229 if (len > 0) { 260 if (len > 0) {
230 err = parse_reply_info_dir(&p, p+len, info); 261 err = parse_reply_info_extra(&p, p+len, info);
231 if (err < 0) 262 if (err < 0)
232 goto out_bad; 263 goto out_bad;
233 } 264 }
@@ -529,6 +560,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
529 ceph_mdsc_get_request(req); 560 ceph_mdsc_get_request(req);
530 __insert_request(mdsc, req); 561 __insert_request(mdsc, req);
531 562
563 req->r_uid = current_fsuid();
564 req->r_gid = current_fsgid();
565
532 if (dir) { 566 if (dir) {
533 struct ceph_inode_info *ci = ceph_inode(dir); 567 struct ceph_inode_info *ci = ceph_inode(dir);
534 568
@@ -1452,7 +1486,7 @@ retry:
1452 *base = ceph_ino(temp->d_inode); 1486 *base = ceph_ino(temp->d_inode);
1453 *plen = len; 1487 *plen = len;
1454 dout("build_path on %p %d built %llx '%.*s'\n", 1488 dout("build_path on %p %d built %llx '%.*s'\n",
1455 dentry, atomic_read(&dentry->d_count), *base, len, path); 1489 dentry, dentry->d_count, *base, len, path);
1456 return path; 1490 return path;
1457} 1491}
1458 1492
@@ -1588,8 +1622,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1588 1622
1589 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1623 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1590 head->op = cpu_to_le32(req->r_op); 1624 head->op = cpu_to_le32(req->r_op);
1591 head->caller_uid = cpu_to_le32(current_fsuid()); 1625 head->caller_uid = cpu_to_le32(req->r_uid);
1592 head->caller_gid = cpu_to_le32(current_fsgid()); 1626 head->caller_gid = cpu_to_le32(req->r_gid);
1593 head->args = req->r_args; 1627 head->args = req->r_args;
1594 1628
1595 ceph_encode_filepath(&p, end, ino1, path1); 1629 ceph_encode_filepath(&p, end, ino1, path1);
@@ -2072,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2072 2106
2073 mutex_lock(&session->s_mutex); 2107 mutex_lock(&session->s_mutex);
2074 if (err < 0) { 2108 if (err < 0) {
2075 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); 2109 pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
2076 ceph_msg_dump(msg); 2110 ceph_msg_dump(msg);
2077 goto out_err; 2111 goto out_err;
2078 } 2112 }
@@ -2092,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2092 mutex_lock(&req->r_fill_mutex); 2126 mutex_lock(&req->r_fill_mutex);
2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2127 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2094 if (err == 0) { 2128 if (err == 0) {
2095 if (result == 0 && rinfo->dir_nr) 2129 if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
2130 rinfo->dir_nr)
2096 ceph_readdir_prepopulate(req, req->r_session); 2131 ceph_readdir_prepopulate(req, req->r_session);
2097 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2132 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2098 } 2133 }
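parse_reply_info_filelock() above only accepts the F_GETLK payload if a fixed-size record fits exactly in the remaining [p, end) window, returning -EIO otherwise. A user-space sketch of that bounds-checked decode pattern; the record layout is illustrative, not the ceph wire format.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct filelock_rec {		/* hypothetical fixed-size record */
	uint64_t start;
	uint64_t length;
	uint64_t pid;
	uint8_t  type;
} __attribute__((packed));

static int parse_filelock(void **p, void *end, struct filelock_rec *out)
{
	if ((char *)*p + sizeof(*out) > (char *)end)
		return -1;	/* truncated reply: -EIO in the kernel */
	memcpy(out, *p, sizeof(*out));
	*p = (char *)*p + sizeof(*out);
	if (*p != end)
		return -1;	/* trailing bytes are also rejected */
	return 0;
}

int main(void)
{
	unsigned char buf[sizeof(struct filelock_rec)] = { 0 };
	void *p = buf;
	struct filelock_rec rec;

	printf("full buffer:  %d\n", parse_filelock(&p, buf + sizeof(buf), &rec));
	p = buf;
	printf("short buffer: %d\n", parse_filelock(&p, buf + 4, &rec));
	return 0;
}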
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c72355..aabe563b54db 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in {
42}; 42};
43 43
44/* 44/*
45 * parsed info about an mds reply, including information about the 45 * parsed info about an mds reply, including information about
46 * target inode and/or its parent directory and dentry, and directory 46 * either: 1) the target inode and/or its parent directory and dentry,
47 * contents (for readdir results). 47 * and directory contents (for readdir results), or
48 * 2) the file range lock info (for fcntl F_GETLK results).
48 */ 49 */
49struct ceph_mds_reply_info_parsed { 50struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head; 51 struct ceph_mds_reply_head *head;
51 52
53 /* trace */
52 struct ceph_mds_reply_info_in diri, targeti; 54 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag; 55 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname; 56 char *dname;
55 u32 dname_len; 57 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease; 58 struct ceph_mds_reply_lease *dlease;
57 59
58 struct ceph_mds_reply_dirfrag *dir_dir; 60 /* extra */
59 int dir_nr; 61 union {
60 char **dir_dname; 62 /* for fcntl F_GETLK results */
61 u32 *dir_dname_len; 63 struct ceph_filelock *filelock_reply;
62 struct ceph_mds_reply_lease **dir_dlease; 64
63 struct ceph_mds_reply_info_in *dir_in; 65 /* for readdir results */
64 u8 dir_complete, dir_end; 66 struct {
67 struct ceph_mds_reply_dirfrag *dir_dir;
68 int dir_nr;
69 char **dir_dname;
70 u32 *dir_dname_len;
71 struct ceph_mds_reply_lease **dir_dlease;
72 struct ceph_mds_reply_info_in *dir_in;
73 u8 dir_complete, dir_end;
74 };
75 };
65 76
66 /* encoded blob describing snapshot contexts for certain 77 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */ 78 operations (e.g., open) */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
170 181
171 union ceph_mds_request_args r_args; 182 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 183 int r_fmode; /* file mode, if expecting cap */
184 uid_t r_uid;
185 gid_t r_gid;
173 186
174 /* for choosing which mds to send this request to */ 187 /* for choosing which mds to send this request to */
175 int r_direct_mode; 188 int r_direct_mode;
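The mds_client.h change above puts the readdir fields and the F_GETLK reply pointer into an anonymous union, since an MDS reply carries one or the other, never both. A user-space sketch of the layout pattern; the types are simplified stand-ins for the ceph structures.

#include <stdio.h>

struct reply_info {
	int op;				/* tells which union member is valid */
	union {
		void *filelock_reply;	/* fcntl F_GETLK result */
		struct {		/* readdir result */
			int   dir_nr;
			char **dir_dname;
		};
	};
};

int main(void)
{
	struct reply_info ri = { .op = 1 };

	ri.dir_nr = 3;			/* readdir view ...                  */
	printf("dir_nr=%d sizeof=%zu\n", ri.dir_nr, sizeof(ri));
	ri.filelock_reply = NULL;	/* ... or filelock view, never both  */
	return 0;
}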
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f7..4553d8829edb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -293,9 +293,7 @@ struct ceph_inode_info {
293 int i_rd_ref, i_rdcache_ref, i_wr_ref; 293 int i_rd_ref, i_rdcache_ref, i_wr_ref;
294 int i_wrbuffer_ref, i_wrbuffer_ref_head; 294 int i_wrbuffer_ref, i_wrbuffer_ref_head;
295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
296 u32 i_rdcache_gen; /* we increment this each time we get 296 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
297 FILE_CACHE. If it's non-zero, we
298 _may_ have cached pages. */
299 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 297 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
300 298
301 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 299 struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -667,7 +665,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
667extern void ceph_queue_writeback(struct inode *inode); 665extern void ceph_queue_writeback(struct inode *inode);
668 666
669extern int ceph_do_getattr(struct inode *inode, int mask); 667extern int ceph_do_getattr(struct inode *inode, int mask);
670extern int ceph_permission(struct inode *inode, int mask); 668extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
671extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 669extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
672extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, 670extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
673 struct kstat *stat); 671 struct kstat *stat);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 0ed213970ced..ee45648b0d1a 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -4,6 +4,7 @@ config CIFS
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD5 6 select CRYPTO_MD5
7 select CRYPTO_HMAC
7 select CRYPTO_ARC4 8 select CRYPTO_ARC4
8 help 9 help
9 This is the client VFS module for the Common Internet File System 10 This is the client VFS module for the Common Internet File System
@@ -143,6 +144,13 @@ config CIFS_FSCACHE
143 to be cached locally on disk through the general filesystem cache 144 to be cached locally on disk through the general filesystem cache
144 manager. If unsure, say N. 145 manager. If unsure, say N.
145 146
147config CIFS_ACL
148 bool "Provide CIFS ACL support (EXPERIMENTAL)"
149 depends on EXPERIMENTAL && CIFS_XATTR
150 help
 151	  Allows fetching a CIFS/NTFS ACL from the server. The DACL blob

152 is handed over to the application/caller.
153
146config CIFS_EXPERIMENTAL 154config CIFS_EXPERIMENTAL
147 bool "CIFS Experimental Features (EXPERIMENTAL)" 155 bool "CIFS Experimental Features (EXPERIMENTAL)"
148 depends on CIFS && EXPERIMENTAL 156 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index adefa60a9bdc..43b19dd39191 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o
10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
10 12
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 13cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 14
diff --git a/fs/cifs/README b/fs/cifs/README
index ee68d1036544..46af99ab3614 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -337,6 +337,15 @@ A partial list of the supported mount options follows:
337 wsize default write size (default 57344) 337 wsize default write size (default 57344)
338 maximum wsize currently allowed by CIFS is 57344 (fourteen 338 maximum wsize currently allowed by CIFS is 57344 (fourteen
339 4096 byte pages) 339 4096 byte pages)
340 actimeo=n attribute cache timeout in seconds (default 1 second).
341 After this timeout, the cifs client requests fresh attribute
 342	        information from the server. This option allows tuning the
 343	        attribute cache timeout to suit the workload. Shorter
 344	        timeouts mean better cache coherency but a larger number
 345	        of calls to the server. Longer timeouts mean fewer calls
 346	        to the server at the expense of weaker cache coherency
 347	        checks (i.e. the attribute cache may be stale for a short
 348	        period of time).
340 rw mount the network share read-write (note that the 349 rw mount the network share read-write (note that the
341 server may still consider the share read-only) 350 server may still consider the share read-only)
342 ro mount network share read-only 351 ro mount network share read-only
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 5aff46c61e52..355abcdcda98 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for
81 81
82v) mount check for unmatched uids 82v) mount check for unmatched uids
83 83
84w) Add support for new vfs entry points for setlease and fallocate 84w) Add support for new vfs entry point for fallocate
85 85
86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 86x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of
87processes can proceed better in parallel (on the server) 87processes can proceed better in parallel (on the server)
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 525ba59a4105..7852cd677051 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -15,7 +15,7 @@
15 * the GNU Lesser General Public License for more details. 15 * the GNU Lesser General Public License for more details.
16 * 16 *
17 */ 17 */
18#include <linux/radix-tree.h> 18#include <linux/rbtree.h>
19 19
20#ifndef _CIFS_FS_SB_H 20#ifndef _CIFS_FS_SB_H
21#define _CIFS_FS_SB_H 21#define _CIFS_FS_SB_H
@@ -42,12 +42,13 @@
42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ 42#define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */
43 43
44struct cifs_sb_info { 44struct cifs_sb_info {
45 struct radix_tree_root tlink_tree; 45 struct rb_root tlink_tree;
46#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */
47 spinlock_t tlink_tree_lock; 46 spinlock_t tlink_tree_lock;
47 struct tcon_link *master_tlink;
48 struct nls_table *local_nls; 48 struct nls_table *local_nls;
49 unsigned int rsize; 49 unsigned int rsize;
50 unsigned int wsize; 50 unsigned int wsize;
51 unsigned long actimeo; /* attribute cache timeout (jiffies) */
51 atomic_t active; 52 atomic_t active;
52 uid_t mnt_uid; 53 uid_t mnt_uid;
53 gid_t mnt_gid; 54 gid_t mnt_gid;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c9b4792ae825..a437ec391a01 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -30,8 +30,6 @@
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32 32
33#ifdef CONFIG_CIFS_EXPERIMENTAL
34
35static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 33static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
36 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 34 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
37 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 35 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
@@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
560 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 558 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
561 559
562 if (IS_ERR(tlink)) 560 if (IS_ERR(tlink))
563 return NULL; 561 return ERR_CAST(tlink);
564 562
565 xid = GetXid(); 563 xid = GetXid();
566 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); 564 rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
@@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
568 566
569 cifs_put_tlink(tlink); 567 cifs_put_tlink(tlink);
570 568
571 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); 569 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
570 if (rc)
571 return ERR_PTR(rc);
572 return pntsd; 572 return pntsd;
573} 573}
574 574
@@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); 583 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
584 584
585 if (IS_ERR(tlink)) 585 if (IS_ERR(tlink))
586 return NULL; 586 return ERR_CAST(tlink);
587 587
588 tcon = tlink_tcon(tlink); 588 tcon = tlink_tcon(tlink);
589 xid = GetXid(); 589 xid = GetXid();
@@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, 591 rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0,
592 &fid, &oplock, NULL, cifs_sb->local_nls, 592 &fid, &oplock, NULL, cifs_sb->local_nls,
593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 593 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
594 if (rc) { 594 if (!rc) {
595 cERROR(1, "Unable to open file to get ACL"); 595 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
596 goto out; 596 CIFSSMBClose(xid, tcon, fid);
597 } 597 }
598 598
599 rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
600 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
601
602 CIFSSMBClose(xid, tcon, fid);
603 out:
604 cifs_put_tlink(tlink); 599 cifs_put_tlink(tlink);
605 FreeXid(xid); 600 FreeXid(xid);
601
602 cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
603 if (rc)
604 return ERR_PTR(rc);
606 return pntsd; 605 return pntsd;
607} 606}
608 607
609/* Retrieve an ACL from the server */ 608/* Retrieve an ACL from the server */
610static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, 609struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
611 struct inode *inode, const char *path, 610 struct inode *inode, const char *path,
612 u32 *pacllen) 611 u32 *pacllen)
613{ 612{
@@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
695} 694}
696 695
697/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ 696/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
698void 697int
699cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 698cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
700 struct inode *inode, const char *path, const __u16 *pfid) 699 struct inode *inode, const char *path, const __u16 *pfid)
701{ 700{
@@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
711 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); 710 pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
712 711
713 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ 712 /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
714 if (pntsd) 713 if (IS_ERR(pntsd)) {
714 rc = PTR_ERR(pntsd);
715 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
716 } else {
715 rc = parse_sec_desc(pntsd, acllen, fattr); 717 rc = parse_sec_desc(pntsd, acllen, fattr);
716 if (rc) 718 kfree(pntsd);
717 cFYI(1, "parse sec desc failed rc = %d", rc); 719 if (rc)
720 cERROR(1, "parse sec desc failed rc = %d", rc);
721 }
718 722
719 kfree(pntsd); 723 return rc;
720 return;
721} 724}
722 725
723/* Convert mode bits to an ACL so we can update the ACL on the server */ 726/* Convert mode bits to an ACL so we can update the ACL on the server */
724int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) 727int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode)
725{ 728{
726 int rc = 0; 729 int rc = 0;
727 __u32 secdesclen = 0; 730 __u32 secdesclen = 0;
@@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
736 /* Add three ACEs for owner, group, everyone getting rid of 739 /* Add three ACEs for owner, group, everyone getting rid of
737 other ACEs as chmod disables ACEs and set the security descriptor */ 740 other ACEs as chmod disables ACEs and set the security descriptor */
738 741
739 if (pntsd) { 742 if (IS_ERR(pntsd)) {
743 rc = PTR_ERR(pntsd);
744 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
745 } else {
740 /* allocate memory for the smb header, 746 /* allocate memory for the smb header,
741 set security descriptor request security descriptor 747 set security descriptor request security descriptor
742 parameters, and secuirty descriptor itself */ 748 parameters, and secuirty descriptor itself */
@@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
766 772
767 return rc; 773 return rc;
768} 774}
769#endif /* CONFIG_CIFS_EXPERIMENTAL */
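The cifsacl.c changes above switch the ACL helpers from returning NULL on failure to the ERR_PTR convention, so callers can recover the actual error with PTR_ERR() after checking IS_ERR(). A user-space sketch of that convention; the macros here are simplified stand-ins for the kernel's err.h, and get_acl() is a hypothetical caller-side example.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)   { return (void *)error; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *get_acl(int fail)	/* stand-in for get_cifs_acl() */
{
	static char acl_blob[16];

	if (fail)
		return ERR_PTR(-EIO);	/* error encoded in the pointer */
	return acl_blob;
}

int main(void)
{
	void *acl = get_acl(1);

	if (IS_ERR(acl))
		printf("error %ld getting sec desc\n", PTR_ERR(acl));
	else
		printf("got ACL at %p\n", acl);
	return 0;
}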
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 6c8096cf5155..c4ae7d036563 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -74,11 +74,7 @@ struct cifs_wksid {
74 char sidname[SIDNAMELENGTH]; 74 char sidname[SIDNAMELENGTH];
75} __attribute__((packed)); 75} __attribute__((packed));
76 76
77#ifdef CONFIG_CIFS_EXPERIMENTAL
78
79extern int match_sid(struct cifs_sid *); 77extern int match_sid(struct cifs_sid *);
80extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); 78extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
81 79
82#endif /* CONFIG_CIFS_EXPERIMENTAL */
83
84#endif /* _CIFSACL_H */ 80#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 75c4eaa79588..8e21e0fe65d5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data,
116 return -ENOMEM; 116 return -ENOMEM;
117 117
118 spin_lock_init(&cifs_sb->tlink_tree_lock); 118 spin_lock_init(&cifs_sb->tlink_tree_lock);
119 INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); 119 cifs_sb->tlink_tree = RB_ROOT;
120 120
121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 121 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
122 if (rc) { 122 if (rc) {
@@ -283,10 +283,13 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
283 return 0; 283 return 0;
284} 284}
285 285
286static int cifs_permission(struct inode *inode, int mask) 286static int cifs_permission(struct inode *inode, int mask, unsigned int flags)
287{ 287{
288 struct cifs_sb_info *cifs_sb; 288 struct cifs_sb_info *cifs_sb;
289 289
290 if (flags & IPERM_FLAG_RCU)
291 return -ECHILD;
292
290 cifs_sb = CIFS_SB(inode->i_sb); 293 cifs_sb = CIFS_SB(inode->i_sb);
291 294
292 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { 295 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
@@ -298,7 +301,7 @@ static int cifs_permission(struct inode *inode, int mask)
298 on the client (above and beyond ACL on servers) for 301 on the client (above and beyond ACL on servers) for
299 servers which do not support setting and viewing mode bits, 302 servers which do not support setting and viewing mode bits,
300 so allowing client to check permissions is useful */ 303 so allowing client to check permissions is useful */
301 return generic_permission(inode, mask, NULL); 304 return generic_permission(inode, mask, flags, NULL);
302} 305}
303 306
304static struct kmem_cache *cifs_inode_cachep; 307static struct kmem_cache *cifs_inode_cachep;
@@ -321,8 +324,7 @@ cifs_alloc_inode(struct super_block *sb)
321 /* Until the file is open and we have gotten oplock 324 /* Until the file is open and we have gotten oplock
322 info back from the server, can not assume caching of 325 info back from the server, can not assume caching of
323 file data or metadata */ 326 file data or metadata */
324 cifs_inode->clientCanCacheRead = false; 327 cifs_set_oplock_level(cifs_inode, 0);
325 cifs_inode->clientCanCacheAll = false;
326 cifs_inode->delete_pending = false; 328 cifs_inode->delete_pending = false;
327 cifs_inode->invalid_mapping = false; 329 cifs_inode->invalid_mapping = false;
328 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 330 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
@@ -335,10 +337,17 @@ cifs_alloc_inode(struct super_block *sb)
335 return &cifs_inode->vfs_inode; 337 return &cifs_inode->vfs_inode;
336} 338}
337 339
340static void cifs_i_callback(struct rcu_head *head)
341{
342 struct inode *inode = container_of(head, struct inode, i_rcu);
343 INIT_LIST_HEAD(&inode->i_dentry);
344 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode));
345}
346
338static void 347static void
339cifs_destroy_inode(struct inode *inode) 348cifs_destroy_inode(struct inode *inode)
340{ 349{
341 kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); 350 call_rcu(&inode->i_rcu, cifs_i_callback);
342} 351}
343 352
344static void 353static void
@@ -459,9 +468,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
459 seq_printf(s, ",acl"); 468 seq_printf(s, ",acl");
460 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 469 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
461 seq_printf(s, ",mfsymlinks"); 470 seq_printf(s, ",mfsymlinks");
471 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
472 seq_printf(s, ",fsc");
462 473
463 seq_printf(s, ",rsize=%d", cifs_sb->rsize); 474 seq_printf(s, ",rsize=%d", cifs_sb->rsize);
464 seq_printf(s, ",wsize=%d", cifs_sb->wsize); 475 seq_printf(s, ",wsize=%d", cifs_sb->wsize);
476 /* convert actimeo and display it in seconds */
477 seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
465 478
466 return 0; 479 return 0;
467} 480}
@@ -934,7 +947,6 @@ init_cifs(void)
934 GlobalCurrentXid = 0; 947 GlobalCurrentXid = 0;
935 GlobalTotalActiveXid = 0; 948 GlobalTotalActiveXid = 0;
936 GlobalMaxActiveXid = 0; 949 GlobalMaxActiveXid = 0;
937 memset(Local_System_Name, 0, 15);
938 spin_lock_init(&cifs_tcp_ses_lock); 950 spin_lock_init(&cifs_tcp_ses_lock);
939 spin_lock_init(&cifs_file_list_lock); 951 spin_lock_init(&cifs_file_list_lock);
940 spin_lock_init(&GlobalMid_Lock); 952 spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f259e4d7612d..7136c0c3e2f9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -45,6 +45,16 @@
45#define CIFS_MIN_RCV_POOL 4 45#define CIFS_MIN_RCV_POOL 4
46 46
47/* 47/*
48 * default attribute cache timeout (jiffies)
49 */
50#define CIFS_DEF_ACTIMEO (1 * HZ)
51
52/*
53 * max attribute cache timeout (jiffies) - 2^30
54 */
55#define CIFS_MAX_ACTIMEO (1 << 30)
56
57/*
48 * MAX_REQ is the maximum number of requests that WE will send 58 * MAX_REQ is the maximum number of requests that WE will send
49 * on one socket concurrently. It also matches the most common 59 * on one socket concurrently. It also matches the most common
50 * value of max multiplex returned by servers. We may 60 * value of max multiplex returned by servers. We may
@@ -336,7 +346,8 @@ struct cifsTconInfo {
336 * "get" on the container. 346 * "get" on the container.
337 */ 347 */
338struct tcon_link { 348struct tcon_link {
339 unsigned long tl_index; 349 struct rb_node tl_rbnode;
350 uid_t tl_uid;
340 unsigned long tl_flags; 351 unsigned long tl_flags;
341#define TCON_LINK_MASTER 0 352#define TCON_LINK_MASTER 0
342#define TCON_LINK_PENDING 1 353#define TCON_LINK_PENDING 1
@@ -745,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
745GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ 756GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
746GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ 757GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
747 /* on midQ entries */ 758 /* on midQ entries */
748GLOBAL_EXTERN char Local_System_Name[15];
749
750/* 759/*
751 * Global counters, updated atomically 760 * Global counters, updated atomically
752 */ 761 */
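
The new CIFS_DEF_ACTIMEO and CIFS_MAX_ACTIMEO constants above are expressed in jiffies; the actimeo= mount option (parsed in the connect.c hunk further down) is given in seconds and multiplied by HZ, while cifs_show_options() divides by HZ to print seconds again. A small userspace sketch of that round trip, assuming an example HZ of 250 and using strtoul() in place of the kernel's simple_strtoul():

#include <stdio.h>
#include <stdlib.h>

#define HZ                      250UL           /* example value only */
#define CIFS_DEF_ACTIMEO        (1 * HZ)
#define CIFS_MAX_ACTIMEO        (1UL << 30)

int main(void)
{
        const char *value = "5";                /* as in mount -o actimeo=5 */
        unsigned long actimeo = CIFS_DEF_ACTIMEO;

        /* cifs_parse_mount_options(): seconds from the user become jiffies. */
        actimeo = HZ * strtoul(value, NULL, 0);
        if (actimeo > CIFS_MAX_ACTIMEO) {
                fprintf(stderr, "attribute cache timeout too large\n");
                return 1;
        }

        /* cifs_show_options(): convert back to seconds for display. */
        printf(",actimeo=%lu\n", actimeo / HZ);
        return 0;
}
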
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index edb6d90efdf2..e6d1481b16c1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -54,7 +54,8 @@ do { \
54 __func__, curr_xid, (int)rc); \ 54 __func__, curr_xid, (int)rc); \
55} while (0) 55} while (0)
56extern char *build_path_from_dentry(struct dentry *); 56extern char *build_path_from_dentry(struct dentry *);
57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); 57extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
58 struct cifsTconInfo *tcon);
58extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 59extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
59extern char *cifs_compose_mount_options(const char *sb_mountdata, 60extern char *cifs_compose_mount_options(const char *sb_mountdata,
60 const char *fullpath, const struct dfs_info3_param *ref, 61 const char *fullpath, const struct dfs_info3_param *ref,
@@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb,
79 struct TCP_Server_Info *); 80 struct TCP_Server_Info *);
80extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 81extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
81extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 82extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
82#ifdef CONFIG_CIFS_EXPERIMENTAL
83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 83extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
84#endif
85extern unsigned int smbCalcSize(struct smb_hdr *ptr); 84extern unsigned int smbCalcSize(struct smb_hdr *ptr);
86extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); 85extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
87extern int decode_negTokenInit(unsigned char *security_blob, int length, 86extern int decode_negTokenInit(unsigned char *security_blob, int length,
@@ -104,6 +103,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
104extern u64 cifs_UnixTimeToNT(struct timespec); 103extern u64 cifs_UnixTimeToNT(struct timespec);
105extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, 104extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
106 int offset); 105 int offset);
106extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
107 107
108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, 108extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle,
109 struct file *file, struct tcon_link *tlink, 109 struct file *file, struct tcon_link *tlink,
@@ -129,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp);
129extern int cifs_get_inode_info_unix(struct inode **pinode, 129extern int cifs_get_inode_info_unix(struct inode **pinode,
130 const unsigned char *search_path, 130 const unsigned char *search_path,
131 struct super_block *sb, int xid); 131 struct super_block *sb, int xid);
132extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, 132extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
133 struct cifs_fattr *fattr, struct inode *inode, 133 struct cifs_fattr *fattr, struct inode *inode,
134 const char *path, const __u16 *pfid); 134 const char *path, const __u16 *pfid);
135extern int mode_to_acl(struct inode *inode, const char *path, __u64); 135extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
136extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
137 const char *, u32 *);
136 138
137extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, 139extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
138 const char *); 140 const char *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f2632b6df5a..67acfb3acad2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2478,95 +2478,6 @@ querySymLinkRetry:
2478} 2478}
2479 2479
2480#ifdef CONFIG_CIFS_EXPERIMENTAL 2480#ifdef CONFIG_CIFS_EXPERIMENTAL
2481/* Initialize NT TRANSACT SMB into small smb request buffer.
2482 This assumes that all NT TRANSACTS that we init here have
2483 total parm and data under about 400 bytes (to fit in small cifs
2484 buffer size), which is the case so far, it easily fits. NB:
2485 Setup words themselves and ByteCount
2486 MaxSetupCount (size of returned setup area) and
2487 MaxParameterCount (returned parms size) must be set by caller */
2488static int
2489smb_init_nttransact(const __u16 sub_command, const int setup_count,
2490 const int parm_len, struct cifsTconInfo *tcon,
2491 void **ret_buf)
2492{
2493 int rc;
2494 __u32 temp_offset;
2495 struct smb_com_ntransact_req *pSMB;
2496
2497 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2498 (void **)&pSMB);
2499 if (rc)
2500 return rc;
2501 *ret_buf = (void *)pSMB;
2502 pSMB->Reserved = 0;
2503 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2504 pSMB->TotalDataCount = 0;
2505 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2506 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2507 pSMB->ParameterCount = pSMB->TotalParameterCount;
2508 pSMB->DataCount = pSMB->TotalDataCount;
2509 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
2510 (setup_count * 2) - 4 /* for rfc1001 length itself */;
2511 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
2512 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
2513 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
2514 pSMB->SubCommand = cpu_to_le16(sub_command);
2515 return 0;
2516}
2517
2518static int
2519validate_ntransact(char *buf, char **ppparm, char **ppdata,
2520 __u32 *pparmlen, __u32 *pdatalen)
2521{
2522 char *end_of_smb;
2523 __u32 data_count, data_offset, parm_count, parm_offset;
2524 struct smb_com_ntransact_rsp *pSMBr;
2525
2526 *pdatalen = 0;
2527 *pparmlen = 0;
2528
2529 if (buf == NULL)
2530 return -EINVAL;
2531
2532 pSMBr = (struct smb_com_ntransact_rsp *)buf;
2533
2534 /* ByteCount was converted from little endian in SendReceive */
2535 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
2536 (char *)&pSMBr->ByteCount;
2537
2538 data_offset = le32_to_cpu(pSMBr->DataOffset);
2539 data_count = le32_to_cpu(pSMBr->DataCount);
2540 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
2541 parm_count = le32_to_cpu(pSMBr->ParameterCount);
2542
2543 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
2544 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
2545
2546 /* should we also check that parm and data areas do not overlap? */
2547 if (*ppparm > end_of_smb) {
2548 cFYI(1, "parms start after end of smb");
2549 return -EINVAL;
2550 } else if (parm_count + *ppparm > end_of_smb) {
2551 cFYI(1, "parm end after end of smb");
2552 return -EINVAL;
2553 } else if (*ppdata > end_of_smb) {
2554 cFYI(1, "data starts after end of smb");
2555 return -EINVAL;
2556 } else if (data_count + *ppdata > end_of_smb) {
2557 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
2558 *ppdata, data_count, (data_count + *ppdata),
2559 end_of_smb, pSMBr);
2560 return -EINVAL;
2561 } else if (parm_count + data_count > pSMBr->ByteCount) {
2562 cFYI(1, "parm count and data count larger than SMB");
2563 return -EINVAL;
2564 }
2565 *pdatalen = data_count;
2566 *pparmlen = parm_count;
2567 return 0;
2568}
2569
2570int 2481int
2571CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, 2482CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
2572 const unsigned char *searchName, 2483 const unsigned char *searchName,
@@ -3056,7 +2967,97 @@ GetExtAttrOut:
3056 2967
3057#endif /* CONFIG_POSIX */ 2968#endif /* CONFIG_POSIX */
3058 2969
3059#ifdef CONFIG_CIFS_EXPERIMENTAL 2970#ifdef CONFIG_CIFS_ACL
2971/*
2972 * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that
2973 * all NT TRANSACTS that we init here have total parm and data under about 400
2974 * bytes (to fit in small cifs buffer size), which is the case so far, it
2975 * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of
2976 * returned setup area) and MaxParameterCount (returned parms size) must be set
2977 * by caller
2978 */
2979static int
2980smb_init_nttransact(const __u16 sub_command, const int setup_count,
2981 const int parm_len, struct cifsTconInfo *tcon,
2982 void **ret_buf)
2983{
2984 int rc;
2985 __u32 temp_offset;
2986 struct smb_com_ntransact_req *pSMB;
2987
2988 rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon,
2989 (void **)&pSMB);
2990 if (rc)
2991 return rc;
2992 *ret_buf = (void *)pSMB;
2993 pSMB->Reserved = 0;
2994 pSMB->TotalParameterCount = cpu_to_le32(parm_len);
2995 pSMB->TotalDataCount = 0;
2996 pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
2997 MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
2998 pSMB->ParameterCount = pSMB->TotalParameterCount;
2999 pSMB->DataCount = pSMB->TotalDataCount;
3000 temp_offset = offsetof(struct smb_com_ntransact_req, Parms) +
3001 (setup_count * 2) - 4 /* for rfc1001 length itself */;
3002 pSMB->ParameterOffset = cpu_to_le32(temp_offset);
3003 pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len);
3004 pSMB->SetupCount = setup_count; /* no need to le convert byte fields */
3005 pSMB->SubCommand = cpu_to_le16(sub_command);
3006 return 0;
3007}
3008
3009static int
3010validate_ntransact(char *buf, char **ppparm, char **ppdata,
3011 __u32 *pparmlen, __u32 *pdatalen)
3012{
3013 char *end_of_smb;
3014 __u32 data_count, data_offset, parm_count, parm_offset;
3015 struct smb_com_ntransact_rsp *pSMBr;
3016
3017 *pdatalen = 0;
3018 *pparmlen = 0;
3019
3020 if (buf == NULL)
3021 return -EINVAL;
3022
3023 pSMBr = (struct smb_com_ntransact_rsp *)buf;
3024
3025 /* ByteCount was converted from little endian in SendReceive */
3026 end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
3027 (char *)&pSMBr->ByteCount;
3028
3029 data_offset = le32_to_cpu(pSMBr->DataOffset);
3030 data_count = le32_to_cpu(pSMBr->DataCount);
3031 parm_offset = le32_to_cpu(pSMBr->ParameterOffset);
3032 parm_count = le32_to_cpu(pSMBr->ParameterCount);
3033
3034 *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset;
3035 *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset;
3036
3037 /* should we also check that parm and data areas do not overlap? */
3038 if (*ppparm > end_of_smb) {
3039 cFYI(1, "parms start after end of smb");
3040 return -EINVAL;
3041 } else if (parm_count + *ppparm > end_of_smb) {
3042 cFYI(1, "parm end after end of smb");
3043 return -EINVAL;
3044 } else if (*ppdata > end_of_smb) {
3045 cFYI(1, "data starts after end of smb");
3046 return -EINVAL;
3047 } else if (data_count + *ppdata > end_of_smb) {
3048 cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
3049 *ppdata, data_count, (data_count + *ppdata),
3050 end_of_smb, pSMBr);
3051 return -EINVAL;
3052 } else if (parm_count + data_count > pSMBr->ByteCount) {
3053 cFYI(1, "parm count and data count larger than SMB");
3054 return -EINVAL;
3055 }
3056 *pdatalen = data_count;
3057 *pparmlen = parm_count;
3058 return 0;
3059}
3060
3060/* Get Security Descriptor (by handle) from remote server for a file or dir */ 3061/* Get Security Descriptor (by handle) from remote server for a file or dir */
3061int 3062int
3062CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, 3063CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
@@ -3214,7 +3215,7 @@ setCifsAclRetry:
3214 return (rc); 3215 return (rc);
3215} 3216}
3216 3217
3217#endif /* CONFIG_CIFS_EXPERIMENTAL */ 3218#endif /* CONFIG_CIFS_ACL */
3218 3219
3219/* Legacy Query Path Information call for lookup to old servers such 3220/* Legacy Query Path Information call for lookup to old servers such
3220 as Win9x/WinME */ 3221 as Win9x/WinME */
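
validate_ntransact(), kept verbatim in the relocated CONFIG_CIFS_ACL block above, is essentially a set of bounds checks: the parameter and data regions advertised in the NT TRANSACT response must fall entirely inside the received SMB before either is dereferenced. A simplified userspace analogue of those checks, with an invented demo_rsp layout standing in for smb_com_ntransact_rsp:

#include <stdio.h>
#include <stdint.h>

/* Invented response layout; only the bookkeeping fields matter here. */
struct demo_rsp {
        uint32_t parm_offset, parm_count;
        uint32_t data_offset, data_count;
        uint32_t byte_count;
};

/* Return 0 if both regions fit inside the byte_count-sized payload. */
static int demo_validate(const struct demo_rsp *rsp)
{
        if (rsp->parm_offset > rsp->byte_count ||
            rsp->parm_count > rsp->byte_count - rsp->parm_offset)
                return -1;      /* parms start or end past the buffer */
        if (rsp->data_offset > rsp->byte_count ||
            rsp->data_count > rsp->byte_count - rsp->data_offset)
                return -1;      /* data starts or ends past the buffer */
        if ((uint64_t)rsp->parm_count + rsp->data_count > rsp->byte_count)
                return -1;      /* combined sizes larger than the SMB */
        return 0;
}

int main(void)
{
        struct demo_rsp rsp = {
                .parm_offset = 0,  .parm_count = 16,
                .data_offset = 16, .data_count = 32,
                .byte_count  = 64,
        };

        printf("valid: %s\n", demo_validate(&rsp) == 0 ? "yes" : "no");
        return 0;
}
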
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9eb327defa1d..cc1a8604a790 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -105,6 +105,7 @@ struct smb_vol {
105 unsigned int wsize; 105 unsigned int wsize;
106 bool sockopt_tcp_nodelay:1; 106 bool sockopt_tcp_nodelay:1;
107 unsigned short int port; 107 unsigned short int port;
108 unsigned long actimeo; /* attribute cache timeout (jiffies) */
108 char *prepath; 109 char *prepath;
109 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 110 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
110 struct nls_table *local_nls; 111 struct nls_table *local_nls;
@@ -116,6 +117,7 @@ struct smb_vol {
116 117
117static int ipv4_connect(struct TCP_Server_Info *server); 118static int ipv4_connect(struct TCP_Server_Info *server);
118static int ipv6_connect(struct TCP_Server_Info *server); 119static int ipv6_connect(struct TCP_Server_Info *server);
120static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
119static void cifs_prune_tlinks(struct work_struct *work); 121static void cifs_prune_tlinks(struct work_struct *work);
120 122
121/* 123/*
@@ -805,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname,
805 short int override_gid = -1; 807 short int override_gid = -1;
806 bool uid_specified = false; 808 bool uid_specified = false;
807 bool gid_specified = false; 809 bool gid_specified = false;
810 char *nodename = utsname()->nodename;
808 811
809 separator[0] = ','; 812 separator[0] = ',';
810 separator[1] = 0; 813 separator[1] = 0;
811 814
812 if (Local_System_Name[0] != 0) 815 /*
813 memcpy(vol->source_rfc1001_name, Local_System_Name, 15); 816 * does not have to be perfect mapping since field is
814 else { 817 * informational, only used for servers that do not support
815 char *nodename = utsname()->nodename; 818 * port 445 and it can be overridden at mount time
816 int n = strnlen(nodename, 15); 819 */
817 memset(vol->source_rfc1001_name, 0x20, 15); 820 memset(vol->source_rfc1001_name, 0x20, 15);
818 for (i = 0; i < n; i++) { 821 for (i = 0; i < strnlen(nodename, 15); i++)
819 /* does not have to be perfect mapping since field is 822 vol->source_rfc1001_name[i] = toupper(nodename[i]);
820 informational, only used for servers that do not support 823
821 port 445 and it can be overridden at mount time */
822 vol->source_rfc1001_name[i] = toupper(nodename[i]);
823 }
824 }
825 vol->source_rfc1001_name[15] = 0; 824 vol->source_rfc1001_name[15] = 0;
826 /* null target name indicates to use *SMBSERVR default called name 825 /* null target name indicates to use *SMBSERVR default called name
827 if we end up sending RFC1001 session initialize */ 826 if we end up sending RFC1001 session initialize */
@@ -839,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname,
839 /* default to using server inode numbers where available */ 838 /* default to using server inode numbers where available */
840 vol->server_ino = 1; 839 vol->server_ino = 1;
841 840
841 vol->actimeo = CIFS_DEF_ACTIMEO;
842
842 if (!options) 843 if (!options)
843 return 1; 844 return 1;
844 845
@@ -1213,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname,
1213 printk(KERN_WARNING "CIFS: server net" 1214 printk(KERN_WARNING "CIFS: server net"
1214 "biosname longer than 15 truncated.\n"); 1215 "biosname longer than 15 truncated.\n");
1215 } 1216 }
1217 } else if (strnicmp(data, "actimeo", 7) == 0) {
1218 if (value && *value) {
1219 vol->actimeo = HZ * simple_strtoul(value,
1220 &value, 0);
1221 if (vol->actimeo > CIFS_MAX_ACTIMEO) {
1222 cERROR(1, "CIFS: attribute cache"
1223 "timeout too large");
1224 return 1;
1225 }
1226 }
1216 } else if (strnicmp(data, "credentials", 4) == 0) { 1227 } else if (strnicmp(data, "credentials", 4) == 0) {
1217 /* ignore */ 1228 /* ignore */
1218 } else if (strnicmp(data, "version", 3) == 0) { 1229 } else if (strnicmp(data, "version", 3) == 0) {
@@ -1351,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1351 "supported. Instead set " 1362 "supported. Instead set "
1352 "/proc/fs/cifs/LookupCacheEnabled to 0\n"); 1363 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
1353 } else if (strnicmp(data, "fsc", 3) == 0) { 1364 } else if (strnicmp(data, "fsc", 3) == 0) {
1365#ifndef CONFIG_CIFS_FSCACHE
1366 cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
1367 "kernel config option set");
1368 return 1;
1369#endif
1354 vol->fsc = true; 1370 vol->fsc = true;
1355 } else if (strnicmp(data, "mfsymlinks", 10) == 0) { 1371 } else if (strnicmp(data, "mfsymlinks", 10) == 0) {
1356 vol->mfsymlinks = true; 1372 vol->mfsymlinks = true;
@@ -2565,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2565 cFYI(1, "file mode: 0x%x dir mode: 0x%x", 2581 cFYI(1, "file mode: 0x%x dir mode: 0x%x",
2566 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); 2582 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
2567 2583
2584 cifs_sb->actimeo = pvolume_info->actimeo;
2585
2568 if (pvolume_info->noperm) 2586 if (pvolume_info->noperm)
2569 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; 2587 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
2570 if (pvolume_info->setuids) 2588 if (pvolume_info->setuids)
@@ -2815,13 +2833,13 @@ remote_path_check:
2815 /* check if a whole path (including prepath) is not remote */ 2833 /* check if a whole path (including prepath) is not remote */
2816 if (!rc && cifs_sb->prepathlen && tcon) { 2834 if (!rc && cifs_sb->prepathlen && tcon) {
2817 /* build_path_to_root works only when we have a valid tcon */ 2835 /* build_path_to_root works only when we have a valid tcon */
2818 full_path = cifs_build_path_to_root(cifs_sb); 2836 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2819 if (full_path == NULL) { 2837 if (full_path == NULL) {
2820 rc = -ENOMEM; 2838 rc = -ENOMEM;
2821 goto mount_fail_check; 2839 goto mount_fail_check;
2822 } 2840 }
2823 rc = is_path_accessible(xid, tcon, cifs_sb, full_path); 2841 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2824 if (rc != -EREMOTE) { 2842 if (rc != 0 && rc != -EREMOTE) {
2825 kfree(full_path); 2843 kfree(full_path);
2826 goto mount_fail_check; 2844 goto mount_fail_check;
2827 } 2845 }
@@ -2900,24 +2918,16 @@ remote_path_check:
2900 goto mount_fail_check; 2918 goto mount_fail_check;
2901 } 2919 }
2902 2920
2903 tlink->tl_index = pSesInfo->linux_uid; 2921 tlink->tl_uid = pSesInfo->linux_uid;
2904 tlink->tl_tcon = tcon; 2922 tlink->tl_tcon = tcon;
2905 tlink->tl_time = jiffies; 2923 tlink->tl_time = jiffies;
2906 set_bit(TCON_LINK_MASTER, &tlink->tl_flags); 2924 set_bit(TCON_LINK_MASTER, &tlink->tl_flags);
2907 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); 2925 set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
2908 2926
2909 rc = radix_tree_preload(GFP_KERNEL); 2927 cifs_sb->master_tlink = tlink;
2910 if (rc == -ENOMEM) {
2911 kfree(tlink);
2912 goto mount_fail_check;
2913 }
2914
2915 spin_lock(&cifs_sb->tlink_tree_lock); 2928 spin_lock(&cifs_sb->tlink_tree_lock);
2916 radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); 2929 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
2917 radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid,
2918 CIFS_TLINK_MASTER_TAG);
2919 spin_unlock(&cifs_sb->tlink_tree_lock); 2930 spin_unlock(&cifs_sb->tlink_tree_lock);
2920 radix_tree_preload_end();
2921 2931
2922 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 2932 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
2923 TLINK_IDLE_EXPIRE); 2933 TLINK_IDLE_EXPIRE);
@@ -3107,32 +3117,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3107int 3117int
3108cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) 3118cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
3109{ 3119{
3110 int i, ret; 3120 struct rb_root *root = &cifs_sb->tlink_tree;
3121 struct rb_node *node;
3122 struct tcon_link *tlink;
3111 char *tmp; 3123 char *tmp;
3112 struct tcon_link *tlink[8];
3113 unsigned long index = 0;
3114 3124
3115 cancel_delayed_work_sync(&cifs_sb->prune_tlinks); 3125 cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
3116 3126
3117 do { 3127 spin_lock(&cifs_sb->tlink_tree_lock);
3118 spin_lock(&cifs_sb->tlink_tree_lock); 3128 while ((node = rb_first(root))) {
3119 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3129 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3120 (void **)tlink, index, 3130 cifs_get_tlink(tlink);
3121 ARRAY_SIZE(tlink)); 3131 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3122 /* increment index for next pass */ 3132 rb_erase(node, root);
3123 if (ret > 0)
3124 index = tlink[ret - 1]->tl_index + 1;
3125 for (i = 0; i < ret; i++) {
3126 cifs_get_tlink(tlink[i]);
3127 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags);
3128 radix_tree_delete(&cifs_sb->tlink_tree,
3129 tlink[i]->tl_index);
3130 }
3131 spin_unlock(&cifs_sb->tlink_tree_lock);
3132 3133
3133 for (i = 0; i < ret; i++) 3134 spin_unlock(&cifs_sb->tlink_tree_lock);
3134 cifs_put_tlink(tlink[i]); 3135 cifs_put_tlink(tlink);
3135 } while (ret != 0); 3136 spin_lock(&cifs_sb->tlink_tree_lock);
3137 }
3138 spin_unlock(&cifs_sb->tlink_tree_lock);
3136 3139
3137 tmp = cifs_sb->prepath; 3140 tmp = cifs_sb->prepath;
3138 cifs_sb->prepathlen = 0; 3141 cifs_sb->prepathlen = 0;
@@ -3271,22 +3274,10 @@ out:
3271 return tcon; 3274 return tcon;
3272} 3275}
3273 3276
3274static struct tcon_link * 3277static inline struct tcon_link *
3275cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) 3278cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb)
3276{ 3279{
3277 struct tcon_link *tlink; 3280 return cifs_sb->master_tlink;
3278 unsigned int ret;
3279
3280 spin_lock(&cifs_sb->tlink_tree_lock);
3281 ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink,
3282 0, 1, CIFS_TLINK_MASTER_TAG);
3283 spin_unlock(&cifs_sb->tlink_tree_lock);
3284
3285 /* the master tcon should always be present */
3286 if (ret == 0)
3287 BUG();
3288
3289 return tlink;
3290} 3281}
3291 3282
3292struct cifsTconInfo * 3283struct cifsTconInfo *
@@ -3302,6 +3293,47 @@ cifs_sb_tcon_pending_wait(void *unused)
3302 return signal_pending(current) ? -ERESTARTSYS : 0; 3293 return signal_pending(current) ? -ERESTARTSYS : 0;
3303} 3294}
3304 3295
3296/* find and return a tlink with given uid */
3297static struct tcon_link *
3298tlink_rb_search(struct rb_root *root, uid_t uid)
3299{
3300 struct rb_node *node = root->rb_node;
3301 struct tcon_link *tlink;
3302
3303 while (node) {
3304 tlink = rb_entry(node, struct tcon_link, tl_rbnode);
3305
3306 if (tlink->tl_uid > uid)
3307 node = node->rb_left;
3308 else if (tlink->tl_uid < uid)
3309 node = node->rb_right;
3310 else
3311 return tlink;
3312 }
3313 return NULL;
3314}
3315
3316/* insert a tcon_link into the tree */
3317static void
3318tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
3319{
3320 struct rb_node **new = &(root->rb_node), *parent = NULL;
3321 struct tcon_link *tlink;
3322
3323 while (*new) {
3324 tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
3325 parent = *new;
3326
3327 if (tlink->tl_uid > new_tlink->tl_uid)
3328 new = &((*new)->rb_left);
3329 else
3330 new = &((*new)->rb_right);
3331 }
3332
3333 rb_link_node(&new_tlink->tl_rbnode, parent, new);
3334 rb_insert_color(&new_tlink->tl_rbnode, root);
3335}
3336
3305/* 3337/*
3306 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the 3338 * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the
3307 * current task. 3339 * current task.
@@ -3309,7 +3341,7 @@ cifs_sb_tcon_pending_wait(void *unused)
3309 * If the superblock doesn't refer to a multiuser mount, then just return 3341 * If the superblock doesn't refer to a multiuser mount, then just return
3310 * the master tcon for the mount. 3342 * the master tcon for the mount.
3311 * 3343 *
3312 * First, search the radix tree for an existing tcon for this fsuid. If one 3344 * First, search the rbtree for an existing tcon for this fsuid. If one
3313 * exists, then check to see if it's pending construction. If it is then wait 3345 * exists, then check to see if it's pending construction. If it is then wait
3314 * for construction to complete. Once it's no longer pending, check to see if 3346 * for construction to complete. Once it's no longer pending, check to see if
3315 * it failed and either return an error or retry construction, depending on 3347 * it failed and either return an error or retry construction, depending on
@@ -3322,14 +3354,14 @@ struct tcon_link *
3322cifs_sb_tlink(struct cifs_sb_info *cifs_sb) 3354cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3323{ 3355{
3324 int ret; 3356 int ret;
3325 unsigned long fsuid = (unsigned long) current_fsuid(); 3357 uid_t fsuid = current_fsuid();
3326 struct tcon_link *tlink, *newtlink; 3358 struct tcon_link *tlink, *newtlink;
3327 3359
3328 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 3360 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
3329 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 3361 return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
3330 3362
3331 spin_lock(&cifs_sb->tlink_tree_lock); 3363 spin_lock(&cifs_sb->tlink_tree_lock);
3332 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3364 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3333 if (tlink) 3365 if (tlink)
3334 cifs_get_tlink(tlink); 3366 cifs_get_tlink(tlink);
3335 spin_unlock(&cifs_sb->tlink_tree_lock); 3367 spin_unlock(&cifs_sb->tlink_tree_lock);
@@ -3338,36 +3370,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
3338 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); 3370 newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL);
3339 if (newtlink == NULL) 3371 if (newtlink == NULL)
3340 return ERR_PTR(-ENOMEM); 3372 return ERR_PTR(-ENOMEM);
3341 newtlink->tl_index = fsuid; 3373 newtlink->tl_uid = fsuid;
3342 newtlink->tl_tcon = ERR_PTR(-EACCES); 3374 newtlink->tl_tcon = ERR_PTR(-EACCES);
3343 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); 3375 set_bit(TCON_LINK_PENDING, &newtlink->tl_flags);
3344 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); 3376 set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags);
3345 cifs_get_tlink(newtlink); 3377 cifs_get_tlink(newtlink);
3346 3378
3347 ret = radix_tree_preload(GFP_KERNEL);
3348 if (ret != 0) {
3349 kfree(newtlink);
3350 return ERR_PTR(ret);
3351 }
3352
3353 spin_lock(&cifs_sb->tlink_tree_lock); 3379 spin_lock(&cifs_sb->tlink_tree_lock);
3354 /* was one inserted after previous search? */ 3380 /* was one inserted after previous search? */
3355 tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); 3381 tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid);
3356 if (tlink) { 3382 if (tlink) {
3357 cifs_get_tlink(tlink); 3383 cifs_get_tlink(tlink);
3358 spin_unlock(&cifs_sb->tlink_tree_lock); 3384 spin_unlock(&cifs_sb->tlink_tree_lock);
3359 radix_tree_preload_end();
3360 kfree(newtlink); 3385 kfree(newtlink);
3361 goto wait_for_construction; 3386 goto wait_for_construction;
3362 } 3387 }
3363 ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink);
3364 spin_unlock(&cifs_sb->tlink_tree_lock);
3365 radix_tree_preload_end();
3366 if (ret) {
3367 kfree(newtlink);
3368 return ERR_PTR(ret);
3369 }
3370 tlink = newtlink; 3388 tlink = newtlink;
3389 tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
3390 spin_unlock(&cifs_sb->tlink_tree_lock);
3371 } else { 3391 } else {
3372wait_for_construction: 3392wait_for_construction:
3373 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, 3393 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
@@ -3413,39 +3433,39 @@ cifs_prune_tlinks(struct work_struct *work)
3413{ 3433{
3414 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, 3434 struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info,
3415 prune_tlinks.work); 3435 prune_tlinks.work);
3416 struct tcon_link *tlink[8]; 3436 struct rb_root *root = &cifs_sb->tlink_tree;
3417 unsigned long now = jiffies; 3437 struct rb_node *node = rb_first(root);
3418 unsigned long index = 0; 3438 struct rb_node *tmp;
3419 int i, ret; 3439 struct tcon_link *tlink;
3420 3440
3421 do { 3441 /*
3422 spin_lock(&cifs_sb->tlink_tree_lock); 3442 * Because we drop the spinlock in the loop in order to put the tlink
3423 ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, 3443 * it's not guarded against removal of links from the tree. The only
3424 (void **)tlink, index, 3444 * places that remove entries from the tree are this function and
3425 ARRAY_SIZE(tlink)); 3445 * umounts. Because this function is non-reentrant and is canceled
3426 /* increment index for next pass */ 3446 * before umount can proceed, this is safe.
3427 if (ret > 0) 3447 */
3428 index = tlink[ret - 1]->tl_index + 1; 3448 spin_lock(&cifs_sb->tlink_tree_lock);
3429 for (i = 0; i < ret; i++) { 3449 node = rb_first(root);
3430 if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || 3450 while (node != NULL) {
3431 atomic_read(&tlink[i]->tl_count) != 0 || 3451 tmp = node;
3432 time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, 3452 node = rb_next(tmp);
3433 now)) { 3453 tlink = rb_entry(tmp, struct tcon_link, tl_rbnode);
3434 tlink[i] = NULL; 3454
3435 continue; 3455 if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) ||
3436 } 3456 atomic_read(&tlink->tl_count) != 0 ||
3437 cifs_get_tlink(tlink[i]); 3457 time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies))
3438 clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); 3458 continue;
3439 radix_tree_delete(&cifs_sb->tlink_tree,
3440 tlink[i]->tl_index);
3441 }
3442 spin_unlock(&cifs_sb->tlink_tree_lock);
3443 3459
3444 for (i = 0; i < ret; i++) { 3460 cifs_get_tlink(tlink);
3445 if (tlink[i] != NULL) 3461 clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags);
3446 cifs_put_tlink(tlink[i]); 3462 rb_erase(tmp, root);
3447 } 3463
3448 } while (ret != 0); 3464 spin_unlock(&cifs_sb->tlink_tree_lock);
3465 cifs_put_tlink(tlink);
3466 spin_lock(&cifs_sb->tlink_tree_lock);
3467 }
3468 spin_unlock(&cifs_sb->tlink_tree_lock);
3449 3469
3450 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, 3470 queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks,
3451 TLINK_IDLE_EXPIRE); 3471 TLINK_IDLE_EXPIRE);
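
The connect.c changes above replace the radix tree of tcon_links (keyed by tl_index) with an rbtree keyed on the owning uid, which removes the radix_tree_preload() error paths. Below is a simplified userspace analogue of tlink_rb_search()/tlink_rb_insert() using an unbalanced binary search tree; the kernel version additionally rebalances through rb_link_node()/rb_insert_color() and holds tlink_tree_lock around both operations.

#include <stdio.h>
#include <sys/types.h>

/* Stand-in for struct tcon_link; only the search key is kept. */
struct demo_tlink {
        uid_t uid;
        struct demo_tlink *left, *right;
};

/* Mirrors the descent in tlink_rb_search(). */
static struct demo_tlink *demo_search(struct demo_tlink *node, uid_t uid)
{
        while (node) {
                if (node->uid > uid)
                        node = node->left;
                else if (node->uid < uid)
                        node = node->right;
                else
                        return node;    /* found the tlink for this uid */
        }
        return NULL;
}

/* Mirrors the descent in tlink_rb_insert(), minus the rebalancing. */
static void demo_insert(struct demo_tlink **root, struct demo_tlink *new)
{
        struct demo_tlink **link = root;

        while (*link) {
                if ((*link)->uid > new->uid)
                        link = &(*link)->left;
                else
                        link = &(*link)->right;
        }
        *link = new;
}

int main(void)
{
        struct demo_tlink a = { .uid = 1000 }, b = { .uid = 0 };
        struct demo_tlink *root = NULL;

        demo_insert(&root, &a);
        demo_insert(&root, &b);
        printf("uid 1000 %sfound\n", demo_search(root, 1000) ? "" : "not ");
        return 0;
}
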
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3840eddbfb7a..db2a58c00f7b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -135,9 +135,9 @@ static void setup_cifs_dentry(struct cifsTconInfo *tcon,
135 struct inode *newinode) 135 struct inode *newinode)
136{ 136{
137 if (tcon->nocase) 137 if (tcon->nocase)
138 direntry->d_op = &cifs_ci_dentry_ops; 138 d_set_d_op(direntry, &cifs_ci_dentry_ops);
139 else 139 else
140 direntry->d_op = &cifs_dentry_ops; 140 d_set_d_op(direntry, &cifs_dentry_ops);
141 d_instantiate(direntry, newinode); 141 d_instantiate(direntry, newinode);
142} 142}
143 143
@@ -421,9 +421,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
421 rc = cifs_get_inode_info_unix(&newinode, full_path, 421 rc = cifs_get_inode_info_unix(&newinode, full_path,
422 inode->i_sb, xid); 422 inode->i_sb, xid);
423 if (pTcon->nocase) 423 if (pTcon->nocase)
424 direntry->d_op = &cifs_ci_dentry_ops; 424 d_set_d_op(direntry, &cifs_ci_dentry_ops);
425 else 425 else
426 direntry->d_op = &cifs_dentry_ops; 426 d_set_d_op(direntry, &cifs_dentry_ops);
427 427
428 if (rc == 0) 428 if (rc == 0)
429 d_instantiate(direntry, newinode); 429 d_instantiate(direntry, newinode);
@@ -604,9 +604,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
604 604
605 if ((rc == 0) && (newInode != NULL)) { 605 if ((rc == 0) && (newInode != NULL)) {
606 if (pTcon->nocase) 606 if (pTcon->nocase)
607 direntry->d_op = &cifs_ci_dentry_ops; 607 d_set_d_op(direntry, &cifs_ci_dentry_ops);
608 else 608 else
609 direntry->d_op = &cifs_dentry_ops; 609 d_set_d_op(direntry, &cifs_dentry_ops);
610 d_add(direntry, newInode); 610 d_add(direntry, newInode);
611 if (posix_open) { 611 if (posix_open) {
612 filp = lookup_instantiate_filp(nd, direntry, 612 filp = lookup_instantiate_filp(nd, direntry,
@@ -634,9 +634,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
634 rc = 0; 634 rc = 0;
635 direntry->d_time = jiffies; 635 direntry->d_time = jiffies;
636 if (pTcon->nocase) 636 if (pTcon->nocase)
637 direntry->d_op = &cifs_ci_dentry_ops; 637 d_set_d_op(direntry, &cifs_ci_dentry_ops);
638 else 638 else
639 direntry->d_op = &cifs_dentry_ops; 639 d_set_d_op(direntry, &cifs_dentry_ops);
640 d_add(direntry, NULL); 640 d_add(direntry, NULL);
641 /* if it was once a directory (but how can we tell?) we could do 641 /* if it was once a directory (but how can we tell?) we could do
642 shrink_dcache_parent(direntry); */ 642 shrink_dcache_parent(direntry); */
@@ -656,22 +656,37 @@ lookup_out:
656static int 656static int
657cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) 657cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
658{ 658{
659 int isValid = 1; 659 if (nd->flags & LOOKUP_RCU)
660 return -ECHILD;
660 661
661 if (direntry->d_inode) { 662 if (direntry->d_inode) {
662 if (cifs_revalidate_dentry(direntry)) 663 if (cifs_revalidate_dentry(direntry))
663 return 0; 664 return 0;
664 } else { 665 else
665 cFYI(1, "neg dentry 0x%p name = %s", 666 return 1;
666 direntry, direntry->d_name.name);
667 if (time_after(jiffies, direntry->d_time + HZ) ||
668 !lookupCacheEnabled) {
669 d_drop(direntry);
670 isValid = 0;
671 }
672 } 667 }
673 668
674 return isValid; 669 /*
670 * This may be nfsd (or something), anyway, we can't see the
671 * intent of this. So, since this can be for creation, drop it.
672 */
673 if (!nd)
674 return 0;
675
676 /*
677 * Drop the negative dentry, in order to make sure to use the
678 * case sensitive name which is specified by user if this is
679 * for creation.
680 */
681 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
682 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
683 return 0;
684 }
685
686 if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled)
687 return 0;
688
689 return 1;
675} 690}
676 691
677/* static int cifs_d_delete(struct dentry *direntry) 692/* static int cifs_d_delete(struct dentry *direntry)
@@ -688,9 +703,10 @@ const struct dentry_operations cifs_dentry_ops = {
688/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 703/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
689}; 704};
690 705
691static int cifs_ci_hash(struct dentry *dentry, struct qstr *q) 706static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
707 struct qstr *q)
692{ 708{
693 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 709 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
694 unsigned long hash; 710 unsigned long hash;
695 int i; 711 int i;
696 712
@@ -703,21 +719,16 @@ static int cifs_ci_hash(struct dentry *dentry, struct qstr *q)
703 return 0; 719 return 0;
704} 720}
705 721
706static int cifs_ci_compare(struct dentry *dentry, struct qstr *a, 722static int cifs_ci_compare(const struct dentry *parent,
707 struct qstr *b) 723 const struct inode *pinode,
724 const struct dentry *dentry, const struct inode *inode,
725 unsigned int len, const char *str, const struct qstr *name)
708{ 726{
709 struct nls_table *codepage = CIFS_SB(dentry->d_inode->i_sb)->local_nls; 727 struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
710 728
711 if ((a->len == b->len) && 729 if ((name->len == len) &&
712 (nls_strnicmp(codepage, a->name, b->name, a->len) == 0)) { 730 (nls_strnicmp(codepage, name->name, str, len) == 0))
713 /*
714 * To preserve case, don't let an existing negative dentry's
715 * case take precedence. If a is not a negative dentry, this
716 * should have no side effects
717 */
718 memcpy((void *)a->name, b->name, a->len);
719 return 0; 731 return 0;
720 }
721 return 1; 732 return 1;
722} 733}
723 734
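
The dir.c hunks above rework cifs_d_revalidate() for RCU-walk (returning -ECHILD under LOOKUP_RCU) and adapt cifs_ci_hash()/cifs_ci_compare() to the new dentry_operations prototypes, where the candidate name arrives as a (len, str) pair next to the qstr. A rough userspace analogue of the case-insensitive comparison follows; the kernel compares through nls_strnicmp() with the mount's codepage, for which plain tolower() stands in here.

#include <stdio.h>
#include <ctype.h>

/* Stand-in for struct qstr. */
struct demo_qstr {
        const char *name;
        unsigned int len;
};

/* Return 0 on match, 1 on mismatch, as the d_compare contract expects. */
static int demo_ci_compare(unsigned int len, const char *str,
                           const struct demo_qstr *name)
{
        unsigned int i;

        if (name->len != len)
                return 1;
        for (i = 0; i < len; i++)
                if (tolower((unsigned char)str[i]) !=
                    tolower((unsigned char)name->name[i]))
                        return 1;
        return 0;
}

int main(void)
{
        struct demo_qstr q = { .name = "ReadMe.TXT", .len = 10 };

        printf("%d\n", demo_ci_compare(10, "readme.txt", &q));  /* 0: match */
        printf("%d\n", demo_ci_compare(9, "readme.tx", &q));    /* 1: length */
        return 0;
}
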
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0eb87026cad3..548f06230a6d 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
66 /* Search for server name delimiter */ 66 /* Search for server name delimiter */
67 sep = memchr(hostname, '\\', len); 67 sep = memchr(hostname, '\\', len);
68 if (sep) 68 if (sep)
69 len = sep - unc; 69 len = sep - hostname;
70 else 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s", 71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc); 72 __func__, unc);
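
The one-line dns_resolve.c fix above matters because the hostname pointer has already been advanced past the leading "\\" of the UNC path; measuring the hostname length from the start of the UNC string instead makes it two bytes too long. A small standalone illustration:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *unc = "\\\\server\\share";
        const char *hostname = unc + 2;                 /* skip leading "\\" */
        size_t len = strlen(hostname);
        const char *sep = memchr(hostname, '\\', len);  /* share delimiter */

        if (sep) {
                printf("old:   %zu bytes\n", (size_t)(sep - unc));      /* 8 */
                printf("fixed: %zu bytes\n", (size_t)(sep - hostname)); /* 6, "server" */
        }
        return 0;
}
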
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ae82159cf7fa..5a28660ca2b5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -146,12 +146,7 @@ client_can_cache:
146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, 146 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
147 xid, NULL); 147 xid, NULL);
148 148
149 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 149 cifs_set_oplock_level(pCifsInode, oplock);
150 pCifsInode->clientCanCacheAll = true;
151 pCifsInode->clientCanCacheRead = true;
152 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
153 } else if ((oplock & 0xF) == OPLOCK_READ)
154 pCifsInode->clientCanCacheRead = true;
155 150
156 return rc; 151 return rc;
157} 152}
@@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
253 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); 248 list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList);
254 spin_unlock(&cifs_file_list_lock); 249 spin_unlock(&cifs_file_list_lock);
255 250
256 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 251 cifs_set_oplock_level(pCifsInode, oplock);
257 pCifsInode->clientCanCacheAll = true;
258 pCifsInode->clientCanCacheRead = true;
259 cFYI(1, "Exclusive Oplock inode %p", inode);
260 } else if ((oplock & 0xF) == OPLOCK_READ)
261 pCifsInode->clientCanCacheRead = true;
262 252
263 file->private_data = pCifsFile; 253 file->private_data = pCifsFile;
264 return pCifsFile; 254 return pCifsFile;
@@ -271,8 +261,9 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file,
271 */ 261 */
272void cifsFileInfo_put(struct cifsFileInfo *cifs_file) 262void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
273{ 263{
264 struct inode *inode = cifs_file->dentry->d_inode;
274 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); 265 struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink);
275 struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); 266 struct cifsInodeInfo *cifsi = CIFS_I(inode);
276 struct cifsLockInfo *li, *tmp; 267 struct cifsLockInfo *li, *tmp;
277 268
278 spin_lock(&cifs_file_list_lock); 269 spin_lock(&cifs_file_list_lock);
@@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
288 if (list_empty(&cifsi->openFileList)) { 279 if (list_empty(&cifsi->openFileList)) {
289 cFYI(1, "closing last open instance for inode %p", 280 cFYI(1, "closing last open instance for inode %p",
290 cifs_file->dentry->d_inode); 281 cifs_file->dentry->d_inode);
291 cifsi->clientCanCacheRead = false; 282 cifs_set_oplock_level(cifsi, 0);
292 cifsi->clientCanCacheAll = false;
293 } 283 }
294 spin_unlock(&cifs_file_list_lock); 284 spin_unlock(&cifs_file_list_lock);
295 285
@@ -607,8 +597,6 @@ reopen_success:
607 rc = filemap_write_and_wait(inode->i_mapping); 597 rc = filemap_write_and_wait(inode->i_mapping);
608 mapping_set_error(inode->i_mapping, rc); 598 mapping_set_error(inode->i_mapping, rc);
609 599
610 pCifsInode->clientCanCacheAll = false;
611 pCifsInode->clientCanCacheRead = false;
612 if (tcon->unix_ext) 600 if (tcon->unix_ext)
613 rc = cifs_get_inode_info_unix(&inode, 601 rc = cifs_get_inode_info_unix(&inode,
614 full_path, inode->i_sb, xid); 602 full_path, inode->i_sb, xid);
@@ -622,18 +610,9 @@ reopen_success:
622 invalidate the current end of file on the server 610 invalidate the current end of file on the server
623 we can not go to the server to get the new inod 611 we can not go to the server to get the new inod
624 info */ 612 info */
625 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 613
626 pCifsInode->clientCanCacheAll = true; 614 cifs_set_oplock_level(pCifsInode, oplock);
627 pCifsInode->clientCanCacheRead = true; 615
628 cFYI(1, "Exclusive Oplock granted on inode %p",
629 pCifsFile->dentry->d_inode);
630 } else if ((oplock & 0xF) == OPLOCK_READ) {
631 pCifsInode->clientCanCacheRead = true;
632 pCifsInode->clientCanCacheAll = false;
633 } else {
634 pCifsInode->clientCanCacheRead = false;
635 pCifsInode->clientCanCacheAll = false;
636 }
637 cifs_relock_file(pCifsFile); 616 cifs_relock_file(pCifsFile);
638 617
639reopen_error_exit: 618reopen_error_exit:
@@ -775,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
775 754
776 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 755 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
777 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); 756 tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink);
778
779 if (file->private_data == NULL) {
780 rc = -EBADF;
781 FreeXid(xid);
782 return rc;
783 }
784 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 757 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
785 758
786 if ((tcon->ses->capabilities & CAP_UNIX) && 759 if ((tcon->ses->capabilities & CAP_UNIX) &&
@@ -956,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
956ssize_t cifs_user_write(struct file *file, const char __user *write_data, 929ssize_t cifs_user_write(struct file *file, const char __user *write_data,
957 size_t write_size, loff_t *poffset) 930 size_t write_size, loff_t *poffset)
958{ 931{
932 struct inode *inode = file->f_path.dentry->d_inode;
959 int rc = 0; 933 int rc = 0;
960 unsigned int bytes_written = 0; 934 unsigned int bytes_written = 0;
961 unsigned int total_written; 935 unsigned int total_written;
@@ -963,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
963 struct cifsTconInfo *pTcon; 937 struct cifsTconInfo *pTcon;
964 int xid, long_op; 938 int xid, long_op;
965 struct cifsFileInfo *open_file; 939 struct cifsFileInfo *open_file;
966 struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); 940 struct cifsInodeInfo *cifsi = CIFS_I(inode);
967 941
968 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 942 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
969 943
@@ -1029,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
1029 1003
1030 cifs_stats_bytes_written(pTcon, total_written); 1004 cifs_stats_bytes_written(pTcon, total_written);
1031 1005
1032 /* since the write may have blocked check these pointers again */
1033 if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) {
1034 struct inode *inode = file->f_path.dentry->d_inode;
1035/* Do not update local mtime - server will set its actual value on write 1006/* Do not update local mtime - server will set its actual value on write
1036 * inode->i_ctime = inode->i_mtime = 1007 * inode->i_ctime = inode->i_mtime =
1037 * current_fs_time(inode->i_sb);*/ 1008 * current_fs_time(inode->i_sb);*/
1038 if (total_written > 0) { 1009 if (total_written > 0) {
1039 spin_lock(&inode->i_lock); 1010 spin_lock(&inode->i_lock);
1040 if (*poffset > file->f_path.dentry->d_inode->i_size) 1011 if (*poffset > inode->i_size)
1041 i_size_write(file->f_path.dentry->d_inode, 1012 i_size_write(inode, *poffset);
1042 *poffset); 1013 spin_unlock(&inode->i_lock);
1043 spin_unlock(&inode->i_lock);
1044 }
1045 mark_inode_dirty_sync(file->f_path.dentry->d_inode);
1046 } 1014 }
1015 mark_inode_dirty_sync(inode);
1016
1047 FreeXid(xid); 1017 FreeXid(xid);
1048 return total_written; 1018 return total_written;
1049} 1019}
@@ -1138,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
1138 return total_written; 1108 return total_written;
1139} 1109}
1140 1110
1141#ifdef CONFIG_CIFS_EXPERIMENTAL
1142struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, 1111struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1143 bool fsuid_only) 1112 bool fsuid_only)
1144{ 1113{
@@ -1172,13 +1141,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
1172 spin_unlock(&cifs_file_list_lock); 1141 spin_unlock(&cifs_file_list_lock);
1173 return NULL; 1142 return NULL;
1174} 1143}
1175#endif
1176 1144
1177struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, 1145struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1178 bool fsuid_only) 1146 bool fsuid_only)
1179{ 1147{
1180 struct cifsFileInfo *open_file; 1148 struct cifsFileInfo *open_file;
1181 struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); 1149 struct cifs_sb_info *cifs_sb;
1182 bool any_available = false; 1150 bool any_available = false;
1183 int rc; 1151 int rc;
1184 1152
@@ -1192,6 +1160,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
1192 return NULL; 1160 return NULL;
1193 } 1161 }
1194 1162
1163 cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
1164
1195 /* only filter by fsuid on multiuser mounts */ 1165 /* only filter by fsuid on multiuser mounts */
1196 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) 1166 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
1197 fsuid_only = false; 1167 fsuid_only = false;
@@ -2299,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2299 2269
2300void cifs_oplock_break_put(struct cifsFileInfo *cfile) 2270void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2301{ 2271{
2272 struct super_block *sb = cfile->dentry->d_sb;
2273
2302 cifsFileInfo_put(cfile); 2274 cifsFileInfo_put(cfile);
2303 cifs_sb_deactive(cfile->dentry->d_sb); 2275 cifs_sb_deactive(sb);
2304} 2276}
2305 2277
2306const struct address_space_operations cifs_addr_ops = { 2278const struct address_space_operations cifs_addr_ops = {
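
Among the file.c changes above, cifs_oplock_break_put() now caches cfile->dentry->d_sb in a local variable before calling cifsFileInfo_put(), because the put may drop the last reference and free the structure, making the old post-put dereference a potential use-after-free. A minimal sketch of that capture-before-release ordering, with invented demo_* types and the refcount reduced to a direct free():

#include <stdio.h>
#include <stdlib.h>

/* Invented minimal types; the real code deals with cifsFileInfo and d_sb. */
struct demo_sb   { int id; };
struct demo_file { struct demo_sb *sb; };

static void demo_put(struct demo_file *f)
{
        free(f);        /* last reference gone: f must not be touched again */
}

static void demo_break_put(struct demo_file *f)
{
        struct demo_sb *sb = f->sb;     /* capture before dropping the ref */

        demo_put(f);
        printf("deactivating sb %d\n", sb->id); /* safe: uses the local copy */
}

int main(void)
{
        struct demo_sb sb = { .id = 1 };
        struct demo_file *f = malloc(sizeof(*f));

        if (!f)
                return 1;
        f->sb = &sb;
        demo_break_put(f);
        return 0;
}
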
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a2ad94efcfe6..297a43d0ff7f 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -2,7 +2,7 @@
2 * fs/cifs/fscache.c - CIFS filesystem cache interface 2 * fs/cifs/fscache.c - CIFS filesystem cache interface
3 * 3 *
4 * Copyright (c) 2010 Novell, Inc. 4 * Copyright (c) 2010 Novell, Inc.
5 * Author(s): Suresh Jayaraman (sjayaraman@suse.de> 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published 8 * it under the terms of the GNU Lesser General Public License as published
@@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
67 if (cifsi->fscache) 67 if (cifsi->fscache)
68 return; 68 return;
69 69
70 cifsi->fscache = fscache_acquire_cookie(tcon->fscache, 70 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
71 cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
71 &cifs_fscache_inode_object_def, cifsi); 72 &cifs_fscache_inode_object_def, cifsi);
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, 73 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache,
73 cifsi->fscache); 74 cifsi->fscache);
75 }
74} 76}
75 77
76void cifs_fscache_release_inode_cookie(struct inode *inode) 78void cifs_fscache_release_inode_cookie(struct inode *inode)
@@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
101{ 103{
102 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 104 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
103 cifs_fscache_disable_inode_cookie(inode); 105 cifs_fscache_disable_inode_cookie(inode);
104 else { 106 else
105 cifs_fscache_enable_inode_cookie(inode); 107 cifs_fscache_enable_inode_cookie(inode);
106 cFYI(1, "CIFS: fscache inode cookie set");
107 }
108} 108}
109 109
110void cifs_fscache_reset_inode_cookie(struct inode *inode) 110void cifs_fscache_reset_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 39869c3c3efb..a853a89857a5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode,
686 cFYI(1, "cifs_sfu_type failed: %d", tmprc); 686 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
687 } 687 }
688 688
689#ifdef CONFIG_CIFS_EXPERIMENTAL 689#ifdef CONFIG_CIFS_ACL
690 /* fill in 0777 bits from ACL */ 690 /* fill in 0777 bits from ACL */
691 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 691 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
692 cFYI(1, "Getting mode bits from ACL"); 692 rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path,
693 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 693 pfid);
694 if (rc) {
695 cFYI(1, "%s: Getting ACL failed with error: %d",
696 __func__, rc);
697 goto cgii_exit;
698 }
694 } 699 }
695#endif 700#endif /* CONFIG_CIFS_ACL */
696 701
697 /* fill in remaining high mode bits e.g. SUID, VTX */ 702 /* fill in remaining high mode bits e.g. SUID, VTX */
698 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) 703 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
@@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = {
723 .lookup = cifs_lookup, 728 .lookup = cifs_lookup,
724}; 729};
725 730
726char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) 731char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
732 struct cifsTconInfo *tcon)
727{ 733{
728 int pplen = cifs_sb->prepathlen; 734 int pplen = cifs_sb->prepathlen;
729 int dfsplen; 735 int dfsplen;
730 char *full_path = NULL; 736 char *full_path = NULL;
731 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
732 737
733 /* if no prefix path, simply set path to the root of share to "" */ 738 /* if no prefix path, simply set path to the root of share to "" */
734 if (pplen == 0) { 739 if (pplen == 0) {
@@ -804,14 +809,14 @@ inode_has_hashed_dentries(struct inode *inode)
804{ 809{
805 struct dentry *dentry; 810 struct dentry *dentry;
806 811
807 spin_lock(&dcache_lock); 812 spin_lock(&inode->i_lock);
808 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 813 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
809 if (!d_unhashed(dentry) || IS_ROOT(dentry)) { 814 if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
810 spin_unlock(&dcache_lock); 815 spin_unlock(&inode->i_lock);
811 return true; 816 return true;
812 } 817 }
813 } 818 }
814 spin_unlock(&dcache_lock); 819 spin_unlock(&inode->i_lock);
815 return false; 820 return false;
816} 821}
817 822
@@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
870 char *full_path; 875 char *full_path;
871 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); 876 struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
872 877
873 full_path = cifs_build_path_to_root(cifs_sb); 878 full_path = cifs_build_path_to_root(cifs_sb, tcon);
874 if (full_path == NULL) 879 if (full_path == NULL)
875 return ERR_PTR(-ENOMEM); 880 return ERR_PTR(-ENOMEM);
876 881
@@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
881 rc = cifs_get_inode_info(&inode, full_path, NULL, sb, 886 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
882 xid, NULL); 887 xid, NULL);
883 888
884 if (!inode) 889 if (!inode) {
885 return ERR_PTR(rc); 890 inode = ERR_PTR(rc);
891 goto out;
892 }
886 893
887#ifdef CONFIG_CIFS_FSCACHE 894#ifdef CONFIG_CIFS_FSCACHE
888 /* populate tcon->resource_id */ 895 /* populate tcon->resource_id */
@@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
898 inode->i_uid = cifs_sb->mnt_uid; 905 inode->i_uid = cifs_sb->mnt_uid;
899 inode->i_gid = cifs_sb->mnt_gid; 906 inode->i_gid = cifs_sb->mnt_gid;
900 } else if (rc) { 907 } else if (rc) {
901 kfree(full_path);
902 _FreeXid(xid);
903 iget_failed(inode); 908 iget_failed(inode);
904 return ERR_PTR(rc); 909 inode = ERR_PTR(rc);
905 } 910 }
906 911
907 912out:
908 kfree(full_path); 913 kfree(full_path);
909 /* can not call macro FreeXid here since in a void func 914 /* can not call macro FreeXid here since in a void func
910 * TODO: This is no longer true 915 * TODO: This is no longer true
@@ -1314,9 +1319,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1314 to set uid/gid */ 1319 to set uid/gid */
1315 inc_nlink(inode); 1320 inc_nlink(inode);
1316 if (pTcon->nocase) 1321 if (pTcon->nocase)
1317 direntry->d_op = &cifs_ci_dentry_ops; 1322 d_set_d_op(direntry, &cifs_ci_dentry_ops);
1318 else 1323 else
1319 direntry->d_op = &cifs_dentry_ops; 1324 d_set_d_op(direntry, &cifs_dentry_ops);
1320 1325
1321 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1326 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1322 cifs_fill_uniqueid(inode->i_sb, &fattr); 1327 cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1358,9 +1363,9 @@ mkdir_get_info:
1358 inode->i_sb, xid, NULL); 1363 inode->i_sb, xid, NULL);
1359 1364
1360 if (pTcon->nocase) 1365 if (pTcon->nocase)
1361 direntry->d_op = &cifs_ci_dentry_ops; 1366 d_set_d_op(direntry, &cifs_ci_dentry_ops);
1362 else 1367 else
1363 direntry->d_op = &cifs_dentry_ops; 1368 d_set_d_op(direntry, &cifs_dentry_ops);
1364 d_instantiate(direntry, newinode); 1369 d_instantiate(direntry, newinode);
1365 /* setting nlink not necessary except in cases where we 1370 /* setting nlink not necessary except in cases where we
1366 * failed to get it from the server or was set bogus */ 1371 * failed to get it from the server or was set bogus */
@@ -1648,6 +1653,7 @@ static bool
1648cifs_inode_needs_reval(struct inode *inode) 1653cifs_inode_needs_reval(struct inode *inode)
1649{ 1654{
1650 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 1655 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1656 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1651 1657
1652 if (cifs_i->clientCanCacheRead) 1658 if (cifs_i->clientCanCacheRead)
1653 return false; 1659 return false;
@@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode)
1658 if (cifs_i->time == 0) 1664 if (cifs_i->time == 0)
1659 return true; 1665 return true;
1660 1666
1661 /* FIXME: the actimeo should be tunable */ 1667 if (!time_in_range(jiffies, cifs_i->time,
1662 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1668 cifs_i->time + cifs_sb->actimeo))
1663 return true; 1669 return true;
1664 1670
1665 /* hardlinked files w/ noserverino get "special" treatment */ 1671 /* hardlinked files w/ noserverino get "special" treatment */
1666 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && 1672 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1667 S_ISREG(inode->i_mode) && inode->i_nlink != 1) 1673 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1668 return true; 1674 return true;
1669 1675
1670 return false; 1676 return false;
1671} 1677}
1672 1678
1673/* check invalid_mapping flag and zap the cache if it's set */ 1679/*
1680 * Zap the cache. Called when invalid_mapping flag is set.
1681 */
1674static void 1682static void
1675cifs_invalidate_mapping(struct inode *inode) 1683cifs_invalidate_mapping(struct inode *inode)
1676{ 1684{
@@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2114 2122
2115 if (attrs->ia_valid & ATTR_MODE) { 2123 if (attrs->ia_valid & ATTR_MODE) {
2116 rc = 0; 2124 rc = 0;
2117#ifdef CONFIG_CIFS_EXPERIMENTAL 2125#ifdef CONFIG_CIFS_ACL
2118 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) 2126 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
2119 rc = mode_to_acl(inode, full_path, mode); 2127 rc = mode_to_cifs_acl(inode, full_path, mode);
2120 else 2128 if (rc) {
2121#endif 2129 cFYI(1, "%s: Setting ACL failed with error: %d",
2130 __func__, rc);
2131 goto cifs_setattr_exit;
2132 }
2133 } else
2134#endif /* CONFIG_CIFS_ACL */
2122 if (((mode & S_IWUGO) == 0) && 2135 if (((mode & S_IWUGO) == 0) &&
2123 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { 2136 (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
2124 2137
@@ -2177,7 +2190,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2177 2190
2178 setattr_copy(inode, attrs); 2191 setattr_copy(inode, attrs);
2179 mark_inode_dirty(inode); 2192 mark_inode_dirty(inode);
2180 return 0;
2181 2193
2182cifs_setattr_exit: 2194cifs_setattr_exit:
2183 kfree(full_path); 2195 kfree(full_path);
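
The inode.c revalidation hunk above replaces the hard-coded one-second check with time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo), so the mount-time actimeo value now controls attribute cache freshness. A userspace sketch of the wrap-safe comparison this relies on (the usual kernel signed-subtraction idiom, reproduced here under demo_* names):

#include <stdio.h>

/* Kernel-style wrap-safe time comparisons (signed-subtraction idiom). */
#define demo_time_after_eq(a, b)        ((long)((a) - (b)) >= 0)
#define demo_time_before_eq(a, b)       demo_time_after_eq(b, a)
#define demo_time_in_range(a, b, c) \
        (demo_time_after_eq(a, b) && demo_time_before_eq(a, c))

int main(void)
{
        unsigned long hz = 250;                         /* example HZ */
        unsigned long actimeo = 1 * hz;                 /* CIFS_DEF_ACTIMEO */
        unsigned long stamp = (unsigned long)-100;      /* just before wrap */
        unsigned long now = 50;                         /* after the wrap */

        /* Fresh as long as "now" lies inside [stamp, stamp + actimeo]. */
        printf("needs reval: %s\n",
               demo_time_in_range(now, stamp, stamp + actimeo) ? "no" : "yes");
        return 0;
}
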
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 077bf756f342..0c98672d0122 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
38 struct cifs_sb_info *cifs_sb; 38 struct cifs_sb_info *cifs_sb;
39#ifdef CONFIG_CIFS_POSIX 39#ifdef CONFIG_CIFS_POSIX
40 struct cifsFileInfo *pSMBFile = filep->private_data; 40 struct cifsFileInfo *pSMBFile = filep->private_data;
41 struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); 41 struct cifsTconInfo *tcon;
42 __u64 ExtAttrBits = 0; 42 __u64 ExtAttrBits = 0;
43 __u64 ExtAttrMask = 0; 43 __u64 ExtAttrMask = 0;
44 __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); 44 __u64 caps;
45#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
46 46
47 xid = GetXid(); 47 xid = GetXid();
@@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
62 break; 62 break;
63#ifdef CONFIG_CIFS_POSIX 63#ifdef CONFIG_CIFS_POSIX
64 case FS_IOC_GETFLAGS: 64 case FS_IOC_GETFLAGS:
65 if (pSMBFile == NULL)
66 break;
67 tcon = tlink_tcon(pSMBFile->tlink);
68 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
65 if (CIFS_UNIX_EXTATTR_CAP & caps) { 69 if (CIFS_UNIX_EXTATTR_CAP & caps) {
66 if (pSMBFile == NULL)
67 break;
68 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, 70 rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid,
69 &ExtAttrBits, &ExtAttrMask); 71 &ExtAttrBits, &ExtAttrMask);
70 if (rc == 0) 72 if (rc == 0)
@@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
75 break; 77 break;
76 78
77 case FS_IOC_SETFLAGS: 79 case FS_IOC_SETFLAGS:
80 if (pSMBFile == NULL)
81 break;
82 tcon = tlink_tcon(pSMBFile->tlink);
83 caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
78 if (CIFS_UNIX_EXTATTR_CAP & caps) { 84 if (CIFS_UNIX_EXTATTR_CAP & caps) {
79 if (get_user(ExtAttrBits, (int __user *)arg)) { 85 if (get_user(ExtAttrBits, (int __user *)arg)) {
80 rc = -EFAULT; 86 rc = -EFAULT;
81 break; 87 break;
82 } 88 }
83 if (pSMBFile == NULL)
84 break;
85 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 89 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
86 extAttrBits, &ExtAttrMask);*/ 90 extAttrBits, &ExtAttrMask);*/
87 } 91 }
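
The ioctl.c change is an ordering fix: the old code derived tcon and caps from filep->private_data in the declarations, before the pSMBFile == NULL check, so FS_IOC_GETFLAGS/FS_IOC_SETFLAGS on a file without private data could dereference a NULL pointer; the check now comes first and the derivation is done per command. A generic illustration of the pattern (hypothetical types, not the CIFS structures):

#include <stddef.h>
#include <stdio.h>

struct session { unsigned long caps; };
struct filectx { struct session *sess; };

static long do_ioctl(struct filectx *ctx, unsigned int cmd)
{
        /* check the handle first ... */
        if (ctx == NULL)
                return -1;
        /* ... and only then reach through it */
        struct session *sess = ctx->sess;
        printf("cmd %u, caps %lx\n", cmd, sess->caps);
        return 0;
}

int main(void)
{
        struct session s = { .caps = 0xff };
        struct filectx f = { .sess = &s };

        do_ioctl(&f, 1);
        do_ioctl(NULL, 1);      /* safely rejected instead of crashing */
        return 0;
}
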
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 85cdbf831e7b..fe2f6a93c49e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -525,9 +525,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
525 rc); 525 rc);
526 } else { 526 } else {
527 if (pTcon->nocase) 527 if (pTcon->nocase)
528 direntry->d_op = &cifs_ci_dentry_ops; 528 d_set_d_op(direntry, &cifs_ci_dentry_ops);
529 else 529 else
530 direntry->d_op = &cifs_dentry_ops; 530 d_set_d_op(direntry, &cifs_dentry_ops);
531 d_instantiate(direntry, newinode); 531 d_instantiate(direntry, newinode);
532 } 532 }
533 } 533 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index c4e296fe3518..43f10281bc19 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
569 569
570 cFYI(1, "file id match, oplock break"); 570 cFYI(1, "file id match, oplock break");
571 pCifsInode = CIFS_I(netfile->dentry->d_inode); 571 pCifsInode = CIFS_I(netfile->dentry->d_inode);
572 pCifsInode->clientCanCacheAll = false;
573 if (pSMB->OplockLevel == 0)
574 pCifsInode->clientCanCacheRead = false;
575 572
573 cifs_set_oplock_level(pCifsInode,
574 pSMB->OplockLevel);
576 /* 575 /*
577 * cifs_oplock_break_put() can't be called 576 * cifs_oplock_break_put() can't be called
578 * from here. Get reference after queueing 577 * from here. Get reference after queueing
@@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
722 cifs_sb_master_tcon(cifs_sb)->treeName); 721 cifs_sb_master_tcon(cifs_sb)->treeName);
723 } 722 }
724} 723}
724
725void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
726{
727 oplock &= 0xF;
728
729 if (oplock == OPLOCK_EXCLUSIVE) {
730 cinode->clientCanCacheAll = true;
731 cinode->clientCanCacheRead = true;
732 cFYI(1, "Exclusive Oplock granted on inode %p",
733 &cinode->vfs_inode);
734 } else if (oplock == OPLOCK_READ) {
735 cinode->clientCanCacheAll = false;
736 cinode->clientCanCacheRead = true;
737 cFYI(1, "Level II Oplock granted on inode %p",
738 &cinode->vfs_inode);
739 } else {
740 cinode->clientCanCacheAll = false;
741 cinode->clientCanCacheRead = false;
742 }
743}
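
cifs_set_oplock_level() centralizes the translation from the server-granted oplock level to the two client caching flags, so the oplock-break handler above (and any other caller) no longer flips clientCanCacheAll/clientCanCacheRead by hand. A standalone model of that mapping, with invented constant values standing in for the CIFS oplock levels:

#include <stdbool.h>
#include <stdio.h>

enum oplock { OPLOCK_NONE = 0, OPLOCK_LEVEL_II = 1, OPLOCK_EXCL = 8 };

struct cache_state { bool cache_all; bool cache_read; };

static void set_oplock_level(struct cache_state *cs, unsigned int oplock)
{
        oplock &= 0xF;                       /* only the low nibble matters */
        cs->cache_all  = (oplock == OPLOCK_EXCL);
        cs->cache_read = (oplock == OPLOCK_EXCL || oplock == OPLOCK_LEVEL_II);
}

int main(void)
{
        struct cache_state cs;
        unsigned int levels[] = { OPLOCK_EXCL, OPLOCK_LEVEL_II, OPLOCK_NONE };

        for (int i = 0; i < 3; i++) {
                set_oplock_level(&cs, levels[i]);
                printf("level %u -> all=%d read=%d\n",
                       levels[i], cs.cache_all, cs.cache_read);
        }
        return 0;
}
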
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ef7bb7b50f58..ec5b68e3b928 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -79,7 +79,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
79 cFYI(1, "For %s", name->name); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, parent->d_inode, name);
83 else 83 else
84 name->hash = full_name_hash(name->name, name->len); 84 name->hash = full_name_hash(name->name, name->len);
85 85
@@ -103,9 +103,9 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
103 } 103 }
104 104
105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase) 105 if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
106 dentry->d_op = &cifs_ci_dentry_ops; 106 d_set_d_op(dentry, &cifs_ci_dentry_ops);
107 else 107 else
108 dentry->d_op = &cifs_dentry_ops; 108 d_set_d_op(dentry, &cifs_dentry_ops);
109 109
110 alias = d_materialise_unique(dentry, inode); 110 alias = d_materialise_unique(dentry, inode);
111 if (alias != NULL) { 111 if (alias != NULL) {
@@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file)
226 char *full_path = NULL; 226 char *full_path = NULL;
227 struct cifsFileInfo *cifsFile; 227 struct cifsFileInfo *cifsFile;
228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 228 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
229 struct tcon_link *tlink; 229 struct tcon_link *tlink = NULL;
230 struct cifsTconInfo *pTcon; 230 struct cifsTconInfo *pTcon;
231 231
232 tlink = cifs_sb_tlink(cifs_sb);
233 if (IS_ERR(tlink))
234 return PTR_ERR(tlink);
235 pTcon = tlink_tcon(tlink);
236
237 if (file->private_data == NULL)
238 file->private_data =
239 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
240 if (file->private_data == NULL) { 232 if (file->private_data == NULL) {
241 rc = -ENOMEM; 233 tlink = cifs_sb_tlink(cifs_sb);
242 goto error_exit; 234 if (IS_ERR(tlink))
235 return PTR_ERR(tlink);
236
237 cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
238 if (cifsFile == NULL) {
239 rc = -ENOMEM;
240 goto error_exit;
241 }
242 file->private_data = cifsFile;
243 cifsFile->tlink = cifs_get_tlink(tlink);
244 pTcon = tlink_tcon(tlink);
245 } else {
246 cifsFile = file->private_data;
247 pTcon = tlink_tcon(cifsFile->tlink);
243 } 248 }
244 249
245 cifsFile = file->private_data;
246 cifsFile->invalidHandle = true; 250 cifsFile->invalidHandle = true;
247 cifsFile->srch_inf.endOfSearch = false; 251 cifsFile->srch_inf.endOfSearch = false;
248 cifsFile->tlink = cifs_get_tlink(tlink);
249 252
250 full_path = build_path_from_dentry(file->f_path.dentry); 253 full_path = build_path_from_dentry(file->f_path.dentry);
251 if (full_path == NULL) { 254 if (full_path == NULL) {
@@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
756 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 759 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos,
757 ino, fattr.cf_dtype); 760 ino, fattr.cf_dtype);
758 761
759 /*
760 * we can not return filldir errors to the caller since they are
761 * "normal" when the stat blocksize is too small - we return remapped
762 * error instead
763 *
764 * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above
765 * case already. Why should we be clobbering other errors from it?
766 */
767 if (rc) {
768 cFYI(1, "filldir rc = %d", rc);
769 rc = -EOVERFLOW;
770 }
771 dput(tmp_dentry); 762 dput(tmp_dentry);
772 return rc; 763 return rc;
773} 764}
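
The initiate_cifs_search() rework only obtains a tlink and allocates the cifsFileInfo when file->private_data does not exist yet; a later call reuses the existing private data and the tlink it already holds, where the old code unconditionally stored a fresh tlink reference over whatever was there. A stripped-down model of the allocate-on-first-use pattern, with hypothetical resource names:

#include <stdio.h>
#include <stdlib.h>

struct resource { int refcount; };
struct search   { struct resource *res; };
struct file     { struct search *priv; };

static struct resource *grab(struct resource *r) { r->refcount++; return r; }

static int start_search(struct file *f, struct resource *r)
{
        struct search *s;

        if (f->priv == NULL) {
                /* first call: allocate state and take exactly one reference */
                s = calloc(1, sizeof(*s));
                if (!s)
                        return -1;
                s->res = grab(r);
                f->priv = s;
        } else {
                /* later calls: reuse existing state, no extra reference */
                s = f->priv;
        }
        printf("searching, refcount=%d\n", s->res->refcount);
        return 0;
}

int main(void)
{
        struct resource r = { .refcount = 1 };
        struct file f = { .priv = NULL };

        start_search(&f, &r);   /* takes the reference              */
        start_search(&f, &r);   /* reuses it; refcount stays at 2   */
        free(f.priv);
        return 0;
}
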
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a264b744bb41..eae2a1491608 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -30,10 +30,11 @@
30 30
31#define MAX_EA_VALUE_SIZE 65535 31#define MAX_EA_VALUE_SIZE 65535
32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 32#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
33#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
33#define CIFS_XATTR_USER_PREFIX "user." 34#define CIFS_XATTR_USER_PREFIX "user."
34#define CIFS_XATTR_SYSTEM_PREFIX "system." 35#define CIFS_XATTR_SYSTEM_PREFIX "system."
35#define CIFS_XATTR_OS2_PREFIX "os2." 36#define CIFS_XATTR_OS2_PREFIX "os2."
36#define CIFS_XATTR_SECURITY_PREFIX ".security" 37#define CIFS_XATTR_SECURITY_PREFIX "security."
37#define CIFS_XATTR_TRUSTED_PREFIX "trusted." 38#define CIFS_XATTR_TRUSTED_PREFIX "trusted."
38#define XATTR_TRUSTED_PREFIX_LEN 8 39#define XATTR_TRUSTED_PREFIX_LEN 8
39#define XATTR_SECURITY_PREFIX_LEN 9 40#define XATTR_SECURITY_PREFIX_LEN 9
@@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
277 cifs_sb->local_nls, 278 cifs_sb->local_nls,
278 cifs_sb->mnt_cifs_flags & 279 cifs_sb->mnt_cifs_flags &
279 CIFS_MOUNT_MAP_SPECIAL_CHR); 280 CIFS_MOUNT_MAP_SPECIAL_CHR);
280#ifdef CONFIG_CIFS_EXPERIMENTAL
281 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
282 __u16 fid;
283 int oplock = 0;
284 struct cifs_ntsd *pacl = NULL;
285 __u32 buflen = 0;
286 if (experimEnabled)
287 rc = CIFSSMBOpen(xid, pTcon, full_path,
288 FILE_OPEN, GENERIC_READ, 0, &fid,
289 &oplock, NULL, cifs_sb->local_nls,
290 cifs_sb->mnt_cifs_flags &
291 CIFS_MOUNT_MAP_SPECIAL_CHR);
292 /* else rc is EOPNOTSUPP from above */
293
294 if (rc == 0) {
295 rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl,
296 &buflen);
297 CIFSSMBClose(xid, pTcon, fid);
298 }
299 }
300#endif /* EXPERIMENTAL */
301#else 281#else
302 cFYI(1, "query POSIX ACL not supported yet"); 282 cFYI(1, "Query POSIX ACL not supported yet");
303#endif /* CONFIG_CIFS_POSIX */ 283#endif /* CONFIG_CIFS_POSIX */
304 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, 284 } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
305 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { 285 strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
311 cifs_sb->mnt_cifs_flags & 291 cifs_sb->mnt_cifs_flags &
312 CIFS_MOUNT_MAP_SPECIAL_CHR); 292 CIFS_MOUNT_MAP_SPECIAL_CHR);
313#else 293#else
314 cFYI(1, "query POSIX default ACL not supported yet"); 294 cFYI(1, "Query POSIX default ACL not supported yet");
315#endif 295#endif /* CONFIG_CIFS_POSIX */
296 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
297 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
298#ifdef CONFIG_CIFS_ACL
299 u32 acllen;
300 struct cifs_ntsd *pacl;
301
302 pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
303 full_path, &acllen);
304 if (IS_ERR(pacl)) {
305 rc = PTR_ERR(pacl);
306 cERROR(1, "%s: error %zd getting sec desc",
307 __func__, rc);
308 } else {
309 if (ea_value) {
310 if (acllen > buf_size)
311 acllen = -ERANGE;
312 else
313 memcpy(ea_value, pacl, acllen);
314 }
315 rc = acllen;
316 kfree(pacl);
317 }
318#else
319 cFYI(1, "Query CIFS ACL not supported yet");
320#endif /* CONFIG_CIFS_ACL */
316 } else if (strncmp(ea_name, 321 } else if (strncmp(ea_name,
317 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { 322 CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
318 cFYI(1, "Trusted xattr namespace not supported yet"); 323 cFYI(1, "Trusted xattr namespace not supported yet");
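
With CONFIG_CIFS_ACL, cifs_getxattr() now serves the new "system.cifs_acl" name by fetching the raw NT security descriptor and copying as much as fits into the caller's buffer, following the usual getxattr contract (size query when no buffer is supplied, -ERANGE when the supplied buffer is too small). From userspace the standard two-call pattern applies; the mount path below is hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/cifs/somefile";     /* hypothetical CIFS mount */
        const char *name = "system.cifs_acl";
        ssize_t len = getxattr(path, name, NULL, 0); /* size query */

        if (len < 0) {
                perror("getxattr(size)");
                return 1;
        }

        void *buf = malloc(len);
        if (!buf)
                return 1;

        len = getxattr(path, name, buf, len);        /* actual fetch */
        if (len < 0)
                perror("getxattr(fetch)");
        else
                printf("security descriptor: %zd bytes\n", len);

        free(buf);
        return 0;
}
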
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 9060f08e70cf..5525e1c660fd 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
93 struct list_head *child; 93 struct list_head *child;
94 struct dentry *de; 94 struct dentry *de;
95 95
96 spin_lock(&dcache_lock); 96 spin_lock(&parent->d_lock);
97 list_for_each(child, &parent->d_subdirs) 97 list_for_each(child, &parent->d_subdirs)
98 { 98 {
99 de = list_entry(child, struct dentry, d_u.d_child); 99 de = list_entry(child, struct dentry, d_u.d_child);
@@ -102,7 +102,7 @@ static void coda_flag_children(struct dentry *parent, int flag)
102 continue; 102 continue;
103 coda_flag_inode(de->d_inode, flag); 103 coda_flag_inode(de->d_inode, flag);
104 } 104 }
105 spin_unlock(&dcache_lock); 105 spin_unlock(&parent->d_lock);
106 return; 106 return;
107} 107}
108 108
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 5d8b35539601..29badd91360f 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -18,6 +18,7 @@
18#include <linux/errno.h> 18#include <linux/errno.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
@@ -47,7 +48,7 @@ static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
47 48
48/* dentry ops */ 49/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); 50static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd);
50static int coda_dentry_delete(struct dentry *); 51static int coda_dentry_delete(const struct dentry *);
51 52
52/* support routines */ 53/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 54static int coda_venus_readdir(struct file *coda_file, void *buf,
@@ -125,7 +126,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
125 return ERR_PTR(error); 126 return ERR_PTR(error);
126 127
127exit: 128exit:
128 entry->d_op = &coda_dentry_operations; 129 d_set_d_op(entry, &coda_dentry_operations);
129 130
130 if (inode && (type & CODA_NOCACHE)) 131 if (inode && (type & CODA_NOCACHE))
131 coda_flag_inode(inode, C_VATTR | C_PURGE); 132 coda_flag_inode(inode, C_VATTR | C_PURGE);
@@ -134,10 +135,13 @@ exit:
134} 135}
135 136
136 137
137int coda_permission(struct inode *inode, int mask) 138int coda_permission(struct inode *inode, int mask, unsigned int flags)
138{ 139{
139 int error; 140 int error;
140 141
142 if (flags & IPERM_FLAG_RCU)
143 return -ECHILD;
144
141 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 145 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
142 146
143 if (!mask) 147 if (!mask)
@@ -541,9 +545,13 @@ out:
541/* called when a cache lookup succeeds */ 545/* called when a cache lookup succeeds */
542static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) 546static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
543{ 547{
544 struct inode *inode = de->d_inode; 548 struct inode *inode;
545 struct coda_inode_info *cii; 549 struct coda_inode_info *cii;
546 550
551 if (nd->flags & LOOKUP_RCU)
552 return -ECHILD;
553
554 inode = de->d_inode;
547 if (!inode || coda_isroot(inode)) 555 if (!inode || coda_isroot(inode))
548 goto out; 556 goto out;
549 if (is_bad_inode(inode)) 557 if (is_bad_inode(inode))
@@ -559,7 +567,7 @@ static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd)
559 if (cii->c_flags & C_FLUSH) 567 if (cii->c_flags & C_FLUSH)
560 coda_flag_inode_children(inode, C_FLUSH); 568 coda_flag_inode_children(inode, C_FLUSH);
561 569
562 if (atomic_read(&de->d_count) > 1) 570 if (de->d_count > 1)
563 /* pretend it's valid, but don't change the flags */ 571 /* pretend it's valid, but don't change the flags */
564 goto out; 572 goto out;
565 573
@@ -577,7 +585,7 @@ out:
577 * This is the callback from dput() when d_count is going to 0. 585 * This is the callback from dput() when d_count is going to 0.
578 * We use this to unhash dentries with bad inodes. 586 * We use this to unhash dentries with bad inodes.
579 */ 587 */
580static int coda_dentry_delete(struct dentry * dentry) 588static int coda_dentry_delete(const struct dentry * dentry)
581{ 589{
582 int flags; 590 int flags;
583 591
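
Both coda_permission() and coda_dentry_revalidate() now return -ECHILD when invoked in rcu-walk mode (IPERM_FLAG_RCU / LOOKUP_RCU): coda may need a blocking upcall for these checks, so it asks the VFS to drop out of rcu-walk and repeat the operation in ordinary ref-walk mode. A minimal model of that contract, with invented flag and error names:

#include <stdio.h>

#define WALK_RCU   0x1      /* stand-in for LOOKUP_RCU / IPERM_FLAG_RCU   */
#define ERR_RETRY  (-10)    /* stand-in for -ECHILD: "retry without RCU"  */

/* A check that may need to sleep refuses to run under RCU-walk. */
static int fs_permission(int mask, unsigned int flags)
{
        if (flags & WALK_RCU)
                return ERR_RETRY;
        /* slow path: may block, talk to a server, etc. */
        return 0;
}

/* Caller (the "VFS" in this model) falls back to the blocking mode. */
static int lookup(int mask)
{
        int rc = fs_permission(mask, WALK_RCU);   /* try the fast path      */
        if (rc == ERR_RETRY)
                rc = fs_permission(mask, 0);      /* redo in ref-walk mode  */
        return rc;
}

int main(void)
{
        printf("lookup() = %d\n", lookup(4));
        return 0;
}
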
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 5ea57c8c7f97..50dc7d189f56 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -56,11 +56,18 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
56 return &ei->vfs_inode; 56 return &ei->vfs_inode;
57} 57}
58 58
59static void coda_destroy_inode(struct inode *inode) 59static void coda_i_callback(struct rcu_head *head)
60{ 60{
61 struct inode *inode = container_of(head, struct inode, i_rcu);
62 INIT_LIST_HEAD(&inode->i_dentry);
61 kmem_cache_free(coda_inode_cachep, ITOC(inode)); 63 kmem_cache_free(coda_inode_cachep, ITOC(inode));
62} 64}
63 65
66static void coda_destroy_inode(struct inode *inode)
67{
68 call_rcu(&inode->i_rcu, coda_i_callback);
69}
70
64static void init_once(void *foo) 71static void init_once(void *foo)
65{ 72{
66 struct coda_inode_info *ei = (struct coda_inode_info *) foo; 73 struct coda_inode_info *ei = (struct coda_inode_info *) foo;
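
coda_destroy_inode() now frees its inode through call_rcu(): the callback recovers the containing inode from the embedded rcu_head and the kmem_cache_free() happens only after a grace period, so lock-free rcu-walk lookups that are still inspecting the inode never touch freed memory. The container_of() step is the only non-obvious part; here is a self-contained userspace model (the deferred-free queue is a toy stand-in for RCU, not the real mechanism):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct cb_head { void (*func)(struct cb_head *); struct cb_head *next; };

struct inode_model {
        long ino;
        struct cb_head rcu;              /* embedded callback head */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static struct cb_head *pending;          /* toy "grace period" queue */

static void defer_free(struct cb_head *h, void (*f)(struct cb_head *))
{
        h->func = f;
        h->next = pending;
        pending = h;
}

static void run_deferred(void)           /* pretend the grace period ended */
{
        while (pending) {
                struct cb_head *h = pending;
                pending = h->next;
                h->func(h);
        }
}

static void inode_free_cb(struct cb_head *h)
{
        struct inode_model *inode = container_of(h, struct inode_model, rcu);
        printf("freeing inode %ld after grace period\n", inode->ino);
        free(inode);
}

int main(void)
{
        struct inode_model *i = malloc(sizeof(*i));
        if (!i)
                return 1;
        i->ino = 42;
        defer_free(&i->rcu, inode_free_cb);   /* like call_rcu() */
        run_deferred();
        return 0;
}
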
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 2fd89b5c5c7b..741f0bd03918 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26/* pioctl ops */ 26/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 27static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
28static long coda_pioctl(struct file *filp, unsigned int cmd, 28static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned long user_data); 29 unsigned long user_data);
30 30
@@ -41,8 +41,10 @@ const struct file_operations coda_ioctl_operations = {
41}; 41};
42 42
43/* the coda pioctl inode ops */ 43/* the coda pioctl inode ops */
44static int coda_ioctl_permission(struct inode *inode, int mask) 44static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags)
45{ 45{
46 if (flags & IPERM_FLAG_RCU)
47 return -ECHILD;
46 return (mask & MAY_EXEC) ? -EACCES : 0; 48 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 49}
48 50
diff --git a/fs/compat.c b/fs/compat.c
index c580c322fa6b..eb1740ac8c0a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1350 argv++; 1350 argv++;
1351 if (i++ >= max) 1351 if (i++ >= max)
1352 return -E2BIG; 1352 return -E2BIG;
1353
1354 if (fatal_signal_pending(current))
1355 return -ERESTARTNOHAND;
1356 cond_resched();
1353 } 1357 }
1354 } 1358 }
1355 return i; 1359 return i;
@@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1391 while (len > 0) { 1395 while (len > 0) {
1392 int offset, bytes_to_copy; 1396 int offset, bytes_to_copy;
1393 1397
1398 if (fatal_signal_pending(current)) {
1399 ret = -ERESTARTNOHAND;
1400 goto out;
1401 }
1402 cond_resched();
1403
1394 offset = pos % PAGE_SIZE; 1404 offset = pos % PAGE_SIZE;
1395 if (offset == 0) 1405 if (offset == 0)
1396 offset = PAGE_SIZE; 1406 offset = PAGE_SIZE;
@@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
1407 if (!kmapped_page || kpos != (pos & PAGE_MASK)) { 1417 if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
1408 struct page *page; 1418 struct page *page;
1409 1419
1410#ifdef CONFIG_STACK_GROWSUP 1420 page = get_arg_page(bprm, pos, 1);
1411 ret = expand_stack_downwards(bprm->vma, pos); 1421 if (!page) {
1412 if (ret < 0) {
1413 /* We've exceed the stack rlimit. */
1414 ret = -E2BIG;
1415 goto out;
1416 }
1417#endif
1418 ret = get_user_pages(current, bprm->mm, pos,
1419 1, 1, 1, &page, NULL);
1420 if (ret <= 0) {
1421 /* We've exceed the stack rlimit. */
1422 ret = -E2BIG; 1422 ret = -E2BIG;
1423 goto out; 1423 goto out;
1424 } 1424 }
@@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename,
1539 return retval; 1539 return retval;
1540 1540
1541out: 1541out:
1542 if (bprm->mm) 1542 if (bprm->mm) {
1543 acct_arg_size(bprm, 0);
1543 mmput(bprm->mm); 1544 mmput(bprm->mm);
1545 }
1544 1546
1545out_file: 1547out_file:
1546 if (bprm->file) { 1548 if (bprm->file) {
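
The compat_count()/compat_copy_strings() hunks mirror the native exec path: while counting and copying a potentially enormous argv the task now checks fatal_signal_pending() and calls cond_resched(), so a pathological exec can be killed and does not monopolize the CPU, and the error path uncharges the argument pages via acct_arg_size(bprm, 0) before mmput(). The loop shape, modelled with a plain signal flag in userspace:

#include <sched.h>
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t got_fatal;

static void on_term(int sig) { (void)sig; got_fatal = 1; }

static long count_args(long nargs)
{
        for (long i = 0; i < nargs; i++) {
                if (got_fatal)
                        return -1;      /* bail out promptly when killed */
                /* sched_yield() plays the role of cond_resched() here */
                if ((i & 0xffff) == 0)
                        sched_yield();
        }
        return nargs;
}

int main(void)
{
        signal(SIGTERM, on_term);
        printf("counted %ld\n", count_args(1 << 20));
        return 0;
}
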
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 410ed188faa1..61abb638b4bf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -19,7 +19,6 @@
19#include <linux/compiler.h> 19#include <linux/compiler.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/smp_lock.h>
23#include <linux/ioctl.h> 22#include <linux/ioctl.h>
24#include <linux/if.h> 23#include <linux/if.h>
25#include <linux/if_bridge.h> 24#include <linux/if_bridge.h>
@@ -43,7 +42,7 @@
43#include <linux/tty.h> 42#include <linux/tty.h>
44#include <linux/vt_kern.h> 43#include <linux/vt_kern.h>
45#include <linux/fb.h> 44#include <linux/fb.h>
46#include <linux/videodev.h> 45#include <linux/videodev2.h>
47#include <linux/netdevice.h> 46#include <linux/netdevice.h>
48#include <linux/raw.h> 47#include <linux/raw.h>
49#include <linux/blkdev.h> 48#include <linux/blkdev.h>
@@ -837,6 +836,7 @@ COMPATIBLE_IOCTL(TCSETSW)
837COMPATIBLE_IOCTL(TCSETSF) 836COMPATIBLE_IOCTL(TCSETSF)
838COMPATIBLE_IOCTL(TIOCLINUX) 837COMPATIBLE_IOCTL(TIOCLINUX)
839COMPATIBLE_IOCTL(TIOCSBRK) 838COMPATIBLE_IOCTL(TIOCSBRK)
839COMPATIBLE_IOCTL(TIOCGDEV)
840COMPATIBLE_IOCTL(TIOCCBRK) 840COMPATIBLE_IOCTL(TIOCCBRK)
841COMPATIBLE_IOCTL(TIOCGSID) 841COMPATIBLE_IOCTL(TIOCGSID)
842COMPATIBLE_IOCTL(TIOCGICOUNT) 842COMPATIBLE_IOCTL(TIOCGICOUNT)
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da6061a6df40..026cf68553a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -120,7 +120,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
120{ 120{
121 struct config_item * item = NULL; 121 struct config_item * item = NULL;
122 122
123 spin_lock(&dcache_lock); 123 spin_lock(&dentry->d_lock);
124 if (!d_unhashed(dentry)) { 124 if (!d_unhashed(dentry)) {
125 struct configfs_dirent * sd = dentry->d_fsdata; 125 struct configfs_dirent * sd = dentry->d_fsdata;
126 if (sd->s_type & CONFIGFS_ITEM_LINK) { 126 if (sd->s_type & CONFIGFS_ITEM_LINK) {
@@ -129,7 +129,7 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
129 } else 129 } else
130 item = config_item_get(sd->s_element); 130 item = config_item_get(sd->s_element);
131 } 131 }
132 spin_unlock(&dcache_lock); 132 spin_unlock(&dentry->d_lock);
133 133
134 return item; 134 return item;
135} 135}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0b502f80c691..36637a8c1ed3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,7 @@ static void configfs_d_iput(struct dentry * dentry,
67 * We _must_ delete our dentries on last dput, as the chain-to-parent 67 * We _must_ delete our dentries on last dput, as the chain-to-parent
68 * behavior is required to clear the parents of default_groups. 68 * behavior is required to clear the parents of default_groups.
69 */ 69 */
70static int configfs_d_delete(struct dentry *dentry) 70static int configfs_d_delete(const struct dentry *dentry)
71{ 71{
72 return 1; 72 return 1;
73} 73}
@@ -232,10 +232,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
232 232
233 sd->s_mode = mode; 233 sd->s_mode = mode;
234 sd->s_dentry = dentry; 234 sd->s_dentry = dentry;
235 if (dentry) { 235 if (dentry)
236 dentry->d_fsdata = configfs_get(sd); 236 dentry->d_fsdata = configfs_get(sd);
237 dentry->d_op = &configfs_dentry_ops;
238 }
239 237
240 return 0; 238 return 0;
241} 239}
@@ -278,7 +276,6 @@ static int create_dir(struct config_item * k, struct dentry * p,
278 error = configfs_create(d, mode, init_dir); 276 error = configfs_create(d, mode, init_dir);
279 if (!error) { 277 if (!error) {
280 inc_nlink(p->d_inode); 278 inc_nlink(p->d_inode);
281 (d)->d_op = &configfs_dentry_ops;
282 } else { 279 } else {
283 struct configfs_dirent *sd = d->d_fsdata; 280 struct configfs_dirent *sd = d->d_fsdata;
284 if (sd) { 281 if (sd) {
@@ -371,9 +368,7 @@ int configfs_create_link(struct configfs_symlink *sl,
371 CONFIGFS_ITEM_LINK); 368 CONFIGFS_ITEM_LINK);
372 if (!err) { 369 if (!err) {
373 err = configfs_create(dentry, mode, init_symlink); 370 err = configfs_create(dentry, mode, init_symlink);
374 if (!err) 371 if (err) {
375 dentry->d_op = &configfs_dentry_ops;
376 else {
377 struct configfs_dirent *sd = dentry->d_fsdata; 372 struct configfs_dirent *sd = dentry->d_fsdata;
378 if (sd) { 373 if (sd) {
379 spin_lock(&configfs_dirent_lock); 374 spin_lock(&configfs_dirent_lock);
@@ -399,8 +394,7 @@ static void remove_dir(struct dentry * d)
399 if (d->d_inode) 394 if (d->d_inode)
400 simple_rmdir(parent->d_inode,d); 395 simple_rmdir(parent->d_inode,d);
401 396
402 pr_debug(" o %s removing done (%d)\n",d->d_name.name, 397 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
403 atomic_read(&d->d_count));
404 398
405 dput(parent); 399 dput(parent);
406} 400}
@@ -448,7 +442,7 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
448 return error; 442 return error;
449 } 443 }
450 444
451 dentry->d_op = &configfs_dentry_ops; 445 d_set_d_op(dentry, &configfs_dentry_ops);
452 d_rehash(dentry); 446 d_rehash(dentry);
453 447
454 return 0; 448 return 0;
@@ -493,7 +487,11 @@ static struct dentry * configfs_lookup(struct inode *dir,
493 * If it doesn't exist and it isn't a NOT_PINNED item, 487 * If it doesn't exist and it isn't a NOT_PINNED item,
494 * it must be negative. 488 * it must be negative.
495 */ 489 */
496 return simple_lookup(dir, dentry, nd); 490 if (dentry->d_name.len > NAME_MAX)
491 return ERR_PTR(-ENAMETOOLONG);
492 d_set_d_op(dentry, &configfs_dentry_ops);
493 d_add(dentry, NULL);
494 return NULL;
497 } 495 }
498 496
499out: 497out:
@@ -685,6 +683,7 @@ static int create_default_group(struct config_group *parent_group,
685 ret = -ENOMEM; 683 ret = -ENOMEM;
686 child = d_alloc(parent, &name); 684 child = d_alloc(parent, &name);
687 if (child) { 685 if (child) {
686 d_set_d_op(child, &configfs_dentry_ops);
688 d_add(child, NULL); 687 d_add(child, NULL);
689 688
690 ret = configfs_attach_group(&parent_group->cg_item, 689 ret = configfs_attach_group(&parent_group->cg_item,
@@ -1682,6 +1681,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1682 err = -ENOMEM; 1681 err = -ENOMEM;
1683 dentry = d_alloc(configfs_sb->s_root, &name); 1682 dentry = d_alloc(configfs_sb->s_root, &name);
1684 if (dentry) { 1683 if (dentry) {
1684 d_set_d_op(dentry, &configfs_dentry_ops);
1685 d_add(dentry, NULL); 1685 d_add(dentry, NULL);
1686 1686
1687 err = configfs_attach_group(sd->s_element, &group->cg_item, 1687 err = configfs_attach_group(sd->s_element, &group->cg_item,
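
configfs now attaches its dentry operations with d_set_d_op() at each site that is about to hash the dentry (attribute lookup, negative lookup, default groups, subsystem registration) instead of in configfs_make_dirent()/create_dir(). Under rcu-walk a dentry becomes visible to lock-free lookups the moment it is hashed, so everything a reader may consult, d_op included, must be in place before publication. The same publish-after-init rule in a standalone C11 sketch:

#include <stdatomic.h>
#include <stdio.h>

struct ops    { const char *name; };
struct dentry { const struct ops *ops; };

static _Atomic(struct dentry *) hash_slot;   /* what lock-free readers see */

static void publish(struct dentry *d, const struct ops *ops)
{
        d->ops = ops;                                    /* init first ...  */
        atomic_store_explicit(&hash_slot, d,             /* ... then expose */
                              memory_order_release);
}

static void reader(void)
{
        struct dentry *d = atomic_load_explicit(&hash_slot,
                                                memory_order_acquire);
        if (d)
                printf("reader sees ops: %s\n", d->ops->name);
}

int main(void)
{
        static const struct ops my_ops = { "configfs_dentry_ops (model)" };
        static struct dentry d;

        publish(&d, &my_ops);
        reader();
        return 0;
}

The dput() hunk further down makes the same point from the other side: the d_op->d_delete presence check becomes a DCACHE_OP_DELETE flag test, which only works if the operations are wired up through d_set_d_op() rather than by assigning d_op directly.
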
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 253476d78ed8..c83f4768eeaa 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -250,18 +250,14 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
250 struct dentry * dentry = sd->s_dentry; 250 struct dentry * dentry = sd->s_dentry;
251 251
252 if (dentry) { 252 if (dentry) {
253 spin_lock(&dcache_lock);
254 spin_lock(&dentry->d_lock); 253 spin_lock(&dentry->d_lock);
255 if (!(d_unhashed(dentry) && dentry->d_inode)) { 254 if (!(d_unhashed(dentry) && dentry->d_inode)) {
256 dget_locked(dentry); 255 dget_dlock(dentry);
257 __d_drop(dentry); 256 __d_drop(dentry);
258 spin_unlock(&dentry->d_lock); 257 spin_unlock(&dentry->d_lock);
259 spin_unlock(&dcache_lock);
260 simple_unlink(parent->d_inode, dentry); 258 simple_unlink(parent->d_inode, dentry);
261 } else { 259 } else
262 spin_unlock(&dentry->d_lock); 260 spin_unlock(&dentry->d_lock);
263 spin_unlock(&dcache_lock);
264 }
265 } 261 }
266} 262}
267 263
diff --git a/fs/dcache.c b/fs/dcache.c
index 23702a9d4e6d..5699d4c027cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -33,20 +33,58 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h> 35#include <linux/hardirq.h>
36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h>
36#include "internal.h" 38#include "internal.h"
37 39
40/*
41 * Usage:
42 * dcache->d_inode->i_lock protects:
43 * - i_dentry, d_alias, d_inode of aliases
44 * dcache_hash_bucket lock protects:
45 * - the dcache hash table
46 * s_anon bl list spinlock protects:
47 * - the s_anon list (see __d_drop)
48 * dcache_lru_lock protects:
49 * - the dcache lru lists and counters
50 * d_lock protects:
51 * - d_flags
52 * - d_name
53 * - d_lru
54 * - d_count
55 * - d_unhashed()
56 * - d_parent and d_subdirs
 57 * - children's d_child and d_parent
58 * - d_alias, d_inode
59 *
60 * Ordering:
61 * dentry->d_inode->i_lock
62 * dentry->d_lock
63 * dcache_lru_lock
64 * dcache_hash_bucket lock
65 * s_anon lock
66 *
67 * If there is an ancestor relationship:
68 * dentry->d_parent->...->d_parent->d_lock
69 * ...
70 * dentry->d_parent->d_lock
71 * dentry->d_lock
72 *
73 * If no ancestor relationship:
74 * if (dentry1 < dentry2)
75 * dentry1->d_lock
76 * dentry2->d_lock
77 */
38int sysctl_vfs_cache_pressure __read_mostly = 100; 78int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 79EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 80
41 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); 81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
42__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); 82__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
43 83
44EXPORT_SYMBOL(dcache_lock); 84EXPORT_SYMBOL(rename_lock);
45 85
46static struct kmem_cache *dentry_cache __read_mostly; 86static struct kmem_cache *dentry_cache __read_mostly;
47 87
48#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
49
50/* 88/*
51 * This is the single most critical data structure when it comes 89 * This is the single most critical data structure when it comes
52 * to the dcache: the hashtable for lookups. Somebody should try 90 * to the dcache: the hashtable for lookups. Somebody should try
@@ -60,22 +98,51 @@ static struct kmem_cache *dentry_cache __read_mostly;
60 98
61static unsigned int d_hash_mask __read_mostly; 99static unsigned int d_hash_mask __read_mostly;
62static unsigned int d_hash_shift __read_mostly; 100static unsigned int d_hash_shift __read_mostly;
63static struct hlist_head *dentry_hashtable __read_mostly; 101
102struct dcache_hash_bucket {
103 struct hlist_bl_head head;
104};
105static struct dcache_hash_bucket *dentry_hashtable __read_mostly;
106
107static inline struct dcache_hash_bucket *d_hash(struct dentry *parent,
108 unsigned long hash)
109{
110 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
111 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
112 return dentry_hashtable + (hash & D_HASHMASK);
113}
114
115static inline void spin_lock_bucket(struct dcache_hash_bucket *b)
116{
117 bit_spin_lock(0, (unsigned long *)&b->head.first);
118}
119
120static inline void spin_unlock_bucket(struct dcache_hash_bucket *b)
121{
122 __bit_spin_unlock(0, (unsigned long *)&b->head.first);
123}
64 124
65/* Statistics gathering. */ 125/* Statistics gathering. */
66struct dentry_stat_t dentry_stat = { 126struct dentry_stat_t dentry_stat = {
67 .age_limit = 45, 127 .age_limit = 45,
68}; 128};
69 129
70static struct percpu_counter nr_dentry __cacheline_aligned_in_smp; 130static DEFINE_PER_CPU(unsigned int, nr_dentry);
71static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
72 131
73#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 132#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
133static int get_nr_dentry(void)
134{
135 int i;
136 int sum = 0;
137 for_each_possible_cpu(i)
138 sum += per_cpu(nr_dentry, i);
139 return sum < 0 ? 0 : sum;
140}
141
74int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, 142int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
75 size_t *lenp, loff_t *ppos) 143 size_t *lenp, loff_t *ppos)
76{ 144{
77 dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry); 145 dentry_stat.nr_dentry = get_nr_dentry();
78 dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
79 return proc_dointvec(table, write, buffer, lenp, ppos); 146 return proc_dointvec(table, write, buffer, lenp, ppos);
80} 147}
81#endif 148#endif
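
nr_dentry moves from a percpu_counter to a plain DEFINE_PER_CPU(unsigned int) adjusted with this_cpu_* operations (this_cpu_dec() is visible in d_free() below) and only summed when the dentry-state sysctl is read. Allocations and frees can land on different CPUs, so individual per-CPU values may go negative and the unsynchronized sum is only approximate, hence the clamp to zero in get_nr_dentry(). A toy model of that summation:

#include <stdio.h>

#define NR_CPUS 4

static int nr_dentry[NR_CPUS];          /* per-"CPU" counters */

static void inc_on(int cpu) { nr_dentry[cpu]++; }
static void dec_on(int cpu) { nr_dentry[cpu]--; }

static int get_nr_dentry(void)
{
        int sum = 0;
        for (int i = 0; i < NR_CPUS; i++)
                sum += nr_dentry[i];
        return sum < 0 ? 0 : sum;       /* transient skew can go negative */
}

int main(void)
{
        inc_on(0); inc_on(0); inc_on(1);   /* 3 allocations             */
        dec_on(2); dec_on(3);              /* 2 frees, on other "CPUs"  */
        printf("approx nr_dentry = %d\n", get_nr_dentry());
        return 0;
}
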
@@ -91,35 +158,50 @@ static void __d_free(struct rcu_head *head)
91} 158}
92 159
93/* 160/*
94 * no dcache_lock, please. 161 * no locks, please.
95 */ 162 */
96static void d_free(struct dentry *dentry) 163static void d_free(struct dentry *dentry)
97{ 164{
98 percpu_counter_dec(&nr_dentry); 165 BUG_ON(dentry->d_count);
166 this_cpu_dec(nr_dentry);
99 if (dentry->d_op && dentry->d_op->d_release) 167 if (dentry->d_op && dentry->d_op->d_release)
100 dentry->d_op->d_release(dentry); 168 dentry->d_op->d_release(dentry);
101 169
102 /* if dentry was never inserted into hash, immediate free is OK */ 170 /* if dentry was never inserted into hash, immediate free is OK */
103 if (hlist_unhashed(&dentry->d_hash)) 171 if (hlist_bl_unhashed(&dentry->d_hash))
104 __d_free(&dentry->d_u.d_rcu); 172 __d_free(&dentry->d_u.d_rcu);
105 else 173 else
106 call_rcu(&dentry->d_u.d_rcu, __d_free); 174 call_rcu(&dentry->d_u.d_rcu, __d_free);
107} 175}
108 176
177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * After this call, in-progress rcu-walk path lookup will fail. This
180 * should be called after unhashing, and after changing d_inode (if
181 * the dentry has not already been unhashed).
182 */
183static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
184{
185 assert_spin_locked(&dentry->d_lock);
186 /* Go through a barrier */
187 write_seqcount_barrier(&dentry->d_seq);
188}
189
109/* 190/*
110 * Release the dentry's inode, using the filesystem 191 * Release the dentry's inode, using the filesystem
111 * d_iput() operation if defined. 192 * d_iput() operation if defined. Dentry has no refcount
193 * and is unhashed.
112 */ 194 */
113static void dentry_iput(struct dentry * dentry) 195static void dentry_iput(struct dentry * dentry)
114 __releases(dentry->d_lock) 196 __releases(dentry->d_lock)
115 __releases(dcache_lock) 197 __releases(dentry->d_inode->i_lock)
116{ 198{
117 struct inode *inode = dentry->d_inode; 199 struct inode *inode = dentry->d_inode;
118 if (inode) { 200 if (inode) {
119 dentry->d_inode = NULL; 201 dentry->d_inode = NULL;
120 list_del_init(&dentry->d_alias); 202 list_del_init(&dentry->d_alias);
121 spin_unlock(&dentry->d_lock); 203 spin_unlock(&dentry->d_lock);
122 spin_unlock(&dcache_lock); 204 spin_unlock(&inode->i_lock);
123 if (!inode->i_nlink) 205 if (!inode->i_nlink)
124 fsnotify_inoderemove(inode); 206 fsnotify_inoderemove(inode);
125 if (dentry->d_op && dentry->d_op->d_iput) 207 if (dentry->d_op && dentry->d_op->d_iput)
@@ -128,40 +210,72 @@ static void dentry_iput(struct dentry * dentry)
128 iput(inode); 210 iput(inode);
129 } else { 211 } else {
130 spin_unlock(&dentry->d_lock); 212 spin_unlock(&dentry->d_lock);
131 spin_unlock(&dcache_lock);
132 } 213 }
133} 214}
134 215
135/* 216/*
136 * dentry_lru_(add|del|move_tail) must be called with dcache_lock held. 217 * Release the dentry's inode, using the filesystem
218 * d_iput() operation if defined. dentry remains in-use.
219 */
220static void dentry_unlink_inode(struct dentry * dentry)
221 __releases(dentry->d_lock)
222 __releases(dentry->d_inode->i_lock)
223{
224 struct inode *inode = dentry->d_inode;
225 dentry->d_inode = NULL;
226 list_del_init(&dentry->d_alias);
227 dentry_rcuwalk_barrier(dentry);
228 spin_unlock(&dentry->d_lock);
229 spin_unlock(&inode->i_lock);
230 if (!inode->i_nlink)
231 fsnotify_inoderemove(inode);
232 if (dentry->d_op && dentry->d_op->d_iput)
233 dentry->d_op->d_iput(dentry, inode);
234 else
235 iput(inode);
236}
237
238/*
239 * dentry_lru_(add|del|move_tail) must be called with d_lock held.
137 */ 240 */
138static void dentry_lru_add(struct dentry *dentry) 241static void dentry_lru_add(struct dentry *dentry)
139{ 242{
140 if (list_empty(&dentry->d_lru)) { 243 if (list_empty(&dentry->d_lru)) {
244 spin_lock(&dcache_lru_lock);
141 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 245 list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
142 dentry->d_sb->s_nr_dentry_unused++; 246 dentry->d_sb->s_nr_dentry_unused++;
143 percpu_counter_inc(&nr_dentry_unused); 247 dentry_stat.nr_unused++;
248 spin_unlock(&dcache_lru_lock);
144 } 249 }
145} 250}
146 251
252static void __dentry_lru_del(struct dentry *dentry)
253{
254 list_del_init(&dentry->d_lru);
255 dentry->d_sb->s_nr_dentry_unused--;
256 dentry_stat.nr_unused--;
257}
258
147static void dentry_lru_del(struct dentry *dentry) 259static void dentry_lru_del(struct dentry *dentry)
148{ 260{
149 if (!list_empty(&dentry->d_lru)) { 261 if (!list_empty(&dentry->d_lru)) {
150 list_del_init(&dentry->d_lru); 262 spin_lock(&dcache_lru_lock);
151 dentry->d_sb->s_nr_dentry_unused--; 263 __dentry_lru_del(dentry);
152 percpu_counter_dec(&nr_dentry_unused); 264 spin_unlock(&dcache_lru_lock);
153 } 265 }
154} 266}
155 267
156static void dentry_lru_move_tail(struct dentry *dentry) 268static void dentry_lru_move_tail(struct dentry *dentry)
157{ 269{
270 spin_lock(&dcache_lru_lock);
158 if (list_empty(&dentry->d_lru)) { 271 if (list_empty(&dentry->d_lru)) {
159 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 272 list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
160 dentry->d_sb->s_nr_dentry_unused++; 273 dentry->d_sb->s_nr_dentry_unused++;
161 percpu_counter_inc(&nr_dentry_unused); 274 dentry_stat.nr_unused++;
162 } else { 275 } else {
163 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); 276 list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
164 } 277 }
278 spin_unlock(&dcache_lru_lock);
165} 279}
166 280
167/** 281/**
@@ -171,22 +285,115 @@ static void dentry_lru_move_tail(struct dentry *dentry)
171 * The dentry must already be unhashed and removed from the LRU. 285 * The dentry must already be unhashed and removed from the LRU.
172 * 286 *
173 * If this is the root of the dentry tree, return NULL. 287 * If this is the root of the dentry tree, return NULL.
288 *
289 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
290 * d_kill.
174 */ 291 */
175static struct dentry *d_kill(struct dentry *dentry) 292static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
176 __releases(dentry->d_lock) 293 __releases(dentry->d_lock)
177 __releases(dcache_lock) 294 __releases(parent->d_lock)
295 __releases(dentry->d_inode->i_lock)
178{ 296{
179 struct dentry *parent; 297 dentry->d_parent = NULL;
180
181 list_del(&dentry->d_u.d_child); 298 list_del(&dentry->d_u.d_child);
182 /*drops the locks, at that point nobody can reach this dentry */ 299 if (parent)
300 spin_unlock(&parent->d_lock);
183 dentry_iput(dentry); 301 dentry_iput(dentry);
302 /*
303 * dentry_iput drops the locks, at which point nobody (except
304 * transient RCU lookups) can reach this dentry.
305 */
306 d_free(dentry);
307 return parent;
308}
309
310/**
311 * d_drop - drop a dentry
312 * @dentry: dentry to drop
313 *
314 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
315 * be found through a VFS lookup any more. Note that this is different from
316 * deleting the dentry - d_delete will try to mark the dentry negative if
317 * possible, giving a successful _negative_ lookup, while d_drop will
318 * just make the cache lookup fail.
319 *
320 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
321 * reason (NFS timeouts or autofs deletes).
322 *
323 * __d_drop requires dentry->d_lock.
324 */
325void __d_drop(struct dentry *dentry)
326{
327 if (!(dentry->d_flags & DCACHE_UNHASHED)) {
328 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) {
329 bit_spin_lock(0,
330 (unsigned long *)&dentry->d_sb->s_anon.first);
331 dentry->d_flags |= DCACHE_UNHASHED;
332 hlist_bl_del_init(&dentry->d_hash);
333 __bit_spin_unlock(0,
334 (unsigned long *)&dentry->d_sb->s_anon.first);
335 } else {
336 struct dcache_hash_bucket *b;
337 b = d_hash(dentry->d_parent, dentry->d_name.hash);
338 spin_lock_bucket(b);
339 /*
340 * We may not actually need to put DCACHE_UNHASHED
341 * manipulations under the hash lock, but follow
342 * the principle of least surprise.
343 */
344 dentry->d_flags |= DCACHE_UNHASHED;
345 hlist_bl_del_rcu(&dentry->d_hash);
346 spin_unlock_bucket(b);
347 dentry_rcuwalk_barrier(dentry);
348 }
349 }
350}
351EXPORT_SYMBOL(__d_drop);
352
353void d_drop(struct dentry *dentry)
354{
355 spin_lock(&dentry->d_lock);
356 __d_drop(dentry);
357 spin_unlock(&dentry->d_lock);
358}
359EXPORT_SYMBOL(d_drop);
360
361/*
362 * Finish off a dentry we've decided to kill.
363 * dentry->d_lock must be held, returns with it unlocked.
364 * If ref is non-zero, then decrement the refcount too.
365 * Returns dentry requiring refcount drop, or NULL if we're done.
366 */
367static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
368 __releases(dentry->d_lock)
369{
370 struct inode *inode;
371 struct dentry *parent;
372
373 inode = dentry->d_inode;
374 if (inode && !spin_trylock(&inode->i_lock)) {
375relock:
376 spin_unlock(&dentry->d_lock);
377 cpu_relax();
378 return dentry; /* try again with same dentry */
379 }
184 if (IS_ROOT(dentry)) 380 if (IS_ROOT(dentry))
185 parent = NULL; 381 parent = NULL;
186 else 382 else
187 parent = dentry->d_parent; 383 parent = dentry->d_parent;
188 d_free(dentry); 384 if (parent && !spin_trylock(&parent->d_lock)) {
189 return parent; 385 if (inode)
386 spin_unlock(&inode->i_lock);
387 goto relock;
388 }
389
390 if (ref)
391 dentry->d_count--;
392 /* if dentry was on the d_lru list delete it from there */
393 dentry_lru_del(dentry);
394 /* if it was on the hash then remove it */
395 __d_drop(dentry);
396 return d_kill(dentry, parent);
190} 397}
191 398
192/* 399/*
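
dentry_kill() must acquire the inode's i_lock and the parent's d_lock while already holding the victim's d_lock, which runs against the documented ordering above (i_lock before d_lock, parent before child). It therefore uses spin_trylock(), and on failure drops what it holds and lets the caller retry the same dentry after cpu_relax(). The same trylock-and-back-off idea with POSIX mutexes (single-threaded here, so the retry path is not actually exercised):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t child  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t parent = PTHREAD_MUTEX_INITIALIZER;

/* Correct order is parent then child; we already hold child. */
static int kill_child_locked(void)
{
        if (pthread_mutex_trylock(&parent) != 0) {
                /* would deadlock if we blocked here: back off and retry */
                pthread_mutex_unlock(&child);
                sched_yield();                  /* stands in for cpu_relax() */
                return -1;                      /* caller retries            */
        }
        printf("both locks held, tearing down child\n");
        pthread_mutex_unlock(&parent);
        pthread_mutex_unlock(&child);
        return 0;
}

int main(void)
{
        for (;;) {
                pthread_mutex_lock(&child);     /* like holding d_lock */
                if (kill_child_locked() == 0)
                        break;
        }
        return 0;
}
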
@@ -214,34 +421,26 @@ static struct dentry *d_kill(struct dentry *dentry)
214 * call the dentry unlink method as well as removing it from the queues and 421 * call the dentry unlink method as well as removing it from the queues and
215 * releasing its resources. If the parent dentries were scheduled for release 422 * releasing its resources. If the parent dentries were scheduled for release
216 * they too may now get deleted. 423 * they too may now get deleted.
217 *
218 * no dcache lock, please.
219 */ 424 */
220
221void dput(struct dentry *dentry) 425void dput(struct dentry *dentry)
222{ 426{
223 if (!dentry) 427 if (!dentry)
224 return; 428 return;
225 429
226repeat: 430repeat:
227 if (atomic_read(&dentry->d_count) == 1) 431 if (dentry->d_count == 1)
228 might_sleep(); 432 might_sleep();
229 if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
230 return;
231
232 spin_lock(&dentry->d_lock); 433 spin_lock(&dentry->d_lock);
233 if (atomic_read(&dentry->d_count)) { 434 BUG_ON(!dentry->d_count);
435 if (dentry->d_count > 1) {
436 dentry->d_count--;
234 spin_unlock(&dentry->d_lock); 437 spin_unlock(&dentry->d_lock);
235 spin_unlock(&dcache_lock);
236 return; 438 return;
237 } 439 }
238 440
239 /* 441 if (dentry->d_flags & DCACHE_OP_DELETE) {
240 * AV: ->d_delete() is _NOT_ allowed to block now.
241 */
242 if (dentry->d_op && dentry->d_op->d_delete) {
243 if (dentry->d_op->d_delete(dentry)) 442 if (dentry->d_op->d_delete(dentry))
244 goto unhash_it; 443 goto kill_it;
245 } 444 }
246 445
247 /* Unreachable? Get rid of it */ 446 /* Unreachable? Get rid of it */
@@ -252,16 +451,12 @@ repeat:
252 dentry->d_flags |= DCACHE_REFERENCED; 451 dentry->d_flags |= DCACHE_REFERENCED;
253 dentry_lru_add(dentry); 452 dentry_lru_add(dentry);
254 453
255 spin_unlock(&dentry->d_lock); 454 dentry->d_count--;
256 spin_unlock(&dcache_lock); 455 spin_unlock(&dentry->d_lock);
257 return; 456 return;
258 457
259unhash_it:
260 __d_drop(dentry);
261kill_it: 458kill_it:
262 /* if dentry was on the d_lru list delete it from there */ 459 dentry = dentry_kill(dentry, 1);
263 dentry_lru_del(dentry);
264 dentry = d_kill(dentry);
265 if (dentry) 460 if (dentry)
266 goto repeat; 461 goto repeat;
267} 462}
@@ -284,9 +479,9 @@ int d_invalidate(struct dentry * dentry)
284 /* 479 /*
285 * If it's already been dropped, return OK. 480 * If it's already been dropped, return OK.
286 */ 481 */
287 spin_lock(&dcache_lock); 482 spin_lock(&dentry->d_lock);
288 if (d_unhashed(dentry)) { 483 if (d_unhashed(dentry)) {
289 spin_unlock(&dcache_lock); 484 spin_unlock(&dentry->d_lock);
290 return 0; 485 return 0;
291 } 486 }
292 /* 487 /*
@@ -294,9 +489,9 @@ int d_invalidate(struct dentry * dentry)
294 * to get rid of unused child entries. 489 * to get rid of unused child entries.
295 */ 490 */
296 if (!list_empty(&dentry->d_subdirs)) { 491 if (!list_empty(&dentry->d_subdirs)) {
297 spin_unlock(&dcache_lock); 492 spin_unlock(&dentry->d_lock);
298 shrink_dcache_parent(dentry); 493 shrink_dcache_parent(dentry);
299 spin_lock(&dcache_lock); 494 spin_lock(&dentry->d_lock);
300 } 495 }
301 496
302 /* 497 /*
@@ -309,35 +504,61 @@ int d_invalidate(struct dentry * dentry)
309 * we might still populate it if it was a 504 * we might still populate it if it was a
310 * working directory or similar). 505 * working directory or similar).
311 */ 506 */
312 spin_lock(&dentry->d_lock); 507 if (dentry->d_count > 1) {
313 if (atomic_read(&dentry->d_count) > 1) {
314 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 508 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) {
315 spin_unlock(&dentry->d_lock); 509 spin_unlock(&dentry->d_lock);
316 spin_unlock(&dcache_lock);
317 return -EBUSY; 510 return -EBUSY;
318 } 511 }
319 } 512 }
320 513
321 __d_drop(dentry); 514 __d_drop(dentry);
322 spin_unlock(&dentry->d_lock); 515 spin_unlock(&dentry->d_lock);
323 spin_unlock(&dcache_lock);
324 return 0; 516 return 0;
325} 517}
326EXPORT_SYMBOL(d_invalidate); 518EXPORT_SYMBOL(d_invalidate);
327 519
328/* This should be called _only_ with dcache_lock held */ 520/* This must be called with d_lock held */
329static inline struct dentry * __dget_locked(struct dentry *dentry) 521static inline void __dget_dlock(struct dentry *dentry)
330{ 522{
331 atomic_inc(&dentry->d_count); 523 dentry->d_count++;
332 dentry_lru_del(dentry);
333 return dentry;
334} 524}
335 525
336struct dentry * dget_locked(struct dentry *dentry) 526static inline void __dget(struct dentry *dentry)
337{ 527{
338 return __dget_locked(dentry); 528 spin_lock(&dentry->d_lock);
529 __dget_dlock(dentry);
530 spin_unlock(&dentry->d_lock);
531}
532
533struct dentry *dget_parent(struct dentry *dentry)
534{
535 struct dentry *ret;
536
537repeat:
538 /*
539 * Don't need rcu_dereference because we re-check it was correct under
540 * the lock.
541 */
542 rcu_read_lock();
543 ret = dentry->d_parent;
544 if (!ret) {
545 rcu_read_unlock();
546 goto out;
547 }
548 spin_lock(&ret->d_lock);
549 if (unlikely(ret != dentry->d_parent)) {
550 spin_unlock(&ret->d_lock);
551 rcu_read_unlock();
552 goto repeat;
553 }
554 rcu_read_unlock();
555 BUG_ON(!ret->d_count);
556 ret->d_count++;
557 spin_unlock(&ret->d_lock);
558out:
559 return ret;
339} 560}
340EXPORT_SYMBOL(dget_locked); 561EXPORT_SYMBOL(dget_parent);
341 562
342/** 563/**
343 * d_find_alias - grab a hashed alias of inode 564 * d_find_alias - grab a hashed alias of inode
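
dget_parent() is new in this form: with dcache_lock gone, d_parent can change underneath us during a rename, so the function samples d_parent inside rcu_read_lock(), takes the candidate's d_lock, and only trusts the pointer if it still equals dentry->d_parent once the lock is held, retrying otherwise. A compact C11 model of that read/lock/re-check loop, with an atomic pointer standing in for the RCU-protected d_parent field:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct node {
        pthread_mutex_t lock;
        int count;
};

struct child {
        _Atomic(struct node *) parent;   /* may be changed by a "rename" */
};

static struct node *get_parent(struct child *c)
{
        struct node *p;

        for (;;) {
                p = atomic_load(&c->parent);        /* racy sample        */
                pthread_mutex_lock(&p->lock);
                if (p == atomic_load(&c->parent))   /* still the parent?  */
                        break;
                pthread_mutex_unlock(&p->lock);     /* changed: try again */
        }
        p->count++;                                 /* take the reference */
        pthread_mutex_unlock(&p->lock);
        return p;
}

int main(void)
{
        struct node n;
        struct child c;

        pthread_mutex_init(&n.lock, NULL);
        n.count = 1;
        atomic_init(&c.parent, &n);

        struct node *p = get_parent(&c);
        printf("parent refcount now %d\n", p->count);
        return 0;
}
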
@@ -355,42 +576,51 @@ EXPORT_SYMBOL(dget_locked);
355 * any other hashed alias over that one unless @want_discon is set, 576 * any other hashed alias over that one unless @want_discon is set,
356 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 577 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
357 */ 578 */
358 579static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
359static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
360{ 580{
361 struct list_head *head, *next, *tmp; 581 struct dentry *alias, *discon_alias;
362 struct dentry *alias, *discon_alias=NULL;
363 582
364 head = &inode->i_dentry; 583again:
365 next = inode->i_dentry.next; 584 discon_alias = NULL;
366 while (next != head) { 585 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
367 tmp = next; 586 spin_lock(&alias->d_lock);
368 next = tmp->next;
369 prefetch(next);
370 alias = list_entry(tmp, struct dentry, d_alias);
371 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 587 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
372 if (IS_ROOT(alias) && 588 if (IS_ROOT(alias) &&
373 (alias->d_flags & DCACHE_DISCONNECTED)) 589 (alias->d_flags & DCACHE_DISCONNECTED)) {
374 discon_alias = alias; 590 discon_alias = alias;
375 else if (!want_discon) { 591 } else if (!want_discon) {
376 __dget_locked(alias); 592 __dget_dlock(alias);
593 spin_unlock(&alias->d_lock);
594 return alias;
595 }
596 }
597 spin_unlock(&alias->d_lock);
598 }
599 if (discon_alias) {
600 alias = discon_alias;
601 spin_lock(&alias->d_lock);
602 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
603 if (IS_ROOT(alias) &&
604 (alias->d_flags & DCACHE_DISCONNECTED)) {
605 __dget_dlock(alias);
606 spin_unlock(&alias->d_lock);
377 return alias; 607 return alias;
378 } 608 }
379 } 609 }
610 spin_unlock(&alias->d_lock);
611 goto again;
380 } 612 }
381 if (discon_alias) 613 return NULL;
382 __dget_locked(discon_alias);
383 return discon_alias;
384} 614}
385 615
386struct dentry * d_find_alias(struct inode *inode) 616struct dentry *d_find_alias(struct inode *inode)
387{ 617{
388 struct dentry *de = NULL; 618 struct dentry *de = NULL;
389 619
390 if (!list_empty(&inode->i_dentry)) { 620 if (!list_empty(&inode->i_dentry)) {
391 spin_lock(&dcache_lock); 621 spin_lock(&inode->i_lock);
392 de = __d_find_alias(inode, 0); 622 de = __d_find_alias(inode, 0);
393 spin_unlock(&dcache_lock); 623 spin_unlock(&inode->i_lock);
394 } 624 }
395 return de; 625 return de;
396} 626}
@@ -404,54 +634,61 @@ void d_prune_aliases(struct inode *inode)
404{ 634{
405 struct dentry *dentry; 635 struct dentry *dentry;
406restart: 636restart:
407 spin_lock(&dcache_lock); 637 spin_lock(&inode->i_lock);
408 list_for_each_entry(dentry, &inode->i_dentry, d_alias) { 638 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
409 spin_lock(&dentry->d_lock); 639 spin_lock(&dentry->d_lock);
410 if (!atomic_read(&dentry->d_count)) { 640 if (!dentry->d_count) {
411 __dget_locked(dentry); 641 __dget_dlock(dentry);
412 __d_drop(dentry); 642 __d_drop(dentry);
413 spin_unlock(&dentry->d_lock); 643 spin_unlock(&dentry->d_lock);
414 spin_unlock(&dcache_lock); 644 spin_unlock(&inode->i_lock);
415 dput(dentry); 645 dput(dentry);
416 goto restart; 646 goto restart;
417 } 647 }
418 spin_unlock(&dentry->d_lock); 648 spin_unlock(&dentry->d_lock);
419 } 649 }
420 spin_unlock(&dcache_lock); 650 spin_unlock(&inode->i_lock);
421} 651}
422EXPORT_SYMBOL(d_prune_aliases); 652EXPORT_SYMBOL(d_prune_aliases);
423 653
424/* 654/*
425 * Throw away a dentry - free the inode, dput the parent. This requires that 655 * Try to throw away a dentry - free the inode, dput the parent.
426 * the LRU list has already been removed. 656 * Requires dentry->d_lock is held, and dentry->d_count == 0.
657 * Releases dentry->d_lock.
427 * 658 *
 428 * Try to prune ancestors as well. This is necessary to prevent 659 * This may fail if locks cannot be acquired; no problem, just try again.
429 * quadratic behavior of shrink_dcache_parent(), but is also expected
430 * to be beneficial in reducing dentry cache fragmentation.
431 */ 660 */
432static void prune_one_dentry(struct dentry * dentry) 661static void try_prune_one_dentry(struct dentry *dentry)
433 __releases(dentry->d_lock) 662 __releases(dentry->d_lock)
434 __releases(dcache_lock)
435 __acquires(dcache_lock)
436{ 663{
437 __d_drop(dentry); 664 struct dentry *parent;
438 dentry = d_kill(dentry);
439 665
666 parent = dentry_kill(dentry, 0);
440 /* 667 /*
441 * Prune ancestors. Locking is simpler than in dput(), 668 * If dentry_kill returns NULL, we have nothing more to do.
442 * because dcache_lock needs to be taken anyway. 669 * if it returns the same dentry, trylocks failed. In either
670 * case, just loop again.
671 *
672 * Otherwise, we need to prune ancestors too. This is necessary
673 * to prevent quadratic behavior of shrink_dcache_parent(), but
674 * is also expected to be beneficial in reducing dentry cache
675 * fragmentation.
443 */ 676 */
444 spin_lock(&dcache_lock); 677 if (!parent)
678 return;
679 if (parent == dentry)
680 return;
681
682 /* Prune ancestors. */
683 dentry = parent;
445 while (dentry) { 684 while (dentry) {
446 if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) 685 spin_lock(&dentry->d_lock);
686 if (dentry->d_count > 1) {
687 dentry->d_count--;
688 spin_unlock(&dentry->d_lock);
447 return; 689 return;
448 690 }
449 if (dentry->d_op && dentry->d_op->d_delete) 691 dentry = dentry_kill(dentry, 1);
450 dentry->d_op->d_delete(dentry);
451 dentry_lru_del(dentry);
452 __d_drop(dentry);
453 dentry = d_kill(dentry);
454 spin_lock(&dcache_lock);
455 } 692 }
456} 693}
457 694
@@ -459,24 +696,35 @@ static void shrink_dentry_list(struct list_head *list)
459{ 696{
460 struct dentry *dentry; 697 struct dentry *dentry;
461 698
462 while (!list_empty(list)) { 699 rcu_read_lock();
463 dentry = list_entry(list->prev, struct dentry, d_lru); 700 for (;;) {
464 dentry_lru_del(dentry); 701 dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
702 if (&dentry->d_lru == list)
703 break; /* empty */
704 spin_lock(&dentry->d_lock);
705 if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
706 spin_unlock(&dentry->d_lock);
707 continue;
708 }
465 709
466 /* 710 /*
467 * We found an inuse dentry which was not removed from 711 * We found an inuse dentry which was not removed from
468 * the LRU because of laziness during lookup. Do not free 712 * the LRU because of laziness during lookup. Do not free
469 * it - just keep it off the LRU list. 713 * it - just keep it off the LRU list.
470 */ 714 */
471 spin_lock(&dentry->d_lock); 715 if (dentry->d_count) {
472 if (atomic_read(&dentry->d_count)) { 716 dentry_lru_del(dentry);
473 spin_unlock(&dentry->d_lock); 717 spin_unlock(&dentry->d_lock);
474 continue; 718 continue;
475 } 719 }
476 prune_one_dentry(dentry); 720
477 /* dentry->d_lock was dropped in prune_one_dentry() */ 721 rcu_read_unlock();
478 cond_resched_lock(&dcache_lock); 722
723 try_prune_one_dentry(dentry);
724
725 rcu_read_lock();
479 } 726 }
727 rcu_read_unlock();
480} 728}
481 729
482/** 730/**
@@ -495,42 +743,44 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
495 LIST_HEAD(tmp); 743 LIST_HEAD(tmp);
496 int cnt = *count; 744 int cnt = *count;
497 745
498 spin_lock(&dcache_lock); 746relock:
747 spin_lock(&dcache_lru_lock);
499 while (!list_empty(&sb->s_dentry_lru)) { 748 while (!list_empty(&sb->s_dentry_lru)) {
500 dentry = list_entry(sb->s_dentry_lru.prev, 749 dentry = list_entry(sb->s_dentry_lru.prev,
501 struct dentry, d_lru); 750 struct dentry, d_lru);
502 BUG_ON(dentry->d_sb != sb); 751 BUG_ON(dentry->d_sb != sb);
503 752
753 if (!spin_trylock(&dentry->d_lock)) {
754 spin_unlock(&dcache_lru_lock);
755 cpu_relax();
756 goto relock;
757 }
758
504 /* 759 /*
505 * If we are honouring the DCACHE_REFERENCED flag and the 760 * If we are honouring the DCACHE_REFERENCED flag and the
506 * dentry has this flag set, don't free it. Clear the flag 761 * dentry has this flag set, don't free it. Clear the flag
507 * and put it back on the LRU. 762 * and put it back on the LRU.
508 */ 763 */
509 if (flags & DCACHE_REFERENCED) { 764 if (flags & DCACHE_REFERENCED &&
510 spin_lock(&dentry->d_lock); 765 dentry->d_flags & DCACHE_REFERENCED) {
511 if (dentry->d_flags & DCACHE_REFERENCED) { 766 dentry->d_flags &= ~DCACHE_REFERENCED;
512 dentry->d_flags &= ~DCACHE_REFERENCED; 767 list_move(&dentry->d_lru, &referenced);
513 list_move(&dentry->d_lru, &referenced);
514 spin_unlock(&dentry->d_lock);
515 cond_resched_lock(&dcache_lock);
516 continue;
517 }
518 spin_unlock(&dentry->d_lock); 768 spin_unlock(&dentry->d_lock);
769 } else {
770 list_move_tail(&dentry->d_lru, &tmp);
771 spin_unlock(&dentry->d_lock);
772 if (!--cnt)
773 break;
519 } 774 }
520 775 cond_resched_lock(&dcache_lru_lock);
521 list_move_tail(&dentry->d_lru, &tmp);
522 if (!--cnt)
523 break;
524 cond_resched_lock(&dcache_lock);
525 } 776 }
526
527 *count = cnt;
528 shrink_dentry_list(&tmp);
529
530 if (!list_empty(&referenced)) 777 if (!list_empty(&referenced))
531 list_splice(&referenced, &sb->s_dentry_lru); 778 list_splice(&referenced, &sb->s_dentry_lru);
532 spin_unlock(&dcache_lock); 779 spin_unlock(&dcache_lru_lock);
533 780
781 shrink_dentry_list(&tmp);
782
783 *count = cnt;
534} 784}
535 785
536/** 786/**
@@ -546,13 +796,12 @@ static void prune_dcache(int count)
546{ 796{
547 struct super_block *sb, *p = NULL; 797 struct super_block *sb, *p = NULL;
548 int w_count; 798 int w_count;
549 int unused = percpu_counter_sum_positive(&nr_dentry_unused); 799 int unused = dentry_stat.nr_unused;
550 int prune_ratio; 800 int prune_ratio;
551 int pruned; 801 int pruned;
552 802
553 if (unused == 0 || count == 0) 803 if (unused == 0 || count == 0)
554 return; 804 return;
555 spin_lock(&dcache_lock);
556 if (count >= unused) 805 if (count >= unused)
557 prune_ratio = 1; 806 prune_ratio = 1;
558 else 807 else
@@ -589,11 +838,9 @@ static void prune_dcache(int count)
589 if (down_read_trylock(&sb->s_umount)) { 838 if (down_read_trylock(&sb->s_umount)) {
590 if ((sb->s_root != NULL) && 839 if ((sb->s_root != NULL) &&
591 (!list_empty(&sb->s_dentry_lru))) { 840 (!list_empty(&sb->s_dentry_lru))) {
592 spin_unlock(&dcache_lock);
593 __shrink_dcache_sb(sb, &w_count, 841 __shrink_dcache_sb(sb, &w_count,
594 DCACHE_REFERENCED); 842 DCACHE_REFERENCED);
595 pruned -= w_count; 843 pruned -= w_count;
596 spin_lock(&dcache_lock);
597 } 844 }
598 up_read(&sb->s_umount); 845 up_read(&sb->s_umount);
599 } 846 }
@@ -609,7 +856,6 @@ static void prune_dcache(int count)
609 if (p) 856 if (p)
610 __put_super(p); 857 __put_super(p);
611 spin_unlock(&sb_lock); 858 spin_unlock(&sb_lock);
612 spin_unlock(&dcache_lock);
613} 859}
614 860
615/** 861/**
@@ -623,12 +869,14 @@ void shrink_dcache_sb(struct super_block *sb)
623{ 869{
624 LIST_HEAD(tmp); 870 LIST_HEAD(tmp);
625 871
626 spin_lock(&dcache_lock); 872 spin_lock(&dcache_lru_lock);
627 while (!list_empty(&sb->s_dentry_lru)) { 873 while (!list_empty(&sb->s_dentry_lru)) {
628 list_splice_init(&sb->s_dentry_lru, &tmp); 874 list_splice_init(&sb->s_dentry_lru, &tmp);
875 spin_unlock(&dcache_lru_lock);
629 shrink_dentry_list(&tmp); 876 shrink_dentry_list(&tmp);
877 spin_lock(&dcache_lru_lock);
630 } 878 }
631 spin_unlock(&dcache_lock); 879 spin_unlock(&dcache_lru_lock);
632} 880}
633EXPORT_SYMBOL(shrink_dcache_sb); 881EXPORT_SYMBOL(shrink_dcache_sb);
634 882
@@ -645,10 +893,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
645 BUG_ON(!IS_ROOT(dentry)); 893 BUG_ON(!IS_ROOT(dentry));
646 894
647 /* detach this root from the system */ 895 /* detach this root from the system */
648 spin_lock(&dcache_lock); 896 spin_lock(&dentry->d_lock);
649 dentry_lru_del(dentry); 897 dentry_lru_del(dentry);
650 __d_drop(dentry); 898 __d_drop(dentry);
651 spin_unlock(&dcache_lock); 899 spin_unlock(&dentry->d_lock);
652 900
653 for (;;) { 901 for (;;) {
654 /* descend to the first leaf in the current subtree */ 902 /* descend to the first leaf in the current subtree */
@@ -657,14 +905,16 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
657 905
658 /* this is a branch with children - detach all of them 906 /* this is a branch with children - detach all of them
659 * from the system in one go */ 907 * from the system in one go */
660 spin_lock(&dcache_lock); 908 spin_lock(&dentry->d_lock);
661 list_for_each_entry(loop, &dentry->d_subdirs, 909 list_for_each_entry(loop, &dentry->d_subdirs,
662 d_u.d_child) { 910 d_u.d_child) {
911 spin_lock_nested(&loop->d_lock,
912 DENTRY_D_LOCK_NESTED);
663 dentry_lru_del(loop); 913 dentry_lru_del(loop);
664 __d_drop(loop); 914 __d_drop(loop);
665 cond_resched_lock(&dcache_lock); 915 spin_unlock(&loop->d_lock);
666 } 916 }
667 spin_unlock(&dcache_lock); 917 spin_unlock(&dentry->d_lock);
668 918
669 /* move to the first child */ 919 /* move to the first child */
670 dentry = list_entry(dentry->d_subdirs.next, 920 dentry = list_entry(dentry->d_subdirs.next,
@@ -676,7 +926,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
676 do { 926 do {
677 struct inode *inode; 927 struct inode *inode;
678 928
679 if (atomic_read(&dentry->d_count) != 0) { 929 if (dentry->d_count != 0) {
680 printk(KERN_ERR 930 printk(KERN_ERR
681 "BUG: Dentry %p{i=%lx,n=%s}" 931 "BUG: Dentry %p{i=%lx,n=%s}"
682 " still in use (%d)" 932 " still in use (%d)"
@@ -685,20 +935,23 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
685 dentry->d_inode ? 935 dentry->d_inode ?
686 dentry->d_inode->i_ino : 0UL, 936 dentry->d_inode->i_ino : 0UL,
687 dentry->d_name.name, 937 dentry->d_name.name,
688 atomic_read(&dentry->d_count), 938 dentry->d_count,
689 dentry->d_sb->s_type->name, 939 dentry->d_sb->s_type->name,
690 dentry->d_sb->s_id); 940 dentry->d_sb->s_id);
691 BUG(); 941 BUG();
692 } 942 }
693 943
694 if (IS_ROOT(dentry)) 944 if (IS_ROOT(dentry)) {
695 parent = NULL; 945 parent = NULL;
696 else { 946 list_del(&dentry->d_u.d_child);
947 } else {
697 parent = dentry->d_parent; 948 parent = dentry->d_parent;
698 atomic_dec(&parent->d_count); 949 spin_lock(&parent->d_lock);
950 parent->d_count--;
951 list_del(&dentry->d_u.d_child);
952 spin_unlock(&parent->d_lock);
699 } 953 }
700 954
701 list_del(&dentry->d_u.d_child);
702 detached++; 955 detached++;
703 956
704 inode = dentry->d_inode; 957 inode = dentry->d_inode;
@@ -728,8 +981,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
728 981
729/* 982/*
730 * destroy the dentries attached to a superblock on unmounting 983 * destroy the dentries attached to a superblock on unmounting
731 * - we don't need to use dentry->d_lock, and only need dcache_lock when 984 * - we don't need to use dentry->d_lock because:
732 * removing the dentry from the system lists and hashes because:
733 * - the superblock is detached from all mountings and open files, so the 985 * - the superblock is detached from all mountings and open files, so the
734 * dentry trees will not be rearranged by the VFS 986 * dentry trees will not be rearranged by the VFS
735 * - s_umount is write-locked, so the memory pressure shrinker will ignore 987 * - s_umount is write-locked, so the memory pressure shrinker will ignore
@@ -746,11 +998,13 @@ void shrink_dcache_for_umount(struct super_block *sb)
746 998
747 dentry = sb->s_root; 999 dentry = sb->s_root;
748 sb->s_root = NULL; 1000 sb->s_root = NULL;
749 atomic_dec(&dentry->d_count); 1001 spin_lock(&dentry->d_lock);
1002 dentry->d_count--;
1003 spin_unlock(&dentry->d_lock);
750 shrink_dcache_for_umount_subtree(dentry); 1004 shrink_dcache_for_umount_subtree(dentry);
751 1005
752 while (!hlist_empty(&sb->s_anon)) { 1006 while (!hlist_bl_empty(&sb->s_anon)) {
753 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); 1007 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
754 shrink_dcache_for_umount_subtree(dentry); 1008 shrink_dcache_for_umount_subtree(dentry);
755 } 1009 }
756} 1010}
@@ -768,15 +1022,20 @@ void shrink_dcache_for_umount(struct super_block *sb)
768 * Return true if the parent or its subdirectories contain 1022 * Return true if the parent or its subdirectories contain
769 * a mount point 1023 * a mount point
770 */ 1024 */
771
772int have_submounts(struct dentry *parent) 1025int have_submounts(struct dentry *parent)
773{ 1026{
774 struct dentry *this_parent = parent; 1027 struct dentry *this_parent;
775 struct list_head *next; 1028 struct list_head *next;
1029 unsigned seq;
1030 int locked = 0;
1031
1032 seq = read_seqbegin(&rename_lock);
1033again:
1034 this_parent = parent;
776 1035
777 spin_lock(&dcache_lock);
778 if (d_mountpoint(parent)) 1036 if (d_mountpoint(parent))
779 goto positive; 1037 goto positive;
1038 spin_lock(&this_parent->d_lock);
780repeat: 1039repeat:
781 next = this_parent->d_subdirs.next; 1040 next = this_parent->d_subdirs.next;
782resume: 1041resume:
@@ -784,27 +1043,65 @@ resume:
784 struct list_head *tmp = next; 1043 struct list_head *tmp = next;
785 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1044 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
786 next = tmp->next; 1045 next = tmp->next;
1046
1047 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
787 /* Have we found a mount point ? */ 1048 /* Have we found a mount point ? */
788 if (d_mountpoint(dentry)) 1049 if (d_mountpoint(dentry)) {
1050 spin_unlock(&dentry->d_lock);
1051 spin_unlock(&this_parent->d_lock);
789 goto positive; 1052 goto positive;
1053 }
790 if (!list_empty(&dentry->d_subdirs)) { 1054 if (!list_empty(&dentry->d_subdirs)) {
1055 spin_unlock(&this_parent->d_lock);
1056 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
791 this_parent = dentry; 1057 this_parent = dentry;
1058 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
792 goto repeat; 1059 goto repeat;
793 } 1060 }
1061 spin_unlock(&dentry->d_lock);
794 } 1062 }
795 /* 1063 /*
796 * All done at this level ... ascend and resume the search. 1064 * All done at this level ... ascend and resume the search.
797 */ 1065 */
798 if (this_parent != parent) { 1066 if (this_parent != parent) {
799 next = this_parent->d_u.d_child.next; 1067 struct dentry *tmp;
800 this_parent = this_parent->d_parent; 1068 struct dentry *child;
1069
1070 tmp = this_parent->d_parent;
1071 rcu_read_lock();
1072 spin_unlock(&this_parent->d_lock);
1073 child = this_parent;
1074 this_parent = tmp;
1075 spin_lock(&this_parent->d_lock);
1076 /* might go back up the wrong parent if we have had a rename
1077 * or deletion */
1078 if (this_parent != child->d_parent ||
1079 (!locked && read_seqretry(&rename_lock, seq))) {
1080 spin_unlock(&this_parent->d_lock);
1081 rcu_read_unlock();
1082 goto rename_retry;
1083 }
1084 rcu_read_unlock();
1085 next = child->d_u.d_child.next;
801 goto resume; 1086 goto resume;
802 } 1087 }
803 spin_unlock(&dcache_lock); 1088 spin_unlock(&this_parent->d_lock);
1089 if (!locked && read_seqretry(&rename_lock, seq))
1090 goto rename_retry;
1091 if (locked)
1092 write_sequnlock(&rename_lock);
804 return 0; /* No mount points found in tree */ 1093 return 0; /* No mount points found in tree */
805positive: 1094positive:
806 spin_unlock(&dcache_lock); 1095 if (!locked && read_seqretry(&rename_lock, seq))
1096 goto rename_retry;
1097 if (locked)
1098 write_sequnlock(&rename_lock);
807 return 1; 1099 return 1;
1100
1101rename_retry:
1102 locked = 1;
1103 write_seqlock(&rename_lock);
1104 goto again;
808} 1105}
809EXPORT_SYMBOL(have_submounts); 1106EXPORT_SYMBOL(have_submounts);
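
have_submounts() above walks the tree under the rename_lock sequence counter: the first pass is lockless, and only if a concurrent rename invalidated it does the walk retry while holding the write side, which guarantees termination. Below is a rough user-space sketch of that read-then-escalate idiom built on a hand-rolled C11 sequence counter; it is a simplified stand-in, not the kernel's seqlock API.

/* Illustrative sketch (not kernel code): read shared data under a sequence
 * counter; if a writer raced with the read, redo it holding the writer lock
 * so the second pass cannot be invalidated. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_uint tree_seq;            /* even: stable, odd: writer active */
static pthread_mutex_t tree_write_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic int tree_data;           /* stand-in for the dentry tree */

static void tree_update(int v)
{
        pthread_mutex_lock(&tree_write_lock);
        atomic_fetch_add(&tree_seq, 1); /* readers now see an odd count */
        tree_data = v;
        atomic_fetch_add(&tree_seq, 1); /* even again: data stable */
        pthread_mutex_unlock(&tree_write_lock);
}

static int tree_walk(void)
{
        unsigned seq;
        int locked = 0, result;
again:
        while ((seq = atomic_load(&tree_seq)) & 1)
                ;                       /* writer in progress, wait it out */
        result = tree_data;             /* the (possibly racy) walk */
        if (!locked && atomic_load(&tree_seq) != seq) {
                /* a rename-style update raced with us: escalate and redo,
                 * which guarantees forward progress */
                locked = 1;
                pthread_mutex_lock(&tree_write_lock);
                goto again;
        }
        if (locked)
                pthread_mutex_unlock(&tree_write_lock);
        return result;
}

int main(void)
{
        tree_update(42);
        printf("%d\n", tree_walk());    /* prints 42 */
        return 0;
}
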
810 1107
@@ -824,11 +1121,16 @@ EXPORT_SYMBOL(have_submounts);
824 */ 1121 */
825static int select_parent(struct dentry * parent) 1122static int select_parent(struct dentry * parent)
826{ 1123{
827 struct dentry *this_parent = parent; 1124 struct dentry *this_parent;
828 struct list_head *next; 1125 struct list_head *next;
1126 unsigned seq;
829 int found = 0; 1127 int found = 0;
1128 int locked = 0;
830 1129
831 spin_lock(&dcache_lock); 1130 seq = read_seqbegin(&rename_lock);
1131again:
1132 this_parent = parent;
1133 spin_lock(&this_parent->d_lock);
832repeat: 1134repeat:
833 next = this_parent->d_subdirs.next; 1135 next = this_parent->d_subdirs.next;
834resume: 1136resume:
@@ -837,11 +1139,13 @@ resume:
837 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 1139 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
838 next = tmp->next; 1140 next = tmp->next;
839 1141
1142 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1143
840 /* 1144 /*
841 * move only zero ref count dentries to the end 1145 * move only zero ref count dentries to the end
842 * of the unused list for prune_dcache 1146 * of the unused list for prune_dcache
843 */ 1147 */
844 if (!atomic_read(&dentry->d_count)) { 1148 if (!dentry->d_count) {
845 dentry_lru_move_tail(dentry); 1149 dentry_lru_move_tail(dentry);
846 found++; 1150 found++;
847 } else { 1151 } else {
@@ -853,28 +1157,63 @@ resume:
853 * ensures forward progress). We'll be coming back to find 1157 * ensures forward progress). We'll be coming back to find
854 * the rest. 1158 * the rest.
855 */ 1159 */
856 if (found && need_resched()) 1160 if (found && need_resched()) {
1161 spin_unlock(&dentry->d_lock);
857 goto out; 1162 goto out;
1163 }
858 1164
859 /* 1165 /*
860 * Descend a level if the d_subdirs list is non-empty. 1166 * Descend a level if the d_subdirs list is non-empty.
861 */ 1167 */
862 if (!list_empty(&dentry->d_subdirs)) { 1168 if (!list_empty(&dentry->d_subdirs)) {
1169 spin_unlock(&this_parent->d_lock);
1170 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
863 this_parent = dentry; 1171 this_parent = dentry;
1172 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
864 goto repeat; 1173 goto repeat;
865 } 1174 }
1175
1176 spin_unlock(&dentry->d_lock);
866 } 1177 }
867 /* 1178 /*
868 * All done at this level ... ascend and resume the search. 1179 * All done at this level ... ascend and resume the search.
869 */ 1180 */
870 if (this_parent != parent) { 1181 if (this_parent != parent) {
871 next = this_parent->d_u.d_child.next; 1182 struct dentry *tmp;
872 this_parent = this_parent->d_parent; 1183 struct dentry *child;
1184
1185 tmp = this_parent->d_parent;
1186 rcu_read_lock();
1187 spin_unlock(&this_parent->d_lock);
1188 child = this_parent;
1189 this_parent = tmp;
1190 spin_lock(&this_parent->d_lock);
1191 /* might go back up the wrong parent if we have had a rename
1192 * or deletion */
1193 if (this_parent != child->d_parent ||
1194 (!locked && read_seqretry(&rename_lock, seq))) {
1195 spin_unlock(&this_parent->d_lock);
1196 rcu_read_unlock();
1197 goto rename_retry;
1198 }
1199 rcu_read_unlock();
1200 next = child->d_u.d_child.next;
873 goto resume; 1201 goto resume;
874 } 1202 }
875out: 1203out:
876 spin_unlock(&dcache_lock); 1204 spin_unlock(&this_parent->d_lock);
1205 if (!locked && read_seqretry(&rename_lock, seq))
1206 goto rename_retry;
1207 if (locked)
1208 write_sequnlock(&rename_lock);
877 return found; 1209 return found;
1210
1211rename_retry:
1212 if (found)
1213 return found;
1214 locked = 1;
1215 write_seqlock(&rename_lock);
1216 goto again;
878} 1217}
879 1218
880/** 1219/**
@@ -908,16 +1247,13 @@ EXPORT_SYMBOL(shrink_dcache_parent);
908 */ 1247 */
909static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1248static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
910{ 1249{
911 int nr_unused;
912
913 if (nr) { 1250 if (nr) {
914 if (!(gfp_mask & __GFP_FS)) 1251 if (!(gfp_mask & __GFP_FS))
915 return -1; 1252 return -1;
916 prune_dcache(nr); 1253 prune_dcache(nr);
917 } 1254 }
918 1255
919 nr_unused = percpu_counter_sum_positive(&nr_dentry_unused); 1256 return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
920 return (nr_unused / 100) * sysctl_vfs_cache_pressure;
921} 1257}
922 1258
923static struct shrinker dcache_shrinker = { 1259static struct shrinker dcache_shrinker = {
@@ -960,38 +1296,52 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
960 memcpy(dname, name->name, name->len); 1296 memcpy(dname, name->name, name->len);
961 dname[name->len] = 0; 1297 dname[name->len] = 0;
962 1298
963 atomic_set(&dentry->d_count, 1); 1299 dentry->d_count = 1;
964 dentry->d_flags = DCACHE_UNHASHED; 1300 dentry->d_flags = DCACHE_UNHASHED;
965 spin_lock_init(&dentry->d_lock); 1301 spin_lock_init(&dentry->d_lock);
1302 seqcount_init(&dentry->d_seq);
966 dentry->d_inode = NULL; 1303 dentry->d_inode = NULL;
967 dentry->d_parent = NULL; 1304 dentry->d_parent = NULL;
968 dentry->d_sb = NULL; 1305 dentry->d_sb = NULL;
969 dentry->d_op = NULL; 1306 dentry->d_op = NULL;
970 dentry->d_fsdata = NULL; 1307 dentry->d_fsdata = NULL;
971 dentry->d_mounted = 0; 1308 INIT_HLIST_BL_NODE(&dentry->d_hash);
972 INIT_HLIST_NODE(&dentry->d_hash);
973 INIT_LIST_HEAD(&dentry->d_lru); 1309 INIT_LIST_HEAD(&dentry->d_lru);
974 INIT_LIST_HEAD(&dentry->d_subdirs); 1310 INIT_LIST_HEAD(&dentry->d_subdirs);
975 INIT_LIST_HEAD(&dentry->d_alias); 1311 INIT_LIST_HEAD(&dentry->d_alias);
1312 INIT_LIST_HEAD(&dentry->d_u.d_child);
976 1313
977 if (parent) { 1314 if (parent) {
978 dentry->d_parent = dget(parent); 1315 spin_lock(&parent->d_lock);
1316 /*
1317 * don't need child lock because it is not subject
1318 * to concurrency here
1319 */
1320 __dget_dlock(parent);
1321 dentry->d_parent = parent;
979 dentry->d_sb = parent->d_sb; 1322 dentry->d_sb = parent->d_sb;
980 } else {
981 INIT_LIST_HEAD(&dentry->d_u.d_child);
982 }
983
984 spin_lock(&dcache_lock);
985 if (parent)
986 list_add(&dentry->d_u.d_child, &parent->d_subdirs); 1323 list_add(&dentry->d_u.d_child, &parent->d_subdirs);
987 spin_unlock(&dcache_lock); 1324 spin_unlock(&parent->d_lock);
1325 }
988 1326
989 percpu_counter_inc(&nr_dentry); 1327 this_cpu_inc(nr_dentry);
990 1328
991 return dentry; 1329 return dentry;
992} 1330}
993EXPORT_SYMBOL(d_alloc); 1331EXPORT_SYMBOL(d_alloc);
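
d_alloc() now accounts new dentries with this_cpu_inc(nr_dentry) rather than a shared counter taken under a global lock; the per-CPU contributions are only folded together when the statistic is actually read. A user-space approximation with padded per-slot counters is sketched below (invented names, assumed one-slot-per-CPU mapping).

/* Illustrative sketch (not kernel code): increment a private, padded slot on
 * the hot path; sum all slots only when the value is read. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 64                     /* assumed: one slot per CPU/thread */

static struct counter_slot {
        atomic_long count;
        char pad[64];                   /* crude padding against false sharing */
} slots[NR_SLOTS];

static void counter_inc(unsigned cpu)
{
        /* relaxed ordering: this is a statistic, not a synchronisation point */
        atomic_fetch_add_explicit(&slots[cpu % NR_SLOTS].count, 1,
                                  memory_order_relaxed);
}

static long counter_sum(void)
{
        long sum = 0;
        for (int i = 0; i < NR_SLOTS; i++)
                sum += atomic_load_explicit(&slots[i].count,
                                            memory_order_relaxed);
        return sum;
}

int main(void)
{
        counter_inc(0);
        counter_inc(1);
        printf("%ld\n", counter_sum()); /* prints 2 */
        return 0;
}
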
994 1332
1333struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
1334{
1335 struct dentry *dentry = d_alloc(NULL, name);
1336 if (dentry) {
1337 dentry->d_sb = sb;
1338 dentry->d_parent = dentry;
1339 dentry->d_flags |= DCACHE_DISCONNECTED;
1340 }
1341 return dentry;
1342}
1343EXPORT_SYMBOL(d_alloc_pseudo);
1344
995struct dentry *d_alloc_name(struct dentry *parent, const char *name) 1345struct dentry *d_alloc_name(struct dentry *parent, const char *name)
996{ 1346{
997 struct qstr q; 1347 struct qstr q;
@@ -1003,12 +1353,36 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
1003} 1353}
1004EXPORT_SYMBOL(d_alloc_name); 1354EXPORT_SYMBOL(d_alloc_name);
1005 1355
1006/* the caller must hold dcache_lock */ 1356void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
1357{
1358 BUG_ON(dentry->d_op);
1359 BUG_ON(dentry->d_flags & (DCACHE_OP_HASH |
1360 DCACHE_OP_COMPARE |
1361 DCACHE_OP_REVALIDATE |
1362 DCACHE_OP_DELETE ));
1363 dentry->d_op = op;
1364 if (!op)
1365 return;
1366 if (op->d_hash)
1367 dentry->d_flags |= DCACHE_OP_HASH;
1368 if (op->d_compare)
1369 dentry->d_flags |= DCACHE_OP_COMPARE;
1370 if (op->d_revalidate)
1371 dentry->d_flags |= DCACHE_OP_REVALIDATE;
1372 if (op->d_delete)
1373 dentry->d_flags |= DCACHE_OP_DELETE;
1374
1375}
1376EXPORT_SYMBOL(d_set_d_op);
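
d_set_d_op() mirrors which dentry_operations callbacks exist into DCACHE_OP_* bits, so hot paths such as the RCU-walk lookup added later in this diff can test a flags word they already hold instead of dereferencing d_op. A small stand-alone sketch of that capability-bit caching pattern follows; the types and names are invented for illustration.

/* Illustrative sketch (not kernel code): record once, at setup time, which
 * optional callbacks an object provides, so fast paths test flag bits rather
 * than chasing the ops pointer. */
#include <stdio.h>
#include <string.h>

#define OP_HASH       0x01
#define OP_COMPARE    0x02
#define OP_REVALIDATE 0x04
#define OP_DELETE     0x08

struct name_ops {
        unsigned (*hash)(const char *name);
        int (*compare)(const char *a, const char *b);
        int (*revalidate)(void *obj);
        int (*delete_hint)(void *obj);
};

struct object {
        unsigned int flags;
        const struct name_ops *ops;
};

static void set_ops(struct object *o, const struct name_ops *ops)
{
        o->ops = ops;
        if (!ops)
                return;
        if (ops->hash)
                o->flags |= OP_HASH;
        if (ops->compare)
                o->flags |= OP_COMPARE;
        if (ops->revalidate)
                o->flags |= OP_REVALIDATE;
        if (ops->delete_hint)
                o->flags |= OP_DELETE;
}

static int names_differ(const struct object *o, const char *a, const char *b)
{
        /* fast path: one flag test, no o->ops dereference unless needed */
        if (o->flags & OP_COMPARE)
                return o->ops->compare(a, b);
        return strcmp(a, b) != 0;       /* default comparison */
}

int main(void)
{
        struct object o = { 0, NULL };

        set_ops(&o, NULL);
        printf("%d\n", names_differ(&o, "a", "a"));     /* prints 0 */
        return 0;
}
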
1377
1007static void __d_instantiate(struct dentry *dentry, struct inode *inode) 1378static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1008{ 1379{
1380 spin_lock(&dentry->d_lock);
1009 if (inode) 1381 if (inode)
1010 list_add(&dentry->d_alias, &inode->i_dentry); 1382 list_add(&dentry->d_alias, &inode->i_dentry);
1011 dentry->d_inode = inode; 1383 dentry->d_inode = inode;
1384 dentry_rcuwalk_barrier(dentry);
1385 spin_unlock(&dentry->d_lock);
1012 fsnotify_d_instantiate(dentry, inode); 1386 fsnotify_d_instantiate(dentry, inode);
1013} 1387}
1014 1388
@@ -1030,9 +1404,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1030void d_instantiate(struct dentry *entry, struct inode * inode) 1404void d_instantiate(struct dentry *entry, struct inode * inode)
1031{ 1405{
1032 BUG_ON(!list_empty(&entry->d_alias)); 1406 BUG_ON(!list_empty(&entry->d_alias));
1033 spin_lock(&dcache_lock); 1407 if (inode)
1408 spin_lock(&inode->i_lock);
1034 __d_instantiate(entry, inode); 1409 __d_instantiate(entry, inode);
1035 spin_unlock(&dcache_lock); 1410 if (inode)
1411 spin_unlock(&inode->i_lock);
1036 security_d_instantiate(entry, inode); 1412 security_d_instantiate(entry, inode);
1037} 1413}
1038EXPORT_SYMBOL(d_instantiate); 1414EXPORT_SYMBOL(d_instantiate);
@@ -1069,15 +1445,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
1069 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 1445 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
1070 struct qstr *qstr = &alias->d_name; 1446 struct qstr *qstr = &alias->d_name;
1071 1447
1448 /*
1449 * Don't need alias->d_lock here, because aliases with
1450 * d_parent == entry->d_parent are not subject to name or
1451 * parent changes, because the parent inode i_mutex is held.
1452 */
1072 if (qstr->hash != hash) 1453 if (qstr->hash != hash)
1073 continue; 1454 continue;
1074 if (alias->d_parent != entry->d_parent) 1455 if (alias->d_parent != entry->d_parent)
1075 continue; 1456 continue;
1076 if (qstr->len != len) 1457 if (dentry_cmp(qstr->name, qstr->len, name, len))
1077 continue; 1458 continue;
1078 if (memcmp(qstr->name, name, len)) 1459 __dget(alias);
1079 continue;
1080 dget_locked(alias);
1081 return alias; 1460 return alias;
1082 } 1461 }
1083 1462
@@ -1091,9 +1470,11 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
1091 1470
1092 BUG_ON(!list_empty(&entry->d_alias)); 1471 BUG_ON(!list_empty(&entry->d_alias));
1093 1472
1094 spin_lock(&dcache_lock); 1473 if (inode)
1474 spin_lock(&inode->i_lock);
1095 result = __d_instantiate_unique(entry, inode); 1475 result = __d_instantiate_unique(entry, inode);
1096 spin_unlock(&dcache_lock); 1476 if (inode)
1477 spin_unlock(&inode->i_lock);
1097 1478
1098 if (!result) { 1479 if (!result) {
1099 security_d_instantiate(entry, inode); 1480 security_d_instantiate(entry, inode);
@@ -1134,14 +1515,6 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1134} 1515}
1135EXPORT_SYMBOL(d_alloc_root); 1516EXPORT_SYMBOL(d_alloc_root);
1136 1517
1137static inline struct hlist_head *d_hash(struct dentry *parent,
1138 unsigned long hash)
1139{
1140 hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES;
1141 hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS);
1142 return dentry_hashtable + (hash & D_HASHMASK);
1143}
1144
1145/** 1518/**
1146 * d_obtain_alias - find or allocate a dentry for a given inode 1519 * d_obtain_alias - find or allocate a dentry for a given inode
1147 * @inode: inode to allocate the dentry for 1520 * @inode: inode to allocate the dentry for
@@ -1182,10 +1555,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
1182 } 1555 }
1183 tmp->d_parent = tmp; /* make sure dput doesn't croak */ 1556 tmp->d_parent = tmp; /* make sure dput doesn't croak */
1184 1557
1185 spin_lock(&dcache_lock); 1558
1559 spin_lock(&inode->i_lock);
1186 res = __d_find_alias(inode, 0); 1560 res = __d_find_alias(inode, 0);
1187 if (res) { 1561 if (res) {
1188 spin_unlock(&dcache_lock); 1562 spin_unlock(&inode->i_lock);
1189 dput(tmp); 1563 dput(tmp);
1190 goto out_iput; 1564 goto out_iput;
1191 } 1565 }
@@ -1195,12 +1569,14 @@ struct dentry *d_obtain_alias(struct inode *inode)
1195 tmp->d_sb = inode->i_sb; 1569 tmp->d_sb = inode->i_sb;
1196 tmp->d_inode = inode; 1570 tmp->d_inode = inode;
1197 tmp->d_flags |= DCACHE_DISCONNECTED; 1571 tmp->d_flags |= DCACHE_DISCONNECTED;
1198 tmp->d_flags &= ~DCACHE_UNHASHED;
1199 list_add(&tmp->d_alias, &inode->i_dentry); 1572 list_add(&tmp->d_alias, &inode->i_dentry);
1200 hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); 1573 bit_spin_lock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1574 tmp->d_flags &= ~DCACHE_UNHASHED;
1575 hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
1576 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1201 spin_unlock(&tmp->d_lock); 1577 spin_unlock(&tmp->d_lock);
1578 spin_unlock(&inode->i_lock);
1202 1579
1203 spin_unlock(&dcache_lock);
1204 return tmp; 1580 return tmp;
1205 1581
1206 out_iput: 1582 out_iput:
@@ -1230,18 +1606,18 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1230 struct dentry *new = NULL; 1606 struct dentry *new = NULL;
1231 1607
1232 if (inode && S_ISDIR(inode->i_mode)) { 1608 if (inode && S_ISDIR(inode->i_mode)) {
1233 spin_lock(&dcache_lock); 1609 spin_lock(&inode->i_lock);
1234 new = __d_find_alias(inode, 1); 1610 new = __d_find_alias(inode, 1);
1235 if (new) { 1611 if (new) {
1236 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1612 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1237 spin_unlock(&dcache_lock); 1613 spin_unlock(&inode->i_lock);
1238 security_d_instantiate(new, inode); 1614 security_d_instantiate(new, inode);
1239 d_move(new, dentry); 1615 d_move(new, dentry);
1240 iput(inode); 1616 iput(inode);
1241 } else { 1617 } else {
1242 /* already taking dcache_lock, so d_add() by hand */ 1618 /* already taking inode->i_lock, so d_add() by hand */
1243 __d_instantiate(dentry, inode); 1619 __d_instantiate(dentry, inode);
1244 spin_unlock(&dcache_lock); 1620 spin_unlock(&inode->i_lock);
1245 security_d_instantiate(dentry, inode); 1621 security_d_instantiate(dentry, inode);
1246 d_rehash(dentry); 1622 d_rehash(dentry);
1247 } 1623 }
@@ -1314,10 +1690,10 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1314 * Negative dentry: instantiate it unless the inode is a directory and 1690 * Negative dentry: instantiate it unless the inode is a directory and
1315 * already has a dentry. 1691 * already has a dentry.
1316 */ 1692 */
1317 spin_lock(&dcache_lock); 1693 spin_lock(&inode->i_lock);
1318 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { 1694 if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
1319 __d_instantiate(found, inode); 1695 __d_instantiate(found, inode);
1320 spin_unlock(&dcache_lock); 1696 spin_unlock(&inode->i_lock);
1321 security_d_instantiate(found, inode); 1697 security_d_instantiate(found, inode);
1322 return found; 1698 return found;
1323 } 1699 }
@@ -1327,8 +1703,8 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1327 * reference to it, move it in place and use it. 1703 * reference to it, move it in place and use it.
1328 */ 1704 */
1329 new = list_entry(inode->i_dentry.next, struct dentry, d_alias); 1705 new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1330 dget_locked(new); 1706 __dget(new);
1331 spin_unlock(&dcache_lock); 1707 spin_unlock(&inode->i_lock);
1332 security_d_instantiate(found, inode); 1708 security_d_instantiate(found, inode);
1333 d_move(new, found); 1709 d_move(new, found);
1334 iput(inode); 1710 iput(inode);
@@ -1342,6 +1718,112 @@ err_out:
1342EXPORT_SYMBOL(d_add_ci); 1718EXPORT_SYMBOL(d_add_ci);
1343 1719
1344/** 1720/**
1721 * __d_lookup_rcu - search for a dentry (racy, store-free)
1722 * @parent: parent dentry
1723 * @name: qstr of name we wish to find
1724 * @seq: returns d_seq value at the point where the dentry was found
1725 * @inode: returns dentry->d_inode when the inode was found valid.
1726 * Returns: dentry, or NULL
1727 *
1728 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
1729 * resolution (store-free path walking) design described in
1730 * Documentation/filesystems/path-lookup.txt.
1731 *
1732 * This is not to be used outside core vfs.
1733 *
1734 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
1735 * held, and rcu_read_lock held. The returned dentry must not be stored into
1736 * without taking d_lock and checking d_seq sequence count against @seq
1737 * returned here.
1738 *
1739 * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
1740 * function.
1741 *
1742 * Alternatively, __d_lookup_rcu may be called again to look up the child of
1743 * the returned dentry, so long as its parent's seqlock is checked after the
1744 * child is looked up. Thus, an interlocking stepping of sequence lock checks
1745 * is formed, giving integrity down the path walk.
1746 */
1747struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1748 unsigned *seq, struct inode **inode)
1749{
1750 unsigned int len = name->len;
1751 unsigned int hash = name->hash;
1752 const unsigned char *str = name->name;
1753 struct dcache_hash_bucket *b = d_hash(parent, hash);
1754 struct hlist_bl_node *node;
1755 struct dentry *dentry;
1756
1757 /*
1758 * Note: There is significant duplication with __d_lookup_rcu which is
1759 * required to prevent single threaded performance regressions
1760 * especially on architectures where smp_rmb (in seqcounts) are costly.
1761 * Keep the two functions in sync.
1762 */
1763
1764 /*
1765 * The hash list is protected using RCU.
1766 *
1767 * Carefully use d_seq when comparing a candidate dentry, to avoid
1768 * races with d_move().
1769 *
1770 * It is possible that concurrent renames can mess up our list
1771 * walk here and result in missing our dentry, resulting in the
1772 * false-negative result. d_lookup() protects against concurrent
1773 * renames using rename_lock seqlock.
1774 *
1775 * See Documentation/vfs/dcache-locking.txt for more details.
1776 */
1777 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1778 struct inode *i;
1779 const char *tname;
1780 int tlen;
1781
1782 if (dentry->d_name.hash != hash)
1783 continue;
1784
1785seqretry:
1786 *seq = read_seqcount_begin(&dentry->d_seq);
1787 if (dentry->d_parent != parent)
1788 continue;
1789 if (d_unhashed(dentry))
1790 continue;
1791 tlen = dentry->d_name.len;
1792 tname = dentry->d_name.name;
1793 i = dentry->d_inode;
1794 prefetch(tname);
1795 if (i)
1796 prefetch(i);
1797 /*
1798 * This seqcount check is required to ensure name and
1799 * len are loaded atomically, so as not to walk off the
1800 * edge of memory when walking. If we could load this
1801 * atomically some other way, we could drop this check.
1802 */
1803 if (read_seqcount_retry(&dentry->d_seq, *seq))
1804 goto seqretry;
1805 if (parent->d_flags & DCACHE_OP_COMPARE) {
1806 if (parent->d_op->d_compare(parent, *inode,
1807 dentry, i,
1808 tlen, tname, name))
1809 continue;
1810 } else {
1811 if (dentry_cmp(tname, tlen, str, len))
1812 continue;
1813 }
1814 /*
1815 * No extra seqcount check is required after the name
1816 * compare. The caller must perform a seqcount check in
1817 * order to do anything useful with the returned dentry
1818 * anyway.
1819 */
1820 *inode = i;
1821 return dentry;
1822 }
1823 return NULL;
1824}
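
The comment on __d_lookup_rcu() above stresses that the returned dentry is only a candidate: before the caller may keep it, it must take d_lock and re-check d_seq against the value the lookup returned. A compressed user-space sketch of that validate-before-referencing step is below; the structure and helper are invented and merely stand in for the kernel's __d_rcu_to_refcount() discipline.

/* Illustrative sketch (not kernel code): 'e' and 'seq' came from a lockless
 * lookup; a reference may only be taken after locking the object and
 * confirming nothing changed since the lookup. */
#include <pthread.h>
#include <stdatomic.h>

struct entry {
        pthread_mutex_t lock;           /* like d_lock */
        atomic_uint seq;                /* like d_seq */
        int refcount;
};

static int entry_ref_if_unchanged(struct entry *e, unsigned seq)
{
        int ok;

        pthread_mutex_lock(&e->lock);
        ok = (atomic_load(&e->seq) == seq);
        if (ok)
                e->refcount++;          /* safe: locked and validated */
        pthread_mutex_unlock(&e->lock);
        return ok;                      /* on failure, caller retries the slow path */
}

int main(void)
{
        static struct entry e = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
        unsigned seq = atomic_load(&e.seq);     /* pretend this came from the lookup */

        return entry_ref_if_unchanged(&e, seq) ? 0 : 1;
}
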
1825
1826/**
1345 * d_lookup - search for a dentry 1827 * d_lookup - search for a dentry
1346 * @parent: parent dentry 1828 * @parent: parent dentry
1347 * @name: qstr of name we wish to find 1829 * @name: qstr of name we wish to find
@@ -1352,10 +1834,10 @@ EXPORT_SYMBOL(d_add_ci);
1352 * dentry is returned. The caller must use dput to free the entry when it has 1834 * dentry is returned. The caller must use dput to free the entry when it has
1353 * finished using it. %NULL is returned if the dentry does not exist. 1835 * finished using it. %NULL is returned if the dentry does not exist.
1354 */ 1836 */
1355struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1837struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
1356{ 1838{
1357 struct dentry * dentry = NULL; 1839 struct dentry *dentry;
1358 unsigned long seq; 1840 unsigned seq;
1359 1841
1360 do { 1842 do {
1361 seq = read_seqbegin(&rename_lock); 1843 seq = read_seqbegin(&rename_lock);
@@ -1367,7 +1849,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1367} 1849}
1368EXPORT_SYMBOL(d_lookup); 1850EXPORT_SYMBOL(d_lookup);
1369 1851
1370/* 1852/**
1371 * __d_lookup - search for a dentry (racy) 1853 * __d_lookup - search for a dentry (racy)
1372 * @parent: parent dentry 1854 * @parent: parent dentry
1373 * @name: qstr of name we wish to find 1855 * @name: qstr of name we wish to find
@@ -1382,17 +1864,24 @@ EXPORT_SYMBOL(d_lookup);
1382 * 1864 *
1383 * __d_lookup callers must be commented. 1865 * __d_lookup callers must be commented.
1384 */ 1866 */
1385struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1867struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1386{ 1868{
1387 unsigned int len = name->len; 1869 unsigned int len = name->len;
1388 unsigned int hash = name->hash; 1870 unsigned int hash = name->hash;
1389 const unsigned char *str = name->name; 1871 const unsigned char *str = name->name;
1390 struct hlist_head *head = d_hash(parent,hash); 1872 struct dcache_hash_bucket *b = d_hash(parent, hash);
1873 struct hlist_bl_node *node;
1391 struct dentry *found = NULL; 1874 struct dentry *found = NULL;
1392 struct hlist_node *node;
1393 struct dentry *dentry; 1875 struct dentry *dentry;
1394 1876
1395 /* 1877 /*
1878 * Note: There is significant duplication with __d_lookup_rcu which is
1879 * required to prevent single threaded performance regressions
1880 * especially on architectures where smp_rmb (in seqcounts) are costly.
1881 * Keep the two functions in sync.
1882 */
1883
1884 /*
1396 * The hash list is protected using RCU. 1885 * The hash list is protected using RCU.
1397 * 1886 *
1398 * Take d_lock when comparing a candidate dentry, to avoid races 1887 * Take d_lock when comparing a candidate dentry, to avoid races
@@ -1407,25 +1896,16 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1407 */ 1896 */
1408 rcu_read_lock(); 1897 rcu_read_lock();
1409 1898
1410 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1899 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1411 struct qstr *qstr; 1900 const char *tname;
1901 int tlen;
1412 1902
1413 if (dentry->d_name.hash != hash) 1903 if (dentry->d_name.hash != hash)
1414 continue; 1904 continue;
1415 if (dentry->d_parent != parent)
1416 continue;
1417 1905
1418 spin_lock(&dentry->d_lock); 1906 spin_lock(&dentry->d_lock);
1419
1420 /*
1421 * Recheck the dentry after taking the lock - d_move may have
1422 * changed things. Don't bother checking the hash because
1423 * we're about to compare the whole name anyway.
1424 */
1425 if (dentry->d_parent != parent) 1907 if (dentry->d_parent != parent)
1426 goto next; 1908 goto next;
1427
1428 /* non-existing due to RCU? */
1429 if (d_unhashed(dentry)) 1909 if (d_unhashed(dentry))
1430 goto next; 1910 goto next;
1431 1911
@@ -1433,18 +1913,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1433 * It is safe to compare names since d_move() cannot 1913 * It is safe to compare names since d_move() cannot
1434 * change the qstr (protected by d_lock). 1914 * change the qstr (protected by d_lock).
1435 */ 1915 */
1436 qstr = &dentry->d_name; 1916 tlen = dentry->d_name.len;
1437 if (parent->d_op && parent->d_op->d_compare) { 1917 tname = dentry->d_name.name;
1438 if (parent->d_op->d_compare(parent, qstr, name)) 1918 if (parent->d_flags & DCACHE_OP_COMPARE) {
1919 if (parent->d_op->d_compare(parent, parent->d_inode,
1920 dentry, dentry->d_inode,
1921 tlen, tname, name))
1439 goto next; 1922 goto next;
1440 } else { 1923 } else {
1441 if (qstr->len != len) 1924 if (dentry_cmp(tname, tlen, str, len))
1442 goto next;
1443 if (memcmp(qstr->name, str, len))
1444 goto next; 1925 goto next;
1445 } 1926 }
1446 1927
1447 atomic_inc(&dentry->d_count); 1928 dentry->d_count++;
1448 found = dentry; 1929 found = dentry;
1449 spin_unlock(&dentry->d_lock); 1930 spin_unlock(&dentry->d_lock);
1450 break; 1931 break;
@@ -1473,8 +1954,8 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1473 * routine may choose to leave the hash value unchanged. 1954 * routine may choose to leave the hash value unchanged.
1474 */ 1955 */
1475 name->hash = full_name_hash(name->name, name->len); 1956 name->hash = full_name_hash(name->name, name->len);
1476 if (dir->d_op && dir->d_op->d_hash) { 1957 if (dir->d_flags & DCACHE_OP_HASH) {
1477 if (dir->d_op->d_hash(dir, name) < 0) 1958 if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
1478 goto out; 1959 goto out;
1479 } 1960 }
1480 dentry = d_lookup(dir, name); 1961 dentry = d_lookup(dir, name);
@@ -1483,34 +1964,32 @@ out:
1483} 1964}
1484 1965
1485/** 1966/**
1486 * d_validate - verify dentry provided from insecure source 1967 * d_validate - verify dentry provided from insecure source (deprecated)
1487 * @dentry: The dentry alleged to be valid child of @dparent 1968 * @dentry: The dentry alleged to be valid child of @dparent
1488 * @dparent: The parent dentry (known to be valid) 1969 * @dparent: The parent dentry (known to be valid)
1489 * 1970 *
1490 * An insecure source has sent us a dentry, here we verify it and dget() it. 1971 * An insecure source has sent us a dentry, here we verify it and dget() it.
1491 * This is used by ncpfs in its readdir implementation. 1972 * This is used by ncpfs in its readdir implementation.
 1492 * Zero is returned if the dentry is invalid. 1973 * Zero is returned if the dentry is invalid.
1974 *
1975 * This function is slow for big directories, and deprecated, do not use it.
1493 */ 1976 */
1494int d_validate(struct dentry *dentry, struct dentry *parent) 1977int d_validate(struct dentry *dentry, struct dentry *dparent)
1495{ 1978{
1496 struct hlist_head *head = d_hash(parent, dentry->d_name.hash); 1979 struct dentry *child;
1497 struct hlist_node *node;
1498 struct dentry *d;
1499
1500 /* Check whether the ptr might be valid at all.. */
1501 if (!kmem_ptr_validate(dentry_cache, dentry))
1502 return 0;
1503 if (dentry->d_parent != parent)
1504 return 0;
1505 1980
1506 rcu_read_lock(); 1981 spin_lock(&dparent->d_lock);
1507 hlist_for_each_entry_rcu(d, node, head, d_hash) { 1982 list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) {
1508 if (d == dentry) { 1983 if (dentry == child) {
1509 dget(dentry); 1984 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1985 __dget_dlock(dentry);
1986 spin_unlock(&dentry->d_lock);
1987 spin_unlock(&dparent->d_lock);
1510 return 1; 1988 return 1;
1511 } 1989 }
1512 } 1990 }
1513 rcu_read_unlock(); 1991 spin_unlock(&dparent->d_lock);
1992
1514 return 0; 1993 return 0;
1515} 1994}
1516EXPORT_SYMBOL(d_validate); 1995EXPORT_SYMBOL(d_validate);
@@ -1538,16 +2017,23 @@ EXPORT_SYMBOL(d_validate);
1538 2017
1539void d_delete(struct dentry * dentry) 2018void d_delete(struct dentry * dentry)
1540{ 2019{
2020 struct inode *inode;
1541 int isdir = 0; 2021 int isdir = 0;
1542 /* 2022 /*
1543 * Are we the only user? 2023 * Are we the only user?
1544 */ 2024 */
1545 spin_lock(&dcache_lock); 2025again:
1546 spin_lock(&dentry->d_lock); 2026 spin_lock(&dentry->d_lock);
1547 isdir = S_ISDIR(dentry->d_inode->i_mode); 2027 inode = dentry->d_inode;
1548 if (atomic_read(&dentry->d_count) == 1) { 2028 isdir = S_ISDIR(inode->i_mode);
2029 if (dentry->d_count == 1) {
2030 if (inode && !spin_trylock(&inode->i_lock)) {
2031 spin_unlock(&dentry->d_lock);
2032 cpu_relax();
2033 goto again;
2034 }
1549 dentry->d_flags &= ~DCACHE_CANT_MOUNT; 2035 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1550 dentry_iput(dentry); 2036 dentry_unlink_inode(dentry);
1551 fsnotify_nameremove(dentry, isdir); 2037 fsnotify_nameremove(dentry, isdir);
1552 return; 2038 return;
1553 } 2039 }
@@ -1556,17 +2042,18 @@ void d_delete(struct dentry * dentry)
1556 __d_drop(dentry); 2042 __d_drop(dentry);
1557 2043
1558 spin_unlock(&dentry->d_lock); 2044 spin_unlock(&dentry->d_lock);
1559 spin_unlock(&dcache_lock);
1560 2045
1561 fsnotify_nameremove(dentry, isdir); 2046 fsnotify_nameremove(dentry, isdir);
1562} 2047}
1563EXPORT_SYMBOL(d_delete); 2048EXPORT_SYMBOL(d_delete);
1564 2049
1565static void __d_rehash(struct dentry * entry, struct hlist_head *list) 2050static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b)
1566{ 2051{
1567 2052 BUG_ON(!d_unhashed(entry));
2053 spin_lock_bucket(b);
1568 entry->d_flags &= ~DCACHE_UNHASHED; 2054 entry->d_flags &= ~DCACHE_UNHASHED;
1569 hlist_add_head_rcu(&entry->d_hash, list); 2055 hlist_bl_add_head_rcu(&entry->d_hash, &b->head);
2056 spin_unlock_bucket(b);
1570} 2057}
1571 2058
1572static void _d_rehash(struct dentry * entry) 2059static void _d_rehash(struct dentry * entry)
@@ -1583,25 +2070,39 @@ static void _d_rehash(struct dentry * entry)
1583 2070
1584void d_rehash(struct dentry * entry) 2071void d_rehash(struct dentry * entry)
1585{ 2072{
1586 spin_lock(&dcache_lock);
1587 spin_lock(&entry->d_lock); 2073 spin_lock(&entry->d_lock);
1588 _d_rehash(entry); 2074 _d_rehash(entry);
1589 spin_unlock(&entry->d_lock); 2075 spin_unlock(&entry->d_lock);
1590 spin_unlock(&dcache_lock);
1591} 2076}
1592EXPORT_SYMBOL(d_rehash); 2077EXPORT_SYMBOL(d_rehash);
1593 2078
1594/* 2079/**
1595 * When switching names, the actual string doesn't strictly have to 2080 * dentry_update_name_case - update case insensitive dentry with a new name
1596 * be preserved in the target - because we're dropping the target 2081 * @dentry: dentry to be updated
1597 * anyway. As such, we can just do a simple memcpy() to copy over 2082 * @name: new name
1598 * the new name before we switch.
1599 * 2083 *
1600 * Note that we have to be a lot more careful about getting the hash 2084 * Update a case insensitive dentry with new case of name.
1601 * switched - we have to switch the hash value properly even if it 2085 *
1602 * then no longer matches the actual (corrupted) string of the target. 2086 * dentry must have been returned by d_lookup with name @name. Old and new
1603 * The hash value has to match the hash queue that the dentry is on.. 2087 * name lengths must match (ie. no d_compare which allows mismatched name
2088 * lengths).
2089 *
2090 * Parent inode i_mutex must be held over d_lookup and into this call (to
2091 * keep renames and concurrent inserts, and readdir(2) away).
1604 */ 2092 */
2093void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2094{
2095 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2096 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2097
2098 spin_lock(&dentry->d_lock);
2099 write_seqcount_begin(&dentry->d_seq);
2100 memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
2101 write_seqcount_end(&dentry->d_seq);
2102 spin_unlock(&dentry->d_lock);
2103}
2104EXPORT_SYMBOL(dentry_update_name_case);
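
dentry_update_name_case() shows the write side of the same d_seq protocol: the in-place name change is bracketed by write_seqcount_begin()/end() under d_lock, so a lockless reader that raced with it sees a changed sequence count and retries. A matching user-space sketch, the counterpart of the validation sketch after __d_lookup_rcu above, with invented names:

/* Illustrative sketch (not kernel code): make the sequence count odd while
 * the name is rewritten in place, so concurrent lockless readers notice the
 * update and retry. */
#include <pthread.h>
#include <stdatomic.h>
#include <string.h>

struct entry {
        pthread_mutex_t lock;           /* like d_lock */
        atomic_uint seq;                /* odd while an update is in flight */
        char name[32];
};

static void entry_set_name(struct entry *e, const char *name)
{
        pthread_mutex_lock(&e->lock);
        atomic_fetch_add(&e->seq, 1);   /* like write_seqcount_begin() */
        strncpy(e->name, name, sizeof(e->name) - 1);
        e->name[sizeof(e->name) - 1] = '\0';
        atomic_fetch_add(&e->seq, 1);   /* like write_seqcount_end() */
        pthread_mutex_unlock(&e->lock);
}

int main(void)
{
        static struct entry e = { PTHREAD_MUTEX_INITIALIZER, 0, "" };

        entry_set_name(&e, "newname");
        return 0;
}
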
2105
1605static void switch_names(struct dentry *dentry, struct dentry *target) 2106static void switch_names(struct dentry *dentry, struct dentry *target)
1606{ 2107{
1607 if (dname_external(target)) { 2108 if (dname_external(target)) {
@@ -1643,54 +2144,84 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1643 swap(dentry->d_name.len, target->d_name.len); 2144 swap(dentry->d_name.len, target->d_name.len);
1644} 2145}
1645 2146
2147static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2148{
2149 /*
2150 * XXXX: do we really need to take target->d_lock?
2151 */
2152 if (IS_ROOT(dentry) || dentry->d_parent == target->d_parent)
2153 spin_lock(&target->d_parent->d_lock);
2154 else {
2155 if (d_ancestor(dentry->d_parent, target->d_parent)) {
2156 spin_lock(&dentry->d_parent->d_lock);
2157 spin_lock_nested(&target->d_parent->d_lock,
2158 DENTRY_D_LOCK_NESTED);
2159 } else {
2160 spin_lock(&target->d_parent->d_lock);
2161 spin_lock_nested(&dentry->d_parent->d_lock,
2162 DENTRY_D_LOCK_NESTED);
2163 }
2164 }
2165 if (target < dentry) {
2166 spin_lock_nested(&target->d_lock, 2);
2167 spin_lock_nested(&dentry->d_lock, 3);
2168 } else {
2169 spin_lock_nested(&dentry->d_lock, 2);
2170 spin_lock_nested(&target->d_lock, 3);
2171 }
2172}
2173
2174static void dentry_unlock_parents_for_move(struct dentry *dentry,
2175 struct dentry *target)
2176{
2177 if (target->d_parent != dentry->d_parent)
2178 spin_unlock(&dentry->d_parent->d_lock);
2179 if (target->d_parent != target)
2180 spin_unlock(&target->d_parent->d_lock);
2181}
2182
1646/* 2183/*
1647 * We cannibalize "target" when moving dentry on top of it, 2184 * When switching names, the actual string doesn't strictly have to
1648 * because it's going to be thrown away anyway. We could be more 2185 * be preserved in the target - because we're dropping the target
1649 * polite about it, though. 2186 * anyway. As such, we can just do a simple memcpy() to copy over
1650 * 2187 * the new name before we switch.
1651 * This forceful removal will result in ugly /proc output if 2188 *
1652 * somebody holds a file open that got deleted due to a rename. 2189 * Note that we have to be a lot more careful about getting the hash
1653 * We could be nicer about the deleted file, and let it show 2190 * switched - we have to switch the hash value properly even if it
1654 * up under the name it had before it was deleted rather than 2191 * then no longer matches the actual (corrupted) string of the target.
1655 * under the original name of the file that was moved on top of it. 2192 * The hash value has to match the hash queue that the dentry is on..
1656 */ 2193 */
1657
1658/* 2194/*
1659 * d_move_locked - move a dentry 2195 * d_move - move a dentry
1660 * @dentry: entry to move 2196 * @dentry: entry to move
1661 * @target: new dentry 2197 * @target: new dentry
1662 * 2198 *
1663 * Update the dcache to reflect the move of a file name. Negative 2199 * Update the dcache to reflect the move of a file name. Negative
1664 * dcache entries should not be moved in this way. 2200 * dcache entries should not be moved in this way.
1665 */ 2201 */
1666static void d_move_locked(struct dentry * dentry, struct dentry * target) 2202void d_move(struct dentry * dentry, struct dentry * target)
1667{ 2203{
1668 struct hlist_head *list;
1669
1670 if (!dentry->d_inode) 2204 if (!dentry->d_inode)
1671 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2205 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
1672 2206
2207 BUG_ON(d_ancestor(dentry, target));
2208 BUG_ON(d_ancestor(target, dentry));
2209
1673 write_seqlock(&rename_lock); 2210 write_seqlock(&rename_lock);
1674 /*
1675 * XXXX: do we really need to take target->d_lock?
1676 */
1677 if (target < dentry) {
1678 spin_lock(&target->d_lock);
1679 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
1680 } else {
1681 spin_lock(&dentry->d_lock);
1682 spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED);
1683 }
1684 2211
1685 /* Move the dentry to the target hash queue, if on different bucket */ 2212 dentry_lock_for_move(dentry, target);
1686 if (d_unhashed(dentry))
1687 goto already_unhashed;
1688 2213
1689 hlist_del_rcu(&dentry->d_hash); 2214 write_seqcount_begin(&dentry->d_seq);
2215 write_seqcount_begin(&target->d_seq);
1690 2216
1691already_unhashed: 2217 /* __d_drop does write_seqcount_barrier, but they're OK to nest. */
1692 list = d_hash(target->d_parent, target->d_name.hash); 2218
1693 __d_rehash(dentry, list); 2219 /*
2220 * Move the dentry to the target hash queue. Don't bother checking
2221 * for the same hash queue because of how unlikely it is.
2222 */
2223 __d_drop(dentry);
2224 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
1694 2225
1695 /* Unhash the target: dput() will then get rid of it */ 2226 /* Unhash the target: dput() will then get rid of it */
1696 __d_drop(target); 2227 __d_drop(target);
@@ -1715,27 +2246,16 @@ already_unhashed:
1715 } 2246 }
1716 2247
1717 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); 2248 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2249
2250 write_seqcount_end(&target->d_seq);
2251 write_seqcount_end(&dentry->d_seq);
2252
2253 dentry_unlock_parents_for_move(dentry, target);
1718 spin_unlock(&target->d_lock); 2254 spin_unlock(&target->d_lock);
1719 fsnotify_d_move(dentry); 2255 fsnotify_d_move(dentry);
1720 spin_unlock(&dentry->d_lock); 2256 spin_unlock(&dentry->d_lock);
1721 write_sequnlock(&rename_lock); 2257 write_sequnlock(&rename_lock);
1722} 2258}
1723
1724/**
1725 * d_move - move a dentry
1726 * @dentry: entry to move
1727 * @target: new dentry
1728 *
1729 * Update the dcache to reflect the move of a file name. Negative
1730 * dcache entries should not be moved in this way.
1731 */
1732
1733void d_move(struct dentry * dentry, struct dentry * target)
1734{
1735 spin_lock(&dcache_lock);
1736 d_move_locked(dentry, target);
1737 spin_unlock(&dcache_lock);
1738}
1739EXPORT_SYMBOL(d_move); 2259EXPORT_SYMBOL(d_move);
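
dentry_lock_for_move() above orders the locks it needs: the two parents first (nested by ancestry), then the two dentries by pointer address, so concurrent d_move() calls on the same pair can never acquire them in opposite order. The ancestry handling is kernel-specific; the address-ordering half is sketched below in plain pthreads with illustrative names.

/* Illustrative sketch (not kernel code): when two distinct objects must both
 * be locked, acquire them in one global order (here, by address) so callers
 * working on the same pair cannot deadlock. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct node {
        pthread_mutex_t lock;
        int value;
};

static void lock_pair(struct node *a, struct node *b)
{
        if ((uintptr_t)a < (uintptr_t)b) {      /* pointer order is the global order */
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void unlock_pair(struct node *a, struct node *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

static void swap_values(struct node *a, struct node *b)
{
        lock_pair(a, b);                /* assumes a != b */
        int tmp = a->value;
        a->value = b->value;
        b->value = tmp;
        unlock_pair(a, b);
}

int main(void)
{
        static struct node x = { PTHREAD_MUTEX_INITIALIZER, 1 };
        static struct node y = { PTHREAD_MUTEX_INITIALIZER, 2 };

        swap_values(&x, &y);
        printf("%d %d\n", x.value, y.value);    /* prints 2 1 */
        return 0;
}
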
1740 2260
1741/** 2261/**
@@ -1761,13 +2281,13 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
1761 * This helper attempts to cope with remotely renamed directories 2281 * This helper attempts to cope with remotely renamed directories
1762 * 2282 *
1763 * It assumes that the caller is already holding 2283 * It assumes that the caller is already holding
1764 * dentry->d_parent->d_inode->i_mutex and the dcache_lock 2284 * dentry->d_parent->d_inode->i_mutex and the inode->i_lock
1765 * 2285 *
1766 * Note: If ever the locking in lock_rename() changes, then please 2286 * Note: If ever the locking in lock_rename() changes, then please
1767 * remember to update this too... 2287 * remember to update this too...
1768 */ 2288 */
1769static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) 2289static struct dentry *__d_unalias(struct inode *inode,
1770 __releases(dcache_lock) 2290 struct dentry *dentry, struct dentry *alias)
1771{ 2291{
1772 struct mutex *m1 = NULL, *m2 = NULL; 2292 struct mutex *m1 = NULL, *m2 = NULL;
1773 struct dentry *ret; 2293 struct dentry *ret;
@@ -1790,10 +2310,10 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
1790 goto out_err; 2310 goto out_err;
1791 m2 = &alias->d_parent->d_inode->i_mutex; 2311 m2 = &alias->d_parent->d_inode->i_mutex;
1792out_unalias: 2312out_unalias:
1793 d_move_locked(alias, dentry); 2313 d_move(alias, dentry);
1794 ret = alias; 2314 ret = alias;
1795out_err: 2315out_err:
1796 spin_unlock(&dcache_lock); 2316 spin_unlock(&inode->i_lock);
1797 if (m2) 2317 if (m2)
1798 mutex_unlock(m2); 2318 mutex_unlock(m2);
1799 if (m1) 2319 if (m1)
@@ -1804,17 +2324,23 @@ out_err:
1804/* 2324/*
1805 * Prepare an anonymous dentry for life in the superblock's dentry tree as a 2325 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
1806 * named dentry in place of the dentry to be replaced. 2326 * named dentry in place of the dentry to be replaced.
2327 * returns with anon->d_lock held!
1807 */ 2328 */
1808static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) 2329static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1809{ 2330{
1810 struct dentry *dparent, *aparent; 2331 struct dentry *dparent, *aparent;
1811 2332
1812 switch_names(dentry, anon); 2333 dentry_lock_for_move(anon, dentry);
1813 swap(dentry->d_name.hash, anon->d_name.hash); 2334
2335 write_seqcount_begin(&dentry->d_seq);
2336 write_seqcount_begin(&anon->d_seq);
1814 2337
1815 dparent = dentry->d_parent; 2338 dparent = dentry->d_parent;
1816 aparent = anon->d_parent; 2339 aparent = anon->d_parent;
1817 2340
2341 switch_names(dentry, anon);
2342 swap(dentry->d_name.hash, anon->d_name.hash);
2343
1818 dentry->d_parent = (aparent == anon) ? dentry : aparent; 2344 dentry->d_parent = (aparent == anon) ? dentry : aparent;
1819 list_del(&dentry->d_u.d_child); 2345 list_del(&dentry->d_u.d_child);
1820 if (!IS_ROOT(dentry)) 2346 if (!IS_ROOT(dentry))
@@ -1829,6 +2355,13 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1829 else 2355 else
1830 INIT_LIST_HEAD(&anon->d_u.d_child); 2356 INIT_LIST_HEAD(&anon->d_u.d_child);
1831 2357
2358 write_seqcount_end(&dentry->d_seq);
2359 write_seqcount_end(&anon->d_seq);
2360
2361 dentry_unlock_parents_for_move(anon, dentry);
2362 spin_unlock(&dentry->d_lock);
2363
2364 /* anon->d_lock still locked, returns locked */
1832 anon->d_flags &= ~DCACHE_DISCONNECTED; 2365 anon->d_flags &= ~DCACHE_DISCONNECTED;
1833} 2366}
1834 2367
@@ -1846,14 +2379,15 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1846 2379
1847 BUG_ON(!d_unhashed(dentry)); 2380 BUG_ON(!d_unhashed(dentry));
1848 2381
1849 spin_lock(&dcache_lock);
1850
1851 if (!inode) { 2382 if (!inode) {
1852 actual = dentry; 2383 actual = dentry;
1853 __d_instantiate(dentry, NULL); 2384 __d_instantiate(dentry, NULL);
1854 goto found_lock; 2385 d_rehash(actual);
2386 goto out_nolock;
1855 } 2387 }
1856 2388
2389 spin_lock(&inode->i_lock);
2390
1857 if (S_ISDIR(inode->i_mode)) { 2391 if (S_ISDIR(inode->i_mode)) {
1858 struct dentry *alias; 2392 struct dentry *alias;
1859 2393
@@ -1864,13 +2398,12 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1864 /* Is this an anonymous mountpoint that we could splice 2398 /* Is this an anonymous mountpoint that we could splice
1865 * into our tree? */ 2399 * into our tree? */
1866 if (IS_ROOT(alias)) { 2400 if (IS_ROOT(alias)) {
1867 spin_lock(&alias->d_lock);
1868 __d_materialise_dentry(dentry, alias); 2401 __d_materialise_dentry(dentry, alias);
1869 __d_drop(alias); 2402 __d_drop(alias);
1870 goto found; 2403 goto found;
1871 } 2404 }
1872 /* Nope, but we must(!) avoid directory aliasing */ 2405 /* Nope, but we must(!) avoid directory aliasing */
1873 actual = __d_unalias(dentry, alias); 2406 actual = __d_unalias(inode, dentry, alias);
1874 if (IS_ERR(actual)) 2407 if (IS_ERR(actual))
1875 dput(alias); 2408 dput(alias);
1876 goto out_nolock; 2409 goto out_nolock;
@@ -1881,15 +2414,14 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1881 actual = __d_instantiate_unique(dentry, inode); 2414 actual = __d_instantiate_unique(dentry, inode);
1882 if (!actual) 2415 if (!actual)
1883 actual = dentry; 2416 actual = dentry;
1884 else if (unlikely(!d_unhashed(actual))) 2417 else
1885 goto shouldnt_be_hashed; 2418 BUG_ON(!d_unhashed(actual));
1886 2419
1887found_lock:
1888 spin_lock(&actual->d_lock); 2420 spin_lock(&actual->d_lock);
1889found: 2421found:
1890 _d_rehash(actual); 2422 _d_rehash(actual);
1891 spin_unlock(&actual->d_lock); 2423 spin_unlock(&actual->d_lock);
1892 spin_unlock(&dcache_lock); 2424 spin_unlock(&inode->i_lock);
1893out_nolock: 2425out_nolock:
1894 if (actual == dentry) { 2426 if (actual == dentry) {
1895 security_d_instantiate(dentry, inode); 2427 security_d_instantiate(dentry, inode);
@@ -1898,10 +2430,6 @@ out_nolock:
1898 2430
1899 iput(inode); 2431 iput(inode);
1900 return actual; 2432 return actual;
1901
1902shouldnt_be_hashed:
1903 spin_unlock(&dcache_lock);
1904 BUG();
1905} 2433}
1906EXPORT_SYMBOL_GPL(d_materialise_unique); 2434EXPORT_SYMBOL_GPL(d_materialise_unique);
1907 2435
@@ -1928,7 +2456,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1928 * @buffer: pointer to the end of the buffer 2456 * @buffer: pointer to the end of the buffer
1929 * @buflen: pointer to buffer length 2457 * @buflen: pointer to buffer length
1930 * 2458 *
1931 * Caller holds the dcache_lock. 2459 * Caller holds the rename_lock.
1932 * 2460 *
1933 * If path is not reachable from the supplied root, then the value of 2461 * If path is not reachable from the supplied root, then the value of
1934 * root is changed (without modifying refcounts). 2462 * root is changed (without modifying refcounts).
@@ -1956,7 +2484,9 @@ static int prepend_path(const struct path *path, struct path *root,
1956 } 2484 }
1957 parent = dentry->d_parent; 2485 parent = dentry->d_parent;
1958 prefetch(parent); 2486 prefetch(parent);
2487 spin_lock(&dentry->d_lock);
1959 error = prepend_name(buffer, buflen, &dentry->d_name); 2488 error = prepend_name(buffer, buflen, &dentry->d_name);
2489 spin_unlock(&dentry->d_lock);
1960 if (!error) 2490 if (!error)
1961 error = prepend(buffer, buflen, "/", 1); 2491 error = prepend(buffer, buflen, "/", 1);
1962 if (error) 2492 if (error)
@@ -2012,9 +2542,9 @@ char *__d_path(const struct path *path, struct path *root,
2012 int error; 2542 int error;
2013 2543
2014 prepend(&res, &buflen, "\0", 1); 2544 prepend(&res, &buflen, "\0", 1);
2015 spin_lock(&dcache_lock); 2545 write_seqlock(&rename_lock);
2016 error = prepend_path(path, root, &res, &buflen); 2546 error = prepend_path(path, root, &res, &buflen);
2017 spin_unlock(&dcache_lock); 2547 write_sequnlock(&rename_lock);
2018 2548
2019 if (error) 2549 if (error)
2020 return ERR_PTR(error); 2550 return ERR_PTR(error);
@@ -2076,12 +2606,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
2076 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2606 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2077 2607
2078 get_fs_root(current->fs, &root); 2608 get_fs_root(current->fs, &root);
2079 spin_lock(&dcache_lock); 2609 write_seqlock(&rename_lock);
2080 tmp = root; 2610 tmp = root;
2081 error = path_with_deleted(path, &tmp, &res, &buflen); 2611 error = path_with_deleted(path, &tmp, &res, &buflen);
2082 if (error) 2612 if (error)
2083 res = ERR_PTR(error); 2613 res = ERR_PTR(error);
2084 spin_unlock(&dcache_lock); 2614 write_sequnlock(&rename_lock);
2085 path_put(&root); 2615 path_put(&root);
2086 return res; 2616 return res;
2087} 2617}
@@ -2107,12 +2637,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2107 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2637 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2108 2638
2109 get_fs_root(current->fs, &root); 2639 get_fs_root(current->fs, &root);
2110 spin_lock(&dcache_lock); 2640 write_seqlock(&rename_lock);
2111 tmp = root; 2641 tmp = root;
2112 error = path_with_deleted(path, &tmp, &res, &buflen); 2642 error = path_with_deleted(path, &tmp, &res, &buflen);
2113 if (!error && !path_equal(&tmp, &root)) 2643 if (!error && !path_equal(&tmp, &root))
2114 error = prepend_unreachable(&res, &buflen); 2644 error = prepend_unreachable(&res, &buflen);
2115 spin_unlock(&dcache_lock); 2645 write_sequnlock(&rename_lock);
2116 path_put(&root); 2646 path_put(&root);
2117 if (error) 2647 if (error)
2118 res = ERR_PTR(error); 2648 res = ERR_PTR(error);
@@ -2144,7 +2674,7 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2144/* 2674/*
2145 * Write full pathname from the root of the filesystem into the buffer. 2675 * Write full pathname from the root of the filesystem into the buffer.
2146 */ 2676 */
2147char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 2677static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2148{ 2678{
2149 char *end = buf + buflen; 2679 char *end = buf + buflen;
2150 char *retval; 2680 char *retval;
@@ -2158,10 +2688,13 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2158 2688
2159 while (!IS_ROOT(dentry)) { 2689 while (!IS_ROOT(dentry)) {
2160 struct dentry *parent = dentry->d_parent; 2690 struct dentry *parent = dentry->d_parent;
2691 int error;
2161 2692
2162 prefetch(parent); 2693 prefetch(parent);
2163 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 2694 spin_lock(&dentry->d_lock);
2164 (prepend(&end, &buflen, "/", 1) != 0)) 2695 error = prepend_name(&end, &buflen, &dentry->d_name);
2696 spin_unlock(&dentry->d_lock);
2697 if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
2165 goto Elong; 2698 goto Elong;
2166 2699
2167 retval = end; 2700 retval = end;
@@ -2171,14 +2704,25 @@ char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2171Elong: 2704Elong:
2172 return ERR_PTR(-ENAMETOOLONG); 2705 return ERR_PTR(-ENAMETOOLONG);
2173} 2706}
2174EXPORT_SYMBOL(__dentry_path); 2707
2708char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
2709{
2710 char *retval;
2711
2712 write_seqlock(&rename_lock);
2713 retval = __dentry_path(dentry, buf, buflen);
2714 write_sequnlock(&rename_lock);
2715
2716 return retval;
2717}
2718EXPORT_SYMBOL(dentry_path_raw);
2175 2719
2176char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2720char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2177{ 2721{
2178 char *p = NULL; 2722 char *p = NULL;
2179 char *retval; 2723 char *retval;
2180 2724
2181 spin_lock(&dcache_lock); 2725 write_seqlock(&rename_lock);
2182 if (d_unlinked(dentry)) { 2726 if (d_unlinked(dentry)) {
2183 p = buf + buflen; 2727 p = buf + buflen;
2184 if (prepend(&p, &buflen, "//deleted", 10) != 0) 2728 if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2186,12 +2730,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2186 buflen++; 2730 buflen++;
2187 } 2731 }
2188 retval = __dentry_path(dentry, buf, buflen); 2732 retval = __dentry_path(dentry, buf, buflen);
2189 spin_unlock(&dcache_lock); 2733 write_sequnlock(&rename_lock);
2190 if (!IS_ERR(retval) && p) 2734 if (!IS_ERR(retval) && p)
 2191 *p = '/'; /* restore '/' overridden with '\0' */ 2735 *p = '/'; /* restore '/' overridden with '\0' */
2192 return retval; 2736 return retval;
2193Elong: 2737Elong:
2194 spin_unlock(&dcache_lock);
2195 return ERR_PTR(-ENAMETOOLONG); 2738 return ERR_PTR(-ENAMETOOLONG);
2196} 2739}
2197 2740
@@ -2225,7 +2768,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2225 get_fs_root_and_pwd(current->fs, &root, &pwd); 2768 get_fs_root_and_pwd(current->fs, &root, &pwd);
2226 2769
2227 error = -ENOENT; 2770 error = -ENOENT;
2228 spin_lock(&dcache_lock); 2771 write_seqlock(&rename_lock);
2229 if (!d_unlinked(pwd.dentry)) { 2772 if (!d_unlinked(pwd.dentry)) {
2230 unsigned long len; 2773 unsigned long len;
2231 struct path tmp = root; 2774 struct path tmp = root;
@@ -2234,7 +2777,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2234 2777
2235 prepend(&cwd, &buflen, "\0", 1); 2778 prepend(&cwd, &buflen, "\0", 1);
2236 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2779 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2237 spin_unlock(&dcache_lock); 2780 write_sequnlock(&rename_lock);
2238 2781
2239 if (error) 2782 if (error)
2240 goto out; 2783 goto out;
@@ -2253,8 +2796,9 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2253 if (copy_to_user(buf, cwd, len)) 2796 if (copy_to_user(buf, cwd, len))
2254 error = -EFAULT; 2797 error = -EFAULT;
2255 } 2798 }
2256 } else 2799 } else {
2257 spin_unlock(&dcache_lock); 2800 write_sequnlock(&rename_lock);
2801 }
2258 2802
2259out: 2803out:
2260 path_put(&pwd); 2804 path_put(&pwd);
@@ -2282,25 +2826,25 @@ out:
2282int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) 2826int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2283{ 2827{
2284 int result; 2828 int result;
2285 unsigned long seq; 2829 unsigned seq;
2286 2830
2287 if (new_dentry == old_dentry) 2831 if (new_dentry == old_dentry)
2288 return 1; 2832 return 1;
2289 2833
2290 /*
 2291 * Need rcu_read_lock to protect against the d_parent trashing
2292 * due to d_move
2293 */
2294 rcu_read_lock();
2295 do { 2834 do {
2296 /* for restarting inner loop in case of seq retry */ 2835 /* for restarting inner loop in case of seq retry */
2297 seq = read_seqbegin(&rename_lock); 2836 seq = read_seqbegin(&rename_lock);
2837 /*
 2838 * Need rcu_read_lock to protect against the d_parent trashing
2839 * due to d_move
2840 */
2841 rcu_read_lock();
2298 if (d_ancestor(old_dentry, new_dentry)) 2842 if (d_ancestor(old_dentry, new_dentry))
2299 result = 1; 2843 result = 1;
2300 else 2844 else
2301 result = 0; 2845 result = 0;
2846 rcu_read_unlock();
2302 } while (read_seqretry(&rename_lock, seq)); 2847 } while (read_seqretry(&rename_lock, seq));
2303 rcu_read_unlock();
2304 2848
2305 return result; 2849 return result;
2306} 2850}
@@ -2332,10 +2876,15 @@ EXPORT_SYMBOL(path_is_under);
2332 2876
2333void d_genocide(struct dentry *root) 2877void d_genocide(struct dentry *root)
2334{ 2878{
2335 struct dentry *this_parent = root; 2879 struct dentry *this_parent;
2336 struct list_head *next; 2880 struct list_head *next;
2881 unsigned seq;
2882 int locked = 0;
2337 2883
2338 spin_lock(&dcache_lock); 2884 seq = read_seqbegin(&rename_lock);
2885again:
2886 this_parent = root;
2887 spin_lock(&this_parent->d_lock);
2339repeat: 2888repeat:
2340 next = this_parent->d_subdirs.next; 2889 next = this_parent->d_subdirs.next;
2341resume: 2890resume:
@@ -2343,21 +2892,62 @@ resume:
2343 struct list_head *tmp = next; 2892 struct list_head *tmp = next;
2344 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); 2893 struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
2345 next = tmp->next; 2894 next = tmp->next;
2346 if (d_unhashed(dentry)||!dentry->d_inode) 2895
2896 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2897 if (d_unhashed(dentry) || !dentry->d_inode) {
2898 spin_unlock(&dentry->d_lock);
2347 continue; 2899 continue;
2900 }
2348 if (!list_empty(&dentry->d_subdirs)) { 2901 if (!list_empty(&dentry->d_subdirs)) {
2902 spin_unlock(&this_parent->d_lock);
2903 spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
2349 this_parent = dentry; 2904 this_parent = dentry;
2905 spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
2350 goto repeat; 2906 goto repeat;
2351 } 2907 }
2352 atomic_dec(&dentry->d_count); 2908 if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
2909 dentry->d_flags |= DCACHE_GENOCIDE;
2910 dentry->d_count--;
2911 }
2912 spin_unlock(&dentry->d_lock);
2353 } 2913 }
2354 if (this_parent != root) { 2914 if (this_parent != root) {
2355 next = this_parent->d_u.d_child.next; 2915 struct dentry *tmp;
2356 atomic_dec(&this_parent->d_count); 2916 struct dentry *child;
2357 this_parent = this_parent->d_parent; 2917
2918 tmp = this_parent->d_parent;
2919 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2920 this_parent->d_flags |= DCACHE_GENOCIDE;
2921 this_parent->d_count--;
2922 }
2923 rcu_read_lock();
2924 spin_unlock(&this_parent->d_lock);
2925 child = this_parent;
2926 this_parent = tmp;
2927 spin_lock(&this_parent->d_lock);
2928 /* might go back up the wrong parent if we have had a rename
2929 * or deletion */
2930 if (this_parent != child->d_parent ||
2931 (!locked && read_seqretry(&rename_lock, seq))) {
2932 spin_unlock(&this_parent->d_lock);
2933 rcu_read_unlock();
2934 goto rename_retry;
2935 }
2936 rcu_read_unlock();
2937 next = child->d_u.d_child.next;
2358 goto resume; 2938 goto resume;
2359 } 2939 }
2360 spin_unlock(&dcache_lock); 2940 spin_unlock(&this_parent->d_lock);
2941 if (!locked && read_seqretry(&rename_lock, seq))
2942 goto rename_retry;
2943 if (locked)
2944 write_sequnlock(&rename_lock);
2945 return;
2946
2947rename_retry:
2948 locked = 1;
2949 write_seqlock(&rename_lock);
2950 goto again;
2361} 2951}
2362 2952
2363/** 2953/**
@@ -2411,7 +3001,7 @@ static void __init dcache_init_early(void)
2411 3001
2412 dentry_hashtable = 3002 dentry_hashtable =
2413 alloc_large_system_hash("Dentry cache", 3003 alloc_large_system_hash("Dentry cache",
2414 sizeof(struct hlist_head), 3004 sizeof(struct dcache_hash_bucket),
2415 dhash_entries, 3005 dhash_entries,
2416 13, 3006 13,
2417 HASH_EARLY, 3007 HASH_EARLY,
@@ -2420,16 +3010,13 @@ static void __init dcache_init_early(void)
2420 0); 3010 0);
2421 3011
2422 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3012 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2423 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3013 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2424} 3014}
2425 3015
2426static void __init dcache_init(void) 3016static void __init dcache_init(void)
2427{ 3017{
2428 int loop; 3018 int loop;
2429 3019
2430 percpu_counter_init(&nr_dentry, 0);
2431 percpu_counter_init(&nr_dentry_unused, 0);
2432
2433 /* 3020 /*
2434 * A constructor could be added for stable state like the lists, 3021 * A constructor could be added for stable state like the lists,
2435 * but it is probably not worth it because of the cache nature 3022 * but it is probably not worth it because of the cache nature
@@ -2446,7 +3033,7 @@ static void __init dcache_init(void)
2446 3033
2447 dentry_hashtable = 3034 dentry_hashtable =
2448 alloc_large_system_hash("Dentry cache", 3035 alloc_large_system_hash("Dentry cache",
2449 sizeof(struct hlist_head), 3036 sizeof(struct dcache_hash_bucket),
2450 dhash_entries, 3037 dhash_entries,
2451 13, 3038 13,
2452 0, 3039 0,
@@ -2455,7 +3042,7 @@ static void __init dcache_init(void)
2455 0); 3042 0);
2456 3043
2457 for (loop = 0; loop < (1 << d_hash_shift); loop++) 3044 for (loop = 0; loop < (1 << d_hash_shift); loop++)
2458 INIT_HLIST_HEAD(&dentry_hashtable[loop]); 3045 INIT_HLIST_BL_HEAD(&dentry_hashtable[loop].head);
2459} 3046}
2460 3047
2461/* SLAB cache for __getname() consumers */ 3048/* SLAB cache for __getname() consumers */
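
A minimal sketch of the read-side retry idiom the fs/dcache.c hunks above build on; this is not code from the patch itself. It is written as if it lived inside fs/dcache.c, where rename_lock, IS_ROOT() and the dcache headers are already in scope, and count_ancestors() is an invented name. Readers sample rename_lock with read_seqbegin(), walk d_parent pointers under rcu_read_lock(), and retry if a concurrent d_move() bumped the sequence count, which is the same shape as the reworked is_subdir() above.

/* Sketch only: count how deep a dentry sits, tolerating concurrent renames. */
static int count_ancestors(struct dentry *dentry)
{
	struct dentry *d;
	unsigned seq;
	int depth;

	do {
		depth = 0;
		seq = read_seqbegin(&rename_lock);
		rcu_read_lock();
		for (d = dentry; !IS_ROOT(d); d = d->d_parent)
			depth++;
		rcu_read_unlock();
		/* a rename raced with the walk: discard the result and retry */
	} while (read_seqretry(&rename_lock, seq));

	return depth;
}

The writer side (d_move() and the locked fallback paths in the hunks above) takes write_seqlock(&rename_lock) where it previously took the global dcache_lock.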
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 906e803f7f79..6fc4f319b550 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -44,12 +44,17 @@
44 */ 44 */
45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) 45static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU)
54 return -ECHILD;
55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
53 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
54 goto out; 59 goto out;
55 dentry_save = nd->path.dentry; 60 dentry_save = nd->path.dentry;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 9d1a22d62765..337352a94751 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -260,7 +260,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
260 ecryptfs_dentry->d_parent)); 260 ecryptfs_dentry->d_parent));
261 lower_inode = lower_dentry->d_inode; 261 lower_inode = lower_dentry->d_inode;
262 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); 262 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
263 BUG_ON(!atomic_read(&lower_dentry->d_count)); 263 BUG_ON(!lower_dentry->d_count);
264 ecryptfs_set_dentry_private(ecryptfs_dentry, 264 ecryptfs_set_dentry_private(ecryptfs_dentry,
265 kmem_cache_alloc(ecryptfs_dentry_info_cache, 265 kmem_cache_alloc(ecryptfs_dentry_info_cache,
266 GFP_KERNEL)); 266 GFP_KERNEL));
@@ -441,7 +441,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
441 struct qstr lower_name; 441 struct qstr lower_name;
442 int rc = 0; 442 int rc = 0;
443 443
444 ecryptfs_dentry->d_op = &ecryptfs_dops; 444 d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
445 if ((ecryptfs_dentry->d_name.len == 1 445 if ((ecryptfs_dentry->d_name.len == 1
446 && !strcmp(ecryptfs_dentry->d_name.name, ".")) 446 && !strcmp(ecryptfs_dentry->d_name.name, "."))
447 || (ecryptfs_dentry->d_name.len == 2 447 || (ecryptfs_dentry->d_name.len == 2
@@ -454,7 +454,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
454 lower_name.hash = ecryptfs_dentry->d_name.hash; 454 lower_name.hash = ecryptfs_dentry->d_name.hash;
455 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 455 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
456 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 456 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
457 &lower_name); 457 lower_dir_dentry->d_inode, &lower_name);
458 if (rc < 0) 458 if (rc < 0)
459 goto out_d_drop; 459 goto out_d_drop;
460 } 460 }
@@ -489,7 +489,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
489 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 489 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
490 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 490 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
491 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 491 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
492 &lower_name); 492 lower_dir_dentry->d_inode, &lower_name);
493 if (rc < 0) 493 if (rc < 0)
494 goto out_d_drop; 494 goto out_d_drop;
495 } 495 }
@@ -980,8 +980,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
980} 980}
981 981
982static int 982static int
983ecryptfs_permission(struct inode *inode, int mask) 983ecryptfs_permission(struct inode *inode, int mask, unsigned int flags)
984{ 984{
985 if (flags & IPERM_FLAG_RCU)
986 return -ECHILD;
985 return inode_permission(ecryptfs_inode_to_lower(inode), mask); 987 return inode_permission(ecryptfs_inode_to_lower(inode), mask);
986} 988}
987 989
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index a9dbd62518e6..351038675376 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -189,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
189 if (special_file(lower_inode->i_mode)) 189 if (special_file(lower_inode->i_mode))
190 init_special_inode(inode, lower_inode->i_mode, 190 init_special_inode(inode, lower_inode->i_mode,
191 lower_inode->i_rdev); 191 lower_inode->i_rdev);
192 dentry->d_op = &ecryptfs_dops; 192 d_set_d_op(dentry, &ecryptfs_dops);
193 fsstack_copy_attr_all(inode, lower_inode); 193 fsstack_copy_attr_all(inode, lower_inode);
194 /* This size will be overwritten for real files w/ headers and 194 /* This size will be overwritten for real files w/ headers and
195 * other metadata */ 195 * other metadata */
@@ -594,7 +594,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
594 deactivate_locked_super(s); 594 deactivate_locked_super(s);
595 goto out; 595 goto out;
596 } 596 }
597 s->s_root->d_op = &ecryptfs_dops; 597 d_set_d_op(s->s_root, &ecryptfs_dops);
598 s->s_root->d_sb = s; 598 s->s_root->d_sb = s;
599 s->s_root->d_parent = s->s_root; 599 s->s_root->d_parent = s->s_root;
600 600
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 253732382d37..3042fe123a34 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/seq_file.h> 30#include <linux/seq_file.h>
31#include <linux/smp_lock.h>
32#include <linux/file.h> 31#include <linux/file.h>
33#include <linux/crypto.h> 32#include <linux/crypto.h>
34#include "ecryptfs_kernel.h" 33#include "ecryptfs_kernel.h"
@@ -63,6 +62,16 @@ out:
63 return inode; 62 return inode;
64} 63}
65 64
65static void ecryptfs_i_callback(struct rcu_head *head)
66{
67 struct inode *inode = container_of(head, struct inode, i_rcu);
68 struct ecryptfs_inode_info *inode_info;
69 inode_info = ecryptfs_inode_to_private(inode);
70
71 INIT_LIST_HEAD(&inode->i_dentry);
72 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
73}
74
66/** 75/**
67 * ecryptfs_destroy_inode 76 * ecryptfs_destroy_inode
68 * @inode: The ecryptfs inode 77 * @inode: The ecryptfs inode
@@ -89,7 +98,7 @@ static void ecryptfs_destroy_inode(struct inode *inode)
89 } 98 }
90 } 99 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 100 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
92 kmem_cache_free(ecryptfs_inode_info_cache, inode_info); 101 call_rcu(&inode->i_rcu, ecryptfs_i_callback);
93} 102}
94 103
95/** 104/**
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 5073a07652cc..0f31acb0131c 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -65,11 +65,18 @@ static struct inode *efs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 65 return &ei->vfs_inode;
66} 66}
67 67
68static void efs_destroy_inode(struct inode *inode) 68static void efs_i_callback(struct rcu_head *head)
69{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); 72 kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
71} 73}
72 74
75static void efs_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, efs_i_callback);
78}
79
73static void init_once(void *foo) 80static void init_once(void *foo)
74{ 81{
75 struct efs_inode_info *ei = (struct efs_inode_info *) foo; 82 struct efs_inode_info *ei = (struct efs_inode_info *) foo;
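
The efs conversion above has the same shape that ecryptfs earlier in this diff and exofs, ext2, ext3, ext4 and fat below receive: ->destroy_inode() stops freeing the inode directly and instead defers the kmem_cache_free() through call_rcu(), so lock-free (RCU-walk) lookups that still hold a pointer to the inode never see freed memory, and each callback re-initialises i_dentry before handing the object back to the slab. A hedged sketch of the pattern follows; example_inode_cachep, EXAMPLE_I(), struct example_inode_info and the function names are invented placeholders, not symbols from the patch.

/* Sketch of the per-filesystem RCU-deferred inode free used in this series. */
struct example_inode_info {
	/* filesystem-private fields would live here */
	struct inode vfs_inode;
};

static struct kmem_cache *example_inode_cachep;	/* placeholder slab cache */

#define EXAMPLE_I(inode) container_of(inode, struct example_inode_info, vfs_inode)

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* mirror the hunks above: reset i_dentry before the slab reuses the object */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(example_inode_cachep, EXAMPLE_I(inode));
}

static void example_destroy_inode(struct inode *inode)
{
	/* defer the actual free until after an RCU grace period */
	call_rcu(&inode->i_rcu, example_i_callback);
}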
diff --git a/fs/exec.c b/fs/exec.c
index 99d33a1371e9..c62efcb959c7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -164,7 +164,26 @@ out:
164 164
165#ifdef CONFIG_MMU 165#ifdef CONFIG_MMU
166 166
167static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 167void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
168{
169 struct mm_struct *mm = current->mm;
170 long diff = (long)(pages - bprm->vma_pages);
171
172 if (!mm || !diff)
173 return;
174
175 bprm->vma_pages = pages;
176
177#ifdef SPLIT_RSS_COUNTING
178 add_mm_counter(mm, MM_ANONPAGES, diff);
179#else
180 spin_lock(&mm->page_table_lock);
181 add_mm_counter(mm, MM_ANONPAGES, diff);
182 spin_unlock(&mm->page_table_lock);
183#endif
184}
185
186struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
168 int write) 187 int write)
169{ 188{
170 struct page *page; 189 struct page *page;
@@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
186 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; 205 unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
187 struct rlimit *rlim; 206 struct rlimit *rlim;
188 207
208 acct_arg_size(bprm, size / PAGE_SIZE);
209
189 /* 210 /*
190 * We've historically supported up to 32 pages (ARG_MAX) 211 * We've historically supported up to 32 pages (ARG_MAX)
191 * of argument strings even with small stacks 212 * of argument strings even with small stacks
@@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
254 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 275 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
255 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 276 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
256 INIT_LIST_HEAD(&vma->anon_vma_chain); 277 INIT_LIST_HEAD(&vma->anon_vma_chain);
278
279 err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
280 if (err)
281 goto err;
282
257 err = insert_vm_struct(mm, vma); 283 err = insert_vm_struct(mm, vma);
258 if (err) 284 if (err)
259 goto err; 285 goto err;
@@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
276 302
277#else 303#else
278 304
279static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 305void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
306{
307}
308
309struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
280 int write) 310 int write)
281{ 311{
282 struct page *page; 312 struct page *page;
@@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm)
1003 /* 1033 /*
1004 * Release all of the old mmap stuff 1034 * Release all of the old mmap stuff
1005 */ 1035 */
1036 acct_arg_size(bprm, 0);
1006 retval = exec_mmap(bprm->mm); 1037 retval = exec_mmap(bprm->mm);
1007 if (retval) 1038 if (retval)
1008 goto out; 1039 goto out;
@@ -1426,8 +1457,10 @@ int do_execve(const char * filename,
1426 return retval; 1457 return retval;
1427 1458
1428out: 1459out:
1429 if (bprm->mm) 1460 if (bprm->mm) {
1430 mmput (bprm->mm); 1461 acct_arg_size(bprm, 0);
1462 mmput(bprm->mm);
1463 }
1431 1464
1432out_file: 1465out_file:
1433 if (bprm->file) { 1466 if (bprm->file) {
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 79c3ae6e0456..8c6c4669b381 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -150,12 +150,19 @@ static struct inode *exofs_alloc_inode(struct super_block *sb)
150 return &oi->vfs_inode; 150 return &oi->vfs_inode;
151} 151}
152 152
153static void exofs_i_callback(struct rcu_head *head)
154{
155 struct inode *inode = container_of(head, struct inode, i_rcu);
156 INIT_LIST_HEAD(&inode->i_dentry);
157 kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
158}
159
153/* 160/*
154 * Remove an inode from the cache 161 * Remove an inode from the cache
155 */ 162 */
156static void exofs_destroy_inode(struct inode *inode) 163static void exofs_destroy_inode(struct inode *inode)
157{ 164{
158 kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); 165 call_rcu(&inode->i_rcu, exofs_i_callback);
159} 166}
160 167
161/* 168/*
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 51b304056f10..4b6825740dd5 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *result,
43 void *context) 43 void *context)
44{ 44{
45 struct dentry *dentry, *toput = NULL; 45 struct dentry *dentry, *toput = NULL;
46 struct inode *inode;
46 47
47 if (acceptable(context, result)) 48 if (acceptable(context, result))
48 return result; 49 return result;
49 50
50 spin_lock(&dcache_lock); 51 inode = result->d_inode;
51 list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { 52 spin_lock(&inode->i_lock);
52 dget_locked(dentry); 53 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
53 spin_unlock(&dcache_lock); 54 dget(dentry);
55 spin_unlock(&inode->i_lock);
54 if (toput) 56 if (toput)
55 dput(toput); 57 dput(toput);
56 if (dentry != result && acceptable(context, dentry)) { 58 if (dentry != result && acceptable(context, dentry)) {
57 dput(result); 59 dput(result);
58 return dentry; 60 return dentry;
59 } 61 }
60 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
61 toput = dentry; 63 toput = dentry;
62 } 64 }
63 spin_unlock(&dcache_lock); 65 spin_unlock(&inode->i_lock);
64 66
65 if (toput) 67 if (toput)
66 dput(toput); 68 dput(toput);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 2bcc0431bada..7b4180554a62 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -232,10 +232,17 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
232} 232}
233 233
234int 234int
235ext2_check_acl(struct inode *inode, int mask) 235ext2_check_acl(struct inode *inode, int mask, unsigned int flags)
236{ 236{
237 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 237 struct posix_acl *acl;
238
239 if (flags & IPERM_FLAG_RCU) {
240 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
241 return -ECHILD;
242 return -EAGAIN;
243 }
238 244
245 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
239 if (IS_ERR(acl)) 246 if (IS_ERR(acl))
240 return PTR_ERR(acl); 247 return PTR_ERR(acl);
241 if (acl) { 248 if (acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 3ff6cbb9ac44..c939b7b12099 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_check_acl (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int, unsigned int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d89e0b6a2d78..e0c6380ff992 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,11 +161,18 @@ static struct inode *ext2_alloc_inode(struct super_block *sb)
161 return &ei->vfs_inode; 161 return &ei->vfs_inode;
162} 162}
163 163
164static void ext2_destroy_inode(struct inode *inode) 164static void ext2_i_callback(struct rcu_head *head)
165{ 165{
166 struct inode *inode = container_of(head, struct inode, i_rcu);
167 INIT_LIST_HEAD(&inode->i_dentry);
166 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); 168 kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
167} 169}
168 170
171static void ext2_destroy_inode(struct inode *inode)
172{
173 call_rcu(&inode->i_rcu, ext2_i_callback);
174}
175
169static void init_once(void *foo) 176static void init_once(void *foo)
170{ 177{
171 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; 178 struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 8a11fe212183..e4fa49e6c539 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -240,10 +240,17 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
240} 240}
241 241
242int 242int
243ext3_check_acl(struct inode *inode, int mask) 243ext3_check_acl(struct inode *inode, int mask, unsigned int flags)
244{ 244{
245 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 245 struct posix_acl *acl;
246
247 if (flags & IPERM_FLAG_RCU) {
248 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
249 return -ECHILD;
250 return -EAGAIN;
251 }
246 252
253 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
247 if (IS_ERR(acl)) 254 if (IS_ERR(acl))
248 return PTR_ERR(acl); 255 return PTR_ERR(acl);
249 if (acl) { 256 if (acl) {
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 597334626de9..5faf8048e906 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_check_acl (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int, unsigned int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b5012..77ce1616f725 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -27,7 +27,6 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/parser.h> 29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/exportfs.h> 31#include <linux/exportfs.h>
33#include <linux/vfs.h> 32#include <linux/vfs.h>
@@ -480,6 +479,13 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
480 return &ei->vfs_inode; 479 return &ei->vfs_inode;
481} 480}
482 481
482static void ext3_i_callback(struct rcu_head *head)
483{
484 struct inode *inode = container_of(head, struct inode, i_rcu);
485 INIT_LIST_HEAD(&inode->i_dentry);
486 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
487}
488
483static void ext3_destroy_inode(struct inode *inode) 489static void ext3_destroy_inode(struct inode *inode)
484{ 490{
485 if (!list_empty(&(EXT3_I(inode)->i_orphan))) { 491 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
@@ -490,7 +496,7 @@ static void ext3_destroy_inode(struct inode *inode)
490 false); 496 false);
491 dump_stack(); 497 dump_stack();
492 } 498 }
493 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); 499 call_rcu(&inode->i_rcu, ext3_i_callback);
494} 500}
495 501
496static void init_once(void *foo) 502static void init_once(void *foo)
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5e2ed4504ead..e0270d1f8d82 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -238,10 +238,17 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
238} 238}
239 239
240int 240int
241ext4_check_acl(struct inode *inode, int mask) 241ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
242{ 242{
243 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 243 struct posix_acl *acl;
244
245 if (flags & IPERM_FLAG_RCU) {
246 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
247 return -ECHILD;
248 return -EAGAIN;
249 }
244 250
251 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
245 if (IS_ERR(acl)) 252 if (IS_ERR(acl))
246 return PTR_ERR(acl); 253 return PTR_ERR(acl);
247 if (acl) { 254 if (acl) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9d843d5deac4..dec821168fd4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_check_acl(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int, unsigned int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
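
The ext2, ext3 and ext4 ->check_acl() changes above all follow one rule: when called with IPERM_FLAG_RCU the helper runs inside an RCU-walk path lookup and must not block, so it may only proceed when the ACL cache already says the inode has no ACL, and otherwise returns -ECHILD so the caller retries in ref-walk mode. A sketch of that shape, assuming the same negative_cached_acl() helper the hunks use; example_check_acl() and example_get_acl() are invented names, not patch symbols.

/* Placeholder for the filesystem-specific ACL reader (may block). */
static struct posix_acl *example_get_acl(struct inode *inode, int type);

static int example_check_acl(struct inode *inode, int mask, unsigned int flags)
{
	struct posix_acl *acl;
	int error;

	if (flags & IPERM_FLAG_RCU) {
		/* RCU-walk: no blocking allowed, answer from the cache or punt */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;	/* no negative cache entry: drop to ref-walk */
		return -EAGAIN;		/* cached "no ACL": ordinary mode bits decide */
	}

	acl = example_get_acl(inode, ACL_TYPE_ACCESS);	/* may read from disk */
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
	return -EAGAIN;
}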
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f82..94ce3d7a1c4b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
177 177
178struct ext4_io_page { 178struct ext4_io_page {
179 struct page *p_page; 179 struct page *p_page;
180 int p_count; 180 atomic_t p_count;
181}; 181};
182 182
183#define MAX_IO_PAGES 128 183#define MAX_IO_PAGES 128
@@ -858,6 +858,7 @@ struct ext4_inode_info {
858 spinlock_t i_completed_io_lock; 858 spinlock_t i_completed_io_lock;
859 /* current io_end structure for async DIO write*/ 859 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 860 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
861 862
862 /* 863 /*
863 * Transactions that contain inode's metadata needed to complete 864 * Transactions that contain inode's metadata needed to complete
@@ -909,6 +910,7 @@ struct ext4_inode_info {
909#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 910#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
910#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 911#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
911#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 912#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
913#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
912#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 914#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
913#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 915#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
914#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 916#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -2060,6 +2062,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2060/* page-io.c */ 2062/* page-io.c */
2061extern int __init ext4_init_pageio(void); 2063extern int __init ext4_init_pageio(void);
2062extern void ext4_exit_pageio(void); 2064extern void ext4_exit_pageio(void);
2065extern void ext4_ioend_wait(struct inode *);
2063extern void ext4_free_io_end(ext4_io_end_t *io); 2066extern void ext4_free_io_end(ext4_io_end_t *io);
2064extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2067extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2065extern int ext4_end_io_nolock(ext4_io_end_t *io); 2068extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d78342f3bf0..e659597b690b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
53static inline int ext4_begin_ordered_truncate(struct inode *inode, 53static inline int ext4_begin_ordered_truncate(struct inode *inode,
54 loff_t new_size) 54 loff_t new_size)
55{ 55{
56 trace_ext4_begin_ordered_truncate(inode, new_size);
56 return jbd2_journal_begin_ordered_truncate( 57 return jbd2_journal_begin_ordered_truncate(
57 EXT4_SB(inode->i_sb)->s_journal, 58 EXT4_SB(inode->i_sb)->s_journal,
58 &EXT4_I(inode)->jinode, 59 &EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
178 handle_t *handle; 179 handle_t *handle;
179 int err; 180 int err;
180 181
182 trace_ext4_evict_inode(inode);
181 if (inode->i_nlink) { 183 if (inode->i_nlink) {
182 truncate_inode_pages(&inode->i_data, 0); 184 truncate_inode_pages(&inode->i_data, 0);
183 goto no_delete; 185 goto no_delete;
@@ -2123,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2123 */ 2125 */
2124 if (unlikely(journal_data && PageChecked(page))) 2126 if (unlikely(journal_data && PageChecked(page)))
2125 err = __ext4_journalled_writepage(page, len); 2127 err = __ext4_journalled_writepage(page, len);
2126 else 2128 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2127 err = ext4_bio_write_page(&io_submit, page, 2129 err = ext4_bio_write_page(&io_submit, page,
2128 len, mpd->wbc); 2130 len, mpd->wbc);
2131 else
2132 err = block_write_full_page(page,
2133 noalloc_get_block_write, mpd->wbc);
2129 2134
2130 if (!err) 2135 if (!err)
2131 mpd->pages_written++; 2136 mpd->pages_written++;
@@ -5647,6 +5652,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5647 int err, ret; 5652 int err, ret;
5648 5653
5649 might_sleep(); 5654 might_sleep();
5655 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5650 err = ext4_reserve_inode_write(handle, inode, &iloc); 5656 err = ext4_reserve_inode_write(handle, inode, &iloc);
5651 if (ext4_handle_valid(handle) && 5657 if (ext4_handle_valid(handle) &&
5652 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5658 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae883b1bd..eb3bc2fe647e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
331 return err; 331 return err;
332 } 332 }
333 333
334 case FITRIM:
335 {
336 struct super_block *sb = inode->i_sb;
337 struct fstrim_range range;
338 int ret = 0;
339
340 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM;
342
343 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range)))
345 return -EFAULT;
346
347 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0)
349 return ret;
350
351 if (copy_to_user((struct fstrim_range *)arg, &range,
352 sizeof(range)))
353 return -EFAULT;
354
355 return 0;
356 }
357
334 default: 358 default:
335 return -ENOTTY; 359 return -ENOTTY;
336 } 360 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724a..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
4640 * with group lock held. generate_buddy look at 4640 * with group lock held. generate_buddy look at
4641 * them with group lock_held 4641 * them with group lock_held
4642 */ 4642 */
4643 if (test_opt(sb, DISCARD))
4644 ext4_issue_discard(sb, block_group, bit, count);
4645 ext4_lock_group(sb, block_group); 4643 ext4_lock_group(sb, block_group);
4646 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4647 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 92203b8a099f..dc40e75cba88 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
872 if (namelen > EXT4_NAME_LEN) 872 if (namelen > EXT4_NAME_LEN)
873 return NULL; 873 return NULL;
874 if ((namelen <= 2) && (name[0] == '.') && 874 if ((namelen <= 2) && (name[0] == '.') &&
875 (name[1] == '.' || name[1] == '0')) { 875 (name[1] == '.' || name[1] == '\0')) {
876 /* 876 /*
877 * "." or ".." will only be in the first block 877 * "." or ".." will only be in the first block
878 * NFS may look up ".."; "." should be handled by the VFS 878 * NFS may look up ".."; "." should be handled by the VFS
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..beacce11ac50 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
35int __init ext4_init_pageio(void) 39int __init ext4_init_pageio(void)
36{ 40{
41 int i;
42
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
39 return -ENOMEM; 45 return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
42 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM; 49 return -ENOMEM;
44 } 50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
45 53
46 return 0; 54 return 0;
47} 55}
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
52 kmem_cache_destroy(io_page_cachep); 60 kmem_cache_destroy(io_page_cachep);
53} 61}
54 62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
69
70static void put_io_page(struct ext4_io_page *io_page)
71{
72 if (atomic_dec_and_test(&io_page->p_count)) {
73 end_page_writeback(io_page->p_page);
74 put_page(io_page->p_page);
75 kmem_cache_free(io_page_cachep, io_page);
76 }
77}
78
55void ext4_free_io_end(ext4_io_end_t *io) 79void ext4_free_io_end(ext4_io_end_t *io)
56{ 80{
57 int i; 81 int i;
82 wait_queue_head_t *wq;
58 83
59 BUG_ON(!io); 84 BUG_ON(!io);
60 if (io->page) 85 if (io->page)
61 put_page(io->page); 86 put_page(io->page);
62 for (i = 0; i < io->num_io_pages; i++) { 87 for (i = 0; i < io->num_io_pages; i++)
63 if (--io->pages[i]->p_count == 0) { 88 put_io_page(io->pages[i]);
64 struct page *page = io->pages[i]->p_page;
65
66 end_page_writeback(page);
67 put_page(page);
68 kmem_cache_free(io_page_cachep, io->pages[i]);
69 }
70 }
71 io->num_io_pages = 0; 89 io->num_io_pages = 0;
72 iput(io->inode); 90 wq = to_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq))
93 wake_up_all(wq);
73 kmem_cache_free(io_end_cachep, io); 94 kmem_cache_free(io_end_cachep, io);
74} 95}
75 96
@@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
142 io = kmem_cache_alloc(io_end_cachep, flags); 163 io = kmem_cache_alloc(io_end_cachep, flags);
143 if (io) { 164 if (io) {
144 memset(io, 0, sizeof(*io)); 165 memset(io, 0, sizeof(*io));
145 io->inode = igrab(inode); 166 atomic_inc(&EXT4_I(inode)->i_ioend_count);
146 BUG_ON(!io->inode); 167 io->inode = inode;
147 INIT_WORK(&io->work, ext4_end_io_work); 168 INIT_WORK(&io->work, ext4_end_io_work);
148 INIT_LIST_HEAD(&io->list); 169 INIT_LIST_HEAD(&io->list);
149 } 170 }
@@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error)
171 struct workqueue_struct *wq; 192 struct workqueue_struct *wq;
172 struct inode *inode; 193 struct inode *inode;
173 unsigned long flags; 194 unsigned long flags;
174 ext4_fsblk_t err_block;
175 int i; 195 int i;
176 196
177 BUG_ON(!io_end); 197 BUG_ON(!io_end);
178 inode = io_end->inode;
179 bio->bi_private = NULL; 198 bio->bi_private = NULL;
180 bio->bi_end_io = NULL; 199 bio->bi_end_io = NULL;
181 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 200 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
182 error = 0; 201 error = 0;
183 err_block = bio->bi_sector >> (inode->i_blkbits - 9);
184 bio_put(bio); 202 bio_put(bio);
185 203
186 if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
187 pr_err("sb umounted, discard end_io request for inode %lu\n",
188 io_end->inode->i_ino);
189 ext4_free_io_end(io_end);
190 return;
191 }
192
193 if (error) {
194 io_end->flag |= EXT4_IO_END_ERROR;
195 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
196 "(offset %llu size %ld starting block %llu)",
197 inode->i_ino,
198 (unsigned long long) io_end->offset,
199 (long) io_end->size,
200 (unsigned long long) err_block);
201 }
202
203 for (i = 0; i < io_end->num_io_pages; i++) { 204 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 205 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 206 struct buffer_head *bh, *head;
@@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error)
236 } while (bh != head); 237 } while (bh != head);
237 } 238 }
238 239
239 if (--io_end->pages[i]->p_count == 0) {
240 struct page *page = io_end->pages[i]->p_page;
241
242 end_page_writeback(page);
243 put_page(page);
244 kmem_cache_free(io_page_cachep, io_end->pages[i]);
245 }
246
247 /* 240 /*
248 * If this is a partial write which happened to make 241 * If this is a partial write which happened to make
249 * all buffers uptodate then we can optimize away a 242 * all buffers uptodate then we can optimize away a
@@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error)
253 */ 246 */
254 if (!partial_write) 247 if (!partial_write)
255 SetPageUptodate(page); 248 SetPageUptodate(page);
256 }
257 249
250 put_io_page(io_end->pages[i]);
251 }
258 io_end->num_io_pages = 0; 252 io_end->num_io_pages = 0;
253 inode = io_end->inode;
254
255 if (error) {
256 io_end->flag |= EXT4_IO_END_ERROR;
257 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
258 "(offset %llu size %ld starting block %llu)",
259 inode->i_ino,
260 (unsigned long long) io_end->offset,
261 (long) io_end->size,
262 (unsigned long long)
263 bio->bi_sector >> (inode->i_blkbits - 9));
264 }
259 265
260 /* Add the io_end to per-inode completed io list*/ 266 /* Add the io_end to per-inode completed io list*/
261 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 267 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io,
305 bio->bi_private = io->io_end = io_end; 311 bio->bi_private = io->io_end = io_end;
306 bio->bi_end_io = ext4_end_bio; 312 bio->bi_end_io = ext4_end_bio;
307 313
308 io_end->inode = inode;
309 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 314 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
310 315
311 io->io_bio = bio; 316 io->io_bio = bio;
@@ -360,7 +365,7 @@ submit_and_retry:
360 if ((io_end->num_io_pages == 0) || 365 if ((io_end->num_io_pages == 0) ||
361 (io_end->pages[io_end->num_io_pages-1] != io_page)) { 366 (io_end->pages[io_end->num_io_pages-1] != io_page)) {
362 io_end->pages[io_end->num_io_pages++] = io_page; 367 io_end->pages[io_end->num_io_pages++] = io_page;
363 io_page->p_count++; 368 atomic_inc(&io_page->p_count);
364 } 369 }
365 return 0; 370 return 0;
366} 371}
@@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
389 return -ENOMEM; 394 return -ENOMEM;
390 } 395 }
391 io_page->p_page = page; 396 io_page->p_page = page;
392 io_page->p_count = 0; 397 atomic_set(&io_page->p_count, 1);
393 get_page(page); 398 get_page(page);
394 399
395 for (bh = head = page_buffers(page), block_start = 0; 400 for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 426 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 427 * wedging later on.
423 */ 428 */
424 if (io_page->p_count == 0) { 429 put_io_page(io_page);
425 put_page(page);
426 end_page_writeback(page);
427 kmem_cache_free(io_page_cachep, io_page);
428 }
429 return ret; 430 return ret;
430} 431}
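
Beyond switching p_count to an atomic_t, the page-io hunks above add a counting handshake so an inode cannot be torn down while async io_end work is still in flight: ext4_init_io_end() bumps EXT4_I(inode)->i_ioend_count, ext4_free_io_end() drops it and wakes a hashed wait queue, and ext4_destroy_inode() waits for the count to drain via ext4_ioend_wait(). A stripped-down sketch of that handshake with invented names (pending_io, pending_io_wq and the three helpers); it is not ext4 code.

static atomic_t pending_io = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(pending_io_wq);

static void example_io_start(void)
{
	atomic_inc(&pending_io);	/* one more outstanding completion */
}

static void example_io_done(void)
{
	/* the last completion wakes anyone waiting for the count to reach zero */
	if (atomic_dec_and_test(&pending_io) && waitqueue_active(&pending_io_wq))
		wake_up_all(&pending_io_wq);
}

static void example_io_wait(void)
{
	wait_event(pending_io_wq, atomic_read(&pending_io) == 0);
}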
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index dc963929de65..981c8477adab 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb,
232 GFP_NOFS); 232 GFP_NOFS);
233 if (err) 233 if (err)
234 goto exit_bh; 234 goto exit_bh;
235 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++)
236 ext4_set_bit(bit, bh->b_data);
235 237
236 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 238 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
237 input->block_bitmap - start); 239 input->block_bitmap - start);
@@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb,
247 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 249 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
248 if (err) 250 if (err)
249 goto exit_bh; 251 goto exit_bh;
252 for (i = 0, bit = input->inode_table - start;
253 i < sbi->s_itb_per_group; i++, bit++)
254 ext4_set_bit(bit, bh->b_data);
250 255
251 if ((err = extend_or_restart_transaction(handle, 2, bh))) 256 if ((err = extend_or_restart_transaction(handle, 2, bh)))
252 goto exit_bh; 257 goto exit_bh;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af6..cd37f9d5e447 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,29 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
828 ei->cur_aio_dio = NULL; 828 ei->cur_aio_dio = NULL;
829 ei->i_sync_tid = 0; 829 ei->i_sync_tid = 0;
830 ei->i_datasync_tid = 0; 830 ei->i_datasync_tid = 0;
831 atomic_set(&ei->i_ioend_count, 0);
831 832
832 return &ei->vfs_inode; 833 return &ei->vfs_inode;
833} 834}
834 835
836static int ext4_drop_inode(struct inode *inode)
837{
838 int drop = generic_drop_inode(inode);
839
840 trace_ext4_drop_inode(inode, drop);
841 return drop;
842}
843
844static void ext4_i_callback(struct rcu_head *head)
845{
846 struct inode *inode = container_of(head, struct inode, i_rcu);
847 INIT_LIST_HEAD(&inode->i_dentry);
848 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
849}
850
835static void ext4_destroy_inode(struct inode *inode) 851static void ext4_destroy_inode(struct inode *inode)
836{ 852{
853 ext4_ioend_wait(inode);
837 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 854 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
838 ext4_msg(inode->i_sb, KERN_ERR, 855 ext4_msg(inode->i_sb, KERN_ERR,
839 "Inode %lu (%p): orphan list check failed!", 856 "Inode %lu (%p): orphan list check failed!",
@@ -843,7 +860,7 @@ static void ext4_destroy_inode(struct inode *inode)
843 true); 860 true);
844 dump_stack(); 861 dump_stack();
845 } 862 }
846 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 863 call_rcu(&inode->i_rcu, ext4_i_callback);
847} 864}
848 865
849static void init_once(void *foo) 866static void init_once(void *foo)
@@ -1016,6 +1033,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1016 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1033 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1017 seq_puts(seq, ",nodelalloc"); 1034 seq_puts(seq, ",nodelalloc");
1018 1035
1036 if (test_opt(sb, MBLK_IO_SUBMIT))
1037 seq_puts(seq, ",mblk_io_submit");
1019 if (sbi->s_stripe) 1038 if (sbi->s_stripe)
1020 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1039 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1021 /* 1040 /*
@@ -1173,6 +1192,7 @@ static const struct super_operations ext4_sops = {
1173 .destroy_inode = ext4_destroy_inode, 1192 .destroy_inode = ext4_destroy_inode,
1174 .write_inode = ext4_write_inode, 1193 .write_inode = ext4_write_inode,
1175 .dirty_inode = ext4_dirty_inode, 1194 .dirty_inode = ext4_dirty_inode,
1195 .drop_inode = ext4_drop_inode,
1176 .evict_inode = ext4_evict_inode, 1196 .evict_inode = ext4_evict_inode,
1177 .put_super = ext4_put_super, 1197 .put_super = ext4_put_super,
1178 .sync_fs = ext4_sync_fs, 1198 .sync_fs = ext4_sync_fs,
@@ -1186,7 +1206,6 @@ static const struct super_operations ext4_sops = {
1186 .quota_write = ext4_quota_write, 1206 .quota_write = ext4_quota_write,
1187#endif 1207#endif
1188 .bdev_try_to_free_page = bdev_try_to_free_page, 1208 .bdev_try_to_free_page = bdev_try_to_free_page,
1189 .trim_fs = ext4_trim_fs
1190}; 1209};
1191 1210
1192static const struct super_operations ext4_nojournal_sops = { 1211static const struct super_operations ext4_nojournal_sops = {
@@ -1194,6 +1213,7 @@ static const struct super_operations ext4_nojournal_sops = {
1194 .destroy_inode = ext4_destroy_inode, 1213 .destroy_inode = ext4_destroy_inode,
1195 .write_inode = ext4_write_inode, 1214 .write_inode = ext4_write_inode,
1196 .dirty_inode = ext4_dirty_inode, 1215 .dirty_inode = ext4_dirty_inode,
1216 .drop_inode = ext4_drop_inode,
1197 .evict_inode = ext4_evict_inode, 1217 .evict_inode = ext4_evict_inode,
1198 .write_super = ext4_write_super, 1218 .write_super = ext4_write_super,
1199 .put_super = ext4_put_super, 1219 .put_super = ext4_put_super,
@@ -1228,8 +1248,8 @@ enum {
1228 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1248 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1229 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1249 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
1230 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1250 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
1231 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1251 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1232 Opt_block_validity, Opt_noblock_validity, 1252 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1233 Opt_inode_readahead_blks, Opt_journal_ioprio, 1253 Opt_inode_readahead_blks, Opt_journal_ioprio,
1234 Opt_dioread_nolock, Opt_dioread_lock, 1254 Opt_dioread_nolock, Opt_dioread_lock,
1235 Opt_discard, Opt_nodiscard, 1255 Opt_discard, Opt_nodiscard,
@@ -1293,6 +1313,8 @@ static const match_table_t tokens = {
1293 {Opt_resize, "resize"}, 1313 {Opt_resize, "resize"},
1294 {Opt_delalloc, "delalloc"}, 1314 {Opt_delalloc, "delalloc"},
1295 {Opt_nodelalloc, "nodelalloc"}, 1315 {Opt_nodelalloc, "nodelalloc"},
1316 {Opt_mblk_io_submit, "mblk_io_submit"},
1317 {Opt_nomblk_io_submit, "nomblk_io_submit"},
1296 {Opt_block_validity, "block_validity"}, 1318 {Opt_block_validity, "block_validity"},
1297 {Opt_noblock_validity, "noblock_validity"}, 1319 {Opt_noblock_validity, "noblock_validity"},
1298 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1320 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1714,6 +1736,12 @@ set_qf_format:
1714 case Opt_nodelalloc: 1736 case Opt_nodelalloc:
1715 clear_opt(sbi->s_mount_opt, DELALLOC); 1737 clear_opt(sbi->s_mount_opt, DELALLOC);
1716 break; 1738 break;
1739 case Opt_mblk_io_submit:
1740 set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
1741 break;
1742 case Opt_nomblk_io_submit:
1743 clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
1744 break;
1717 case Opt_stripe: 1745 case Opt_stripe:
1718 if (match_int(&args[0], &option)) 1746 if (match_int(&args[0], &option))
1719 return 0; 1747 return 0;
@@ -2699,7 +2727,6 @@ static int ext4_lazyinit_thread(void *arg)
2699 struct ext4_li_request *elr; 2727 struct ext4_li_request *elr;
2700 unsigned long next_wakeup; 2728 unsigned long next_wakeup;
2701 DEFINE_WAIT(wait); 2729 DEFINE_WAIT(wait);
2702 int ret;
2703 2730
2704 BUG_ON(NULL == eli); 2731 BUG_ON(NULL == eli);
2705 2732
@@ -2723,13 +2750,12 @@ cont_thread:
2723 elr = list_entry(pos, struct ext4_li_request, 2750 elr = list_entry(pos, struct ext4_li_request,
2724 lr_request); 2751 lr_request);
2725 2752
2726 if (time_after_eq(jiffies, elr->lr_next_sched)) 2753 if (time_after_eq(jiffies, elr->lr_next_sched)) {
2727 ret = ext4_run_li_request(elr); 2754 if (ext4_run_li_request(elr) != 0) {
2728 2755 /* error, remove the lazy_init job */
2729 if (ret) { 2756 ext4_remove_li_request(elr);
2730 ret = 0; 2757 continue;
2731 ext4_remove_li_request(elr); 2758 }
2732 continue;
2733 } 2759 }
2734 2760
2735 if (time_before(elr->lr_next_sched, next_wakeup)) 2761 if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2766,8 @@ cont_thread:
2740 if (freezing(current)) 2766 if (freezing(current))
2741 refrigerator(); 2767 refrigerator();
2742 2768
2743 if (time_after_eq(jiffies, next_wakeup)) { 2769 if ((time_after_eq(jiffies, next_wakeup)) ||
2770 (MAX_JIFFY_OFFSET == next_wakeup)) {
2744 cond_resched(); 2771 cond_resched();
2745 continue; 2772 continue;
2746 } 2773 }
@@ -2788,9 +2815,6 @@ static void ext4_clear_request_list(void)
2788 struct ext4_li_request *elr; 2815 struct ext4_li_request *elr;
2789 2816
2790 mutex_lock(&ext4_li_info->li_list_mtx); 2817 mutex_lock(&ext4_li_info->li_list_mtx);
2791 if (list_empty(&ext4_li_info->li_request_list))
2792 return;
2793
2794 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { 2818 list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2795 elr = list_entry(pos, struct ext4_li_request, 2819 elr = list_entry(pos, struct ext4_li_request,
2796 lr_request); 2820 lr_request);
@@ -3257,13 +3281,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3257 * Test whether we have more sectors than will fit in sector_t, 3281 * Test whether we have more sectors than will fit in sector_t,
3258 * and whether the max offset is addressable by the page cache. 3282 * and whether the max offset is addressable by the page cache.
3259 */ 3283 */
3260 ret = generic_check_addressable(sb->s_blocksize_bits, 3284 err = generic_check_addressable(sb->s_blocksize_bits,
3261 ext4_blocks_count(es)); 3285 ext4_blocks_count(es));
3262 if (ret) { 3286 if (err) {
3263 ext4_msg(sb, KERN_ERR, "filesystem" 3287 ext4_msg(sb, KERN_ERR, "filesystem"
3264 " too large to mount safely on this system"); 3288 " too large to mount safely on this system");
3265 if (sizeof(sector_t) < 8) 3289 if (sizeof(sector_t) < 8)
3266 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3290 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3291 ret = err;
3267 goto failed_mount; 3292 goto failed_mount;
3268 } 3293 }
3269 3294
@@ -3348,6 +3373,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3348 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3373 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3349 spin_lock_init(&sbi->s_next_gen_lock); 3374 spin_lock_init(&sbi->s_next_gen_lock);
3350 3375
3376 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3377 ext4_count_free_blocks(sb));
3378 if (!err) {
3379 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3380 ext4_count_free_inodes(sb));
3381 }
3382 if (!err) {
3383 err = percpu_counter_init(&sbi->s_dirs_counter,
3384 ext4_count_dirs(sb));
3385 }
3386 if (!err) {
3387 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3388 }
3389 if (err) {
3390 ext4_msg(sb, KERN_ERR, "insufficient memory");
3391 goto failed_mount3;
3392 }
3393
3351 sbi->s_stripe = ext4_get_stripe_size(sbi); 3394 sbi->s_stripe = ext4_get_stripe_size(sbi);
3352 sbi->s_max_writeback_mb_bump = 128; 3395 sbi->s_max_writeback_mb_bump = 128;
3353 3396
@@ -3446,22 +3489,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 } 3489 }
3447 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3490 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3448 3491
3449no_journal: 3492 /*
3450 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3493 * The journal may have updated the bg summary counts, so we
3451 ext4_count_free_blocks(sb)); 3494 * need to update the global counters.
3452 if (!err) 3495 */
3453 err = percpu_counter_init(&sbi->s_freeinodes_counter, 3496 percpu_counter_set(&sbi->s_freeblocks_counter,
3454 ext4_count_free_inodes(sb)); 3497 ext4_count_free_blocks(sb));
3455 if (!err) 3498 percpu_counter_set(&sbi->s_freeinodes_counter,
3456 err = percpu_counter_init(&sbi->s_dirs_counter, 3499 ext4_count_free_inodes(sb));
3457 ext4_count_dirs(sb)); 3500 percpu_counter_set(&sbi->s_dirs_counter,
3458 if (!err) 3501 ext4_count_dirs(sb));
3459 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); 3502 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3460 if (err) {
3461 ext4_msg(sb, KERN_ERR, "insufficient memory");
3462 goto failed_mount_wq;
3463 }
3464 3503
3504no_journal:
3465 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3505 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
3466 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3506 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3467 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3507 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
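Editor's note: once the journal has been replayed, the free-block and free-inode estimates taken from the group descriptors can be stale, so the hunk above refreshes the already-initialized counters with percpu_counter_set() instead of creating them here; the same init-before-journal ordering is what lets ext4_commit_super() drop its percpu_counter_initialized() guards later in this patch. An illustrative sketch of the set-then-publish pairing, with hypothetical names:

#include <linux/kernel.h>
#include <linux/percpu_counter.h>

/* Sketch only: overwrite a live counter (keeps its per-cpu storage),
 * then read it back the way the superblock-commit path does. */
static void example_refresh_and_publish(struct percpu_counter *c, s64 fresh,
					__le32 *on_disk)
{
	percpu_counter_set(c, fresh);
	*on_disk = cpu_to_le32(percpu_counter_sum_positive(c));
}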
@@ -3611,10 +3651,6 @@ failed_mount_wq:
3611 jbd2_journal_destroy(sbi->s_journal); 3651 jbd2_journal_destroy(sbi->s_journal);
3612 sbi->s_journal = NULL; 3652 sbi->s_journal = NULL;
3613 } 3653 }
3614 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3615 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3616 percpu_counter_destroy(&sbi->s_dirs_counter);
3617 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3618failed_mount3: 3654failed_mount3:
3619 if (sbi->s_flex_groups) { 3655 if (sbi->s_flex_groups) {
3620 if (is_vmalloc_addr(sbi->s_flex_groups)) 3656 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3658,10 @@ failed_mount3:
3622 else 3658 else
3623 kfree(sbi->s_flex_groups); 3659 kfree(sbi->s_flex_groups);
3624 } 3660 }
3661 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3662 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3663 percpu_counter_destroy(&sbi->s_dirs_counter);
3664 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3625failed_mount2: 3665failed_mount2:
3626 for (i = 0; i < db_count; i++) 3666 for (i = 0; i < db_count; i++)
3627 brelse(sbi->s_group_desc[i]); 3667 brelse(sbi->s_group_desc[i]);
@@ -3949,13 +3989,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3949 else 3989 else
3950 es->s_kbytes_written = 3990 es->s_kbytes_written =
3951 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 3991 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3952 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) 3992 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3953 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3993 &EXT4_SB(sb)->s_freeblocks_counter));
3954 &EXT4_SB(sb)->s_freeblocks_counter)); 3994 es->s_free_inodes_count =
3955 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) 3995 cpu_to_le32(percpu_counter_sum_positive(
3956 es->s_free_inodes_count = 3996 &EXT4_SB(sb)->s_freeinodes_counter));
3957 cpu_to_le32(percpu_counter_sum_positive(
3958 &EXT4_SB(sb)->s_freeinodes_counter));
3959 sb->s_dirt = 0; 3997 sb->s_dirt = 0;
3960 BUFFER_TRACE(sbh, "marking dirty"); 3998 BUFFER_TRACE(sbh, "marking dirty");
3961 mark_buffer_dirty(sbh); 3999 mark_buffer_dirty(sbh);
@@ -4556,12 +4594,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4556 4594
4557static int ext4_quota_off(struct super_block *sb, int type) 4595static int ext4_quota_off(struct super_block *sb, int type)
4558{ 4596{
4559 /* Force all delayed allocation blocks to be allocated */ 4597 /* Force all delayed allocation blocks to be allocated.
4560 if (test_opt(sb, DELALLOC)) { 4598 * Caller already holds s_umount sem */
4561 down_read(&sb->s_umount); 4599 if (test_opt(sb, DELALLOC))
4562 sync_filesystem(sb); 4600 sync_filesystem(sb);
4563 up_read(&sb->s_umount);
4564 }
4565 4601
4566 return dquot_quota_off(sb, type); 4602 return dquot_quota_off(sb, type);
4567} 4603}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ad6998a92c30..206351af7c58 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -514,11 +514,18 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
514 return &ei->vfs_inode; 514 return &ei->vfs_inode;
515} 515}
516 516
517static void fat_destroy_inode(struct inode *inode) 517static void fat_i_callback(struct rcu_head *head)
518{ 518{
519 struct inode *inode = container_of(head, struct inode, i_rcu);
520 INIT_LIST_HEAD(&inode->i_dentry);
519 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); 521 kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
520} 522}
521 523
524static void fat_destroy_inode(struct inode *inode)
525{
526 call_rcu(&inode->i_rcu, fat_i_callback);
527}
528
522static void init_once(void *foo) 529static void init_once(void *foo)
523{ 530{
524 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; 531 struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
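Editor's note: with RCU path walking, dentries and the inodes behind them may be inspected without references held, so filesystems defer freeing the inode until after an RCU grace period. The hunk above converts FAT to that pattern; the same conversion appears for freevxfs and fuse further down. A minimal sketch of the shape, with hypothetical "foo" names and a hypothetical FOO_I() container helper:

#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct kmem_cache *foo_inode_cachep;	/* assumed per-fs inode cache */

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* i_rcu shares storage with i_dentry in this kernel, hence the re-init */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));	/* hypothetical helper */
}

static void foo_destroy_inode(struct inode *inode)
{
	/* rcu-walk may still be looking at this inode; free after a grace period */
	call_rcu(&inode->i_rcu, foo_i_callback);
}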
@@ -743,7 +750,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
743 */ 750 */
744 result = d_obtain_alias(inode); 751 result = d_obtain_alias(inode);
745 if (!IS_ERR(result)) 752 if (!IS_ERR(result))
746 result->d_op = sb->s_root->d_op; 753 d_set_d_op(result, sb->s_root->d_op);
747 return result; 754 return result;
748} 755}
749 756
@@ -793,7 +800,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
793 800
794 parent = d_obtain_alias(inode); 801 parent = d_obtain_alias(inode);
795 if (!IS_ERR(parent)) 802 if (!IS_ERR(parent))
796 parent->d_op = sb->s_root->d_op; 803 d_set_d_op(parent, sb->s_root->d_op);
797out: 804out:
798 unlock_super(sb); 805 unlock_super(sb);
799 806
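Editor's note: throughout this patch, direct dentry->d_op assignments become d_set_d_op() calls so the VFS can note in the dentry's flags which hooks are present and skip the indirect calls on the lockless lookup fast path. A hedged sketch of the export-path usage mirrored by the two FAT hunks above, with illustrative names:

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/err.h>

static struct dentry *foo_export_alias(struct super_block *sb,
				       struct inode *inode)
{
	struct dentry *alias = d_obtain_alias(inode);

	if (!IS_ERR(alias))
		d_set_d_op(alias, sb->s_root->d_op);	/* not alias->d_op = ... */
	return alias;
}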
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3345aabd1dd7..35ffe43afa4b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,7 +148,8 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(struct dentry *dentry, struct qstr *qstr) 151static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
152 struct qstr *qstr)
152{ 153{
153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
154 unsigned char msdos_name[MSDOS_NAME]; 155 unsigned char msdos_name[MSDOS_NAME];
@@ -164,16 +165,18 @@ static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
164 * Compare two msdos names. If either of the names are invalid, 165 * Compare two msdos names. If either of the names are invalid,
165 * we fall back to doing the standard name comparison. 166 * we fall back to doing the standard name comparison.
166 */ 167 */
167static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name)
168{ 171{
169 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
170 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; 173 unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
171 int error; 174 int error;
172 175
173 error = msdos_format_name(a->name, a->len, a_msdos_name, options); 176 error = msdos_format_name(name->name, name->len, a_msdos_name, options);
174 if (error) 177 if (error)
175 goto old_compare; 178 goto old_compare;
176 error = msdos_format_name(b->name, b->len, b_msdos_name, options); 179 error = msdos_format_name(str, len, b_msdos_name, options);
177 if (error) 180 if (error)
178 goto old_compare; 181 goto old_compare;
179 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME); 182 error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
@@ -182,8 +185,8 @@ out:
182 185
183old_compare: 186old_compare:
184 error = 1; 187 error = 1;
185 if (a->len == b->len) 188 if (name->len == len)
186 error = memcmp(a->name, b->name, a->len); 189 error = memcmp(name->name, str, len);
187 goto out; 190 goto out;
188} 191}
189 192
@@ -224,10 +227,10 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
224 } 227 }
225out: 228out:
226 unlock_super(sb); 229 unlock_super(sb);
227 dentry->d_op = &msdos_dentry_operations; 230 d_set_d_op(dentry, &msdos_dentry_operations);
228 dentry = d_splice_alias(inode, dentry); 231 dentry = d_splice_alias(inode, dentry);
229 if (dentry) 232 if (dentry)
230 dentry->d_op = &msdos_dentry_operations; 233 d_set_d_op(dentry, &msdos_dentry_operations);
231 return dentry; 234 return dentry;
232 235
233error: 236error:
@@ -670,7 +673,7 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
670 } 673 }
671 674
672 sb->s_flags |= MS_NOATIME; 675 sb->s_flags |= MS_NOATIME;
673 sb->s_root->d_op = &msdos_dentry_operations; 676 d_set_d_op(sb->s_root, &msdos_dentry_operations);
674 unlock_super(sb); 677 unlock_super(sb);
675 return 0; 678 return 0;
676} 679}
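Editor's note: the msdos hunks above show the new shape of the name operations for the lockless dcache: d_hash() gains the inode and takes const arguments, while d_compare() now receives the parent, the dentry being compared, and the candidate name as a raw (len, str) pair that must only be read within len, since it may be handed in from an rcu-walk. A minimal sketch of a filesystem wiring these up, with illustrative names:

#include <linux/dcache.h>
#include <linux/string.h>

static int foo_d_hash(const struct dentry *dentry, const struct inode *inode,
		      struct qstr *qstr)
{
	qstr->hash = full_name_hash(qstr->name, qstr->len);
	return 0;
}

static int foo_d_compare(const struct dentry *parent, const struct inode *pinode,
			 const struct dentry *dentry, const struct inode *inode,
			 unsigned int len, const char *str, const struct qstr *name)
{
	if (len != name->len)
		return 1;
	return memcmp(str, name->name, len) ? 1 : 0;	/* 0 means "same name" */
}

static const struct dentry_operations foo_dentry_ops = {
	.d_hash		= foo_d_hash,
	.d_compare	= foo_d_compare,
};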
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b936703b8924..e3ffc5e12332 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,6 +43,9 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU)
47 return -ECHILD;
48
46 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
47 if (dentry->d_inode) 50 if (dentry->d_inode)
48 return 1; 51 return 1;
@@ -51,6 +54,9 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
51 54
52static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
53{ 56{
57 if (nd->flags & LOOKUP_RCU)
58 return -ECHILD;
59
54 /* 60 /*
55 * This is not negative dentry. Always valid. 61 * This is not negative dentry. Always valid.
56 * 62 *
@@ -85,22 +91,26 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
85} 91}
86 92
87/* returns the length of a struct qstr, ignoring trailing dots */ 93/* returns the length of a struct qstr, ignoring trailing dots */
88static unsigned int vfat_striptail_len(struct qstr *qstr) 94static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
89{ 95{
90 unsigned int len = qstr->len; 96 while (len && name[len - 1] == '.')
91
92 while (len && qstr->name[len - 1] == '.')
93 len--; 97 len--;
94 return len; 98 return len;
95} 99}
96 100
101static unsigned int vfat_striptail_len(const struct qstr *qstr)
102{
103 return __vfat_striptail_len(qstr->len, qstr->name);
104}
105
97/* 106/*
98 * Compute the hash for the vfat name corresponding to the dentry. 107 * Compute the hash for the vfat name corresponding to the dentry.
99 * Note: if the name is invalid, we leave the hash code unchanged so 108 * Note: if the name is invalid, we leave the hash code unchanged so
100 * that the existing dentry can be used. The vfat fs routines will 109 * that the existing dentry can be used. The vfat fs routines will
101 * return ENOENT or EINVAL as appropriate. 110 * return ENOENT or EINVAL as appropriate.
102 */ 111 */
103static int vfat_hash(struct dentry *dentry, struct qstr *qstr) 112static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *qstr)
104{ 114{
105 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 115 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
106 return 0; 116 return 0;
@@ -112,9 +122,10 @@ static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
112 * that the existing dentry can be used. The vfat fs routines will 122 * that the existing dentry can be used. The vfat fs routines will
113 * return ENOENT or EINVAL as appropriate. 123 * return ENOENT or EINVAL as appropriate.
114 */ 124 */
115static int vfat_hashi(struct dentry *dentry, struct qstr *qstr) 125static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
126 struct qstr *qstr)
116{ 127{
117 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 128 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
118 const unsigned char *name; 129 const unsigned char *name;
119 unsigned int len; 130 unsigned int len;
120 unsigned long hash; 131 unsigned long hash;
@@ -133,16 +144,18 @@ static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
133/* 144/*
134 * Case insensitive compare of two vfat names. 145 * Case insensitive compare of two vfat names.
135 */ 146 */
136static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b) 147static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
148 const struct dentry *dentry, const struct inode *inode,
149 unsigned int len, const char *str, const struct qstr *name)
137{ 150{
138 struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io; 151 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
139 unsigned int alen, blen; 152 unsigned int alen, blen;
140 153
141 /* A filename cannot end in '.' or we treat it like it has none */ 154 /* A filename cannot end in '.' or we treat it like it has none */
142 alen = vfat_striptail_len(a); 155 alen = vfat_striptail_len(name);
143 blen = vfat_striptail_len(b); 156 blen = __vfat_striptail_len(len, str);
144 if (alen == blen) { 157 if (alen == blen) {
145 if (nls_strnicmp(t, a->name, b->name, alen) == 0) 158 if (nls_strnicmp(t, name->name, str, alen) == 0)
146 return 0; 159 return 0;
147 } 160 }
148 return 1; 161 return 1;
@@ -151,15 +164,17 @@ static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
151/* 164/*
152 * Case sensitive compare of two vfat names. 165 * Case sensitive compare of two vfat names.
153 */ 166 */
154static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b) 167static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
168 const struct dentry *dentry, const struct inode *inode,
169 unsigned int len, const char *str, const struct qstr *name)
155{ 170{
156 unsigned int alen, blen; 171 unsigned int alen, blen;
157 172
158 /* A filename cannot end in '.' or we treat it like it has none */ 173 /* A filename cannot end in '.' or we treat it like it has none */
159 alen = vfat_striptail_len(a); 174 alen = vfat_striptail_len(name);
160 blen = vfat_striptail_len(b); 175 blen = __vfat_striptail_len(len, str);
161 if (alen == blen) { 176 if (alen == blen) {
162 if (strncmp(a->name, b->name, alen) == 0) 177 if (strncmp(name->name, str, alen) == 0)
163 return 0; 178 return 0;
164 } 179 }
165 return 1; 180 return 1;
@@ -757,11 +772,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
757 772
758out: 773out:
759 unlock_super(sb); 774 unlock_super(sb);
760 dentry->d_op = sb->s_root->d_op; 775 d_set_d_op(dentry, sb->s_root->d_op);
761 dentry->d_time = dentry->d_parent->d_inode->i_version; 776 dentry->d_time = dentry->d_parent->d_inode->i_version;
762 dentry = d_splice_alias(inode, dentry); 777 dentry = d_splice_alias(inode, dentry);
763 if (dentry) { 778 if (dentry) {
764 dentry->d_op = sb->s_root->d_op; 779 d_set_d_op(dentry, sb->s_root->d_op);
765 dentry->d_time = dentry->d_parent->d_inode->i_version; 780 dentry->d_time = dentry->d_parent->d_inode->i_version;
766 } 781 }
767 return dentry; 782 return dentry;
@@ -1063,9 +1078,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
1063 } 1078 }
1064 1079
1065 if (MSDOS_SB(sb)->options.name_check != 's') 1080 if (MSDOS_SB(sb)->options.name_check != 's')
1066 sb->s_root->d_op = &vfat_ci_dentry_ops; 1081 d_set_d_op(sb->s_root, &vfat_ci_dentry_ops);
1067 else 1082 else
1068 sb->s_root->d_op = &vfat_dentry_ops; 1083 d_set_d_op(sb->s_root, &vfat_dentry_ops);
1069 1084
1070 unlock_super(sb); 1085 unlock_super(sb);
1071 return 0; 1086 return 0;
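Editor's note: vfat gains two rcu-walk adjustments above: d_revalidate backs out with -ECHILD when called with LOOKUP_RCU so the VFS retries in ref-walk mode, and the trailing-dot stripping is split into __vfat_striptail_len() so d_compare can work directly on the raw (len, str) pair it is now handed. A sketch of the bail-out pattern, which recurs in the fuse and gfs2 hunks below; names are illustrative:

#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/errno.h>

static int foo_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;		/* cannot sleep or take refs here; retry in ref-walk */

	/* ... normal, possibly blocking, revalidation work ... */
	return 1;			/* dentry is still valid */
}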
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 68ba492d8eef..751d6b255a12 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -115,6 +115,9 @@ int unregister_filesystem(struct file_system_type * fs)
115 tmp = &(*tmp)->next; 115 tmp = &(*tmp)->next;
116 } 116 }
117 write_unlock(&file_systems_lock); 117 write_unlock(&file_systems_lock);
118
119 synchronize_rcu();
120
118 return -EINVAL; 121 return -EINVAL;
119} 122}
120 123
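Editor's note: unregister_filesystem() now waits out an RCU grace period after unlinking the entry; the intent, as far as this hunk shows, is that lockless path walkers which might still be executing the type's dentry or inode code have all finished before a module-unload caller proceeds to free it. A generic sketch of the unpublish-then-wait pattern; the list handling is hypothetical and only synchronize_rcu() is the point:

#include <linux/fs.h>
#include <linux/rcupdate.h>

static void example_unpublish(struct file_system_type *fs)
{
	/* ... unlink 'fs' from the shared list under the usual lock ... */
	synchronize_rcu();	/* every pre-existing RCU reader has now finished */
	/* nothing running a lockless walk can still be inside the module's code */
}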
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 8c04eac5079d..2ba6719ac612 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,6 +337,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
337 return ip; 337 return ip;
338} 338}
339 339
340static void vxfs_i_callback(struct rcu_head *head)
341{
342 struct inode *inode = container_of(head, struct inode, i_rcu);
343 INIT_LIST_HEAD(&inode->i_dentry);
344 kmem_cache_free(vxfs_inode_cachep, inode->i_private);
345}
346
340/** 347/**
341 * vxfs_evict_inode - remove inode from main memory 348 * vxfs_evict_inode - remove inode from main memory
342 * @ip: inode to discard. 349 * @ip: inode to discard.
@@ -350,5 +357,5 @@ vxfs_evict_inode(struct inode *ip)
350{ 357{
351 truncate_inode_pages(&ip->i_data, 0); 358 truncate_inode_pages(&ip->i_data, 0);
352 end_writeback(ip); 359 end_writeback(ip);
353 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 360 call_rcu(&ip->i_rcu, vxfs_i_callback);
354} 361}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index ed45a9cf5f3d..68ca487bedb1 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -14,12 +14,14 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
14 struct path old_root; 14 struct path old_root;
15 15
16 spin_lock(&fs->lock); 16 spin_lock(&fs->lock);
17 write_seqcount_begin(&fs->seq);
17 old_root = fs->root; 18 old_root = fs->root;
18 fs->root = *path; 19 fs->root = *path;
19 path_get(path); 20 path_get_long(path);
21 write_seqcount_end(&fs->seq);
20 spin_unlock(&fs->lock); 22 spin_unlock(&fs->lock);
21 if (old_root.dentry) 23 if (old_root.dentry)
22 path_put(&old_root); 24 path_put_long(&old_root);
23} 25}
24 26
25/* 27/*
@@ -31,13 +33,15 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
31 struct path old_pwd; 33 struct path old_pwd;
32 34
33 spin_lock(&fs->lock); 35 spin_lock(&fs->lock);
36 write_seqcount_begin(&fs->seq);
34 old_pwd = fs->pwd; 37 old_pwd = fs->pwd;
35 fs->pwd = *path; 38 fs->pwd = *path;
36 path_get(path); 39 path_get_long(path);
40 write_seqcount_end(&fs->seq);
37 spin_unlock(&fs->lock); 41 spin_unlock(&fs->lock);
38 42
39 if (old_pwd.dentry) 43 if (old_pwd.dentry)
40 path_put(&old_pwd); 44 path_put_long(&old_pwd);
41} 45}
42 46
43void chroot_fs_refs(struct path *old_root, struct path *new_root) 47void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -52,31 +56,33 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
52 fs = p->fs; 56 fs = p->fs;
53 if (fs) { 57 if (fs) {
54 spin_lock(&fs->lock); 58 spin_lock(&fs->lock);
59 write_seqcount_begin(&fs->seq);
55 if (fs->root.dentry == old_root->dentry 60 if (fs->root.dentry == old_root->dentry
56 && fs->root.mnt == old_root->mnt) { 61 && fs->root.mnt == old_root->mnt) {
57 path_get(new_root); 62 path_get_long(new_root);
58 fs->root = *new_root; 63 fs->root = *new_root;
59 count++; 64 count++;
60 } 65 }
61 if (fs->pwd.dentry == old_root->dentry 66 if (fs->pwd.dentry == old_root->dentry
62 && fs->pwd.mnt == old_root->mnt) { 67 && fs->pwd.mnt == old_root->mnt) {
63 path_get(new_root); 68 path_get_long(new_root);
64 fs->pwd = *new_root; 69 fs->pwd = *new_root;
65 count++; 70 count++;
66 } 71 }
72 write_seqcount_end(&fs->seq);
67 spin_unlock(&fs->lock); 73 spin_unlock(&fs->lock);
68 } 74 }
69 task_unlock(p); 75 task_unlock(p);
70 } while_each_thread(g, p); 76 } while_each_thread(g, p);
71 read_unlock(&tasklist_lock); 77 read_unlock(&tasklist_lock);
72 while (count--) 78 while (count--)
73 path_put(old_root); 79 path_put_long(old_root);
74} 80}
75 81
76void free_fs_struct(struct fs_struct *fs) 82void free_fs_struct(struct fs_struct *fs)
77{ 83{
78 path_put(&fs->root); 84 path_put_long(&fs->root);
79 path_put(&fs->pwd); 85 path_put_long(&fs->pwd);
80 kmem_cache_free(fs_cachep, fs); 86 kmem_cache_free(fs_cachep, fs);
81} 87}
82 88
@@ -88,8 +94,10 @@ void exit_fs(struct task_struct *tsk)
88 int kill; 94 int kill;
89 task_lock(tsk); 95 task_lock(tsk);
90 spin_lock(&fs->lock); 96 spin_lock(&fs->lock);
97 write_seqcount_begin(&fs->seq);
91 tsk->fs = NULL; 98 tsk->fs = NULL;
92 kill = !--fs->users; 99 kill = !--fs->users;
100 write_seqcount_end(&fs->seq);
93 spin_unlock(&fs->lock); 101 spin_unlock(&fs->lock);
94 task_unlock(tsk); 102 task_unlock(tsk);
95 if (kill) 103 if (kill)
@@ -105,8 +113,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
105 fs->users = 1; 113 fs->users = 1;
106 fs->in_exec = 0; 114 fs->in_exec = 0;
107 spin_lock_init(&fs->lock); 115 spin_lock_init(&fs->lock);
116 seqcount_init(&fs->seq);
108 fs->umask = old->umask; 117 fs->umask = old->umask;
109 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 118
119 spin_lock(&old->lock);
120 fs->root = old->root;
121 path_get_long(&fs->root);
122 fs->pwd = old->pwd;
123 path_get_long(&fs->pwd);
124 spin_unlock(&old->lock);
110 } 125 }
111 return fs; 126 return fs;
112} 127}
@@ -144,6 +159,7 @@ EXPORT_SYMBOL(current_umask);
144struct fs_struct init_fs = { 159struct fs_struct init_fs = {
145 .users = 1, 160 .users = 1,
146 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), 161 .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
162 .seq = SEQCNT_ZERO,
147 .umask = 0022, 163 .umask = 0022,
148}; 164};
149 165
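Editor's note: fs_struct grows a seqcount next to its spinlock; writers hold fs->lock and bracket root/pwd updates with write_seqcount_begin/end, so an rcu-walk reader can snapshot root and pwd with only a retry loop and no reference counts (the real consumers live in fs/namei.c's rcu-walk setup). A minimal reader sketch under that assumption, using the seq field added by this patch:

#include <linux/fs_struct.h>
#include <linux/seqlock.h>
#include <linux/path.h>

/* Lockless snapshot of a task's root; valid only if the seq check passes,
 * and the caller must still pin the path before using it outside RCU. */
static struct path example_get_root(struct fs_struct *fs)
{
	struct path root;
	unsigned seq;

	do {
		seq = read_seqcount_begin(&fs->seq);
		root = fs->root;		/* no reference taken */
	} while (read_seqcount_retry(&fs->seq, seq));

	return root;
}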
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c9627c95482d..f738599fd8cd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -156,8 +156,12 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
156 */ 156 */
157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 157static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode = entry->d_inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU)
162 return -ECHILD;
163
164 inode = entry->d_inode;
161 if (inode && is_bad_inode(inode)) 165 if (inode && is_bad_inode(inode))
162 return 0; 166 return 0;
163 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 167 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -347,7 +351,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
347 } 351 }
348 352
349 entry = newent ? newent : entry; 353 entry = newent ? newent : entry;
350 entry->d_op = &fuse_dentry_operations; 354 d_set_d_op(entry, &fuse_dentry_operations);
351 if (outarg_valid) 355 if (outarg_valid)
352 fuse_change_entry_timeout(entry, &outarg); 356 fuse_change_entry_timeout(entry, &outarg);
353 else 357 else
@@ -981,12 +985,15 @@ static int fuse_access(struct inode *inode, int mask)
981 * access request is sent. Execute permission is still checked 985 * access request is sent. Execute permission is still checked
982 * locally based on file mode. 986 * locally based on file mode.
983 */ 987 */
984static int fuse_permission(struct inode *inode, int mask) 988static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
985{ 989{
986 struct fuse_conn *fc = get_fuse_conn(inode); 990 struct fuse_conn *fc = get_fuse_conn(inode);
987 bool refreshed = false; 991 bool refreshed = false;
988 int err = 0; 992 int err = 0;
989 993
994 if (flags & IPERM_FLAG_RCU)
995 return -ECHILD;
996
990 if (!fuse_allow_task(fc, current)) 997 if (!fuse_allow_task(fc, current))
991 return -EACCES; 998 return -EACCES;
992 999
@@ -1001,7 +1008,7 @@ static int fuse_permission(struct inode *inode, int mask)
1001 } 1008 }
1002 1009
1003 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1010 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
1004 err = generic_permission(inode, mask, NULL); 1011 err = generic_permission(inode, mask, flags, NULL);
1005 1012
1006 /* If permission is denied, try to refresh file 1013 /* If permission is denied, try to refresh file
1007 attributes. This is also needed, because the root 1014 attributes. This is also needed, because the root
@@ -1009,7 +1016,8 @@ static int fuse_permission(struct inode *inode, int mask)
1009 if (err == -EACCES && !refreshed) { 1016 if (err == -EACCES && !refreshed) {
1010 err = fuse_do_getattr(inode, NULL, NULL); 1017 err = fuse_do_getattr(inode, NULL, NULL);
1011 if (!err) 1018 if (!err)
1012 err = generic_permission(inode, mask, NULL); 1019 err = generic_permission(inode, mask,
1020 flags, NULL);
1013 } 1021 }
1014 1022
1015 /* Note: the opposite of the above test does not 1023 /* Note: the opposite of the above test does not
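Editor's note: the fuse/dir.c hunks above show the new calling conventions end to end: d_revalidate refuses rcu-walk with -ECHILD, and ->permission() now takes a flags word, returning -ECHILD under IPERM_FLAG_RCU because fuse may need to talk to userspace; generic_permission() gains the same flags argument so its ACL callback can make the same call. A sketch of a minimal ->permission() for the new API, with an illustrative name:

#include <linux/fs.h>
#include <linux/errno.h>

static int foo_permission(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;		/* rcu-walk: no blocking allowed, retry in ref-walk */

	/* generic mode-bit check; NULL means "no ACL callback" */
	return generic_permission(inode, mask, flags, NULL);
}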
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c8224587123f..8b984a2cebbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h>
16 17
17static const struct file_operations fuse_direct_io_file_operations; 18static const struct file_operations fuse_direct_io_file_operations;
18 19
@@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open);
134void fuse_finish_open(struct inode *inode, struct file *file) 135void fuse_finish_open(struct inode *inode, struct file *file)
135{ 136{
136 struct fuse_file *ff = file->private_data; 137 struct fuse_file *ff = file->private_data;
138 struct fuse_conn *fc = get_fuse_conn(inode);
137 139
138 if (ff->open_flags & FOPEN_DIRECT_IO) 140 if (ff->open_flags & FOPEN_DIRECT_IO)
139 file->f_op = &fuse_direct_io_file_operations; 141 file->f_op = &fuse_direct_io_file_operations;
@@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file)
141 invalidate_inode_pages2(inode->i_mapping); 143 invalidate_inode_pages2(inode->i_mapping);
142 if (ff->open_flags & FOPEN_NONSEEKABLE) 144 if (ff->open_flags & FOPEN_NONSEEKABLE)
143 nonseekable_open(inode, file); 145 nonseekable_open(inode, file);
146 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147 struct fuse_inode *fi = get_fuse_inode(inode);
148
149 spin_lock(&fc->lock);
150 fi->attr_version = ++fc->attr_version;
151 i_size_write(inode, 0);
152 spin_unlock(&fc->lock);
153 fuse_invalidate_attr(inode);
154 }
144} 155}
145 156
146int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 157int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
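Editor's note: when the server advertises atomic_o_trunc, the open itself truncates the file server-side, so fuse_finish_open() now mirrors that locally, bumping the attribute version and zeroing i_size under fc->lock before invalidating cached attributes. A small sketch of the i_size_write()/i_size_read() pairing this relies on; the lock parameter stands in for fc->lock and the names are illustrative:

#include <linux/fs.h>
#include <linux/spinlock.h>

/* Writer side: i_size_write() must be serialized against other writers. */
static void example_truncate_size(struct inode *inode, spinlock_t *lock)
{
	spin_lock(lock);
	i_size_write(inode, 0);
	spin_unlock(lock);
}

/* Reader side: lockless, never sees a torn 64-bit size. */
static loff_t example_read_size(struct inode *inode)
{
	return i_size_read(inode);
}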
@@ -1618,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1618} 1629}
1619 1630
1620/* 1631/*
1632 * CUSE servers compiled on 32bit broke on 64bit kernels because the
1633 * ABI was defined to be 'struct iovec' which is different on 32bit
1634 * and 64bit. Fortunately we can determine which structure the server
1635 * used from the size of the reply.
1636 */
1637static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
1638 size_t transferred, unsigned count,
1639 bool is_compat)
1640{
1641#ifdef CONFIG_COMPAT
1642 if (count * sizeof(struct compat_iovec) == transferred) {
1643 struct compat_iovec *ciov = src;
1644 unsigned i;
1645
1646 /*
1647 * With this interface a 32bit server cannot support
1648 * non-compat (i.e. ones coming from 64bit apps) ioctl
1649 * requests
1650 */
1651 if (!is_compat)
1652 return -EINVAL;
1653
1654 for (i = 0; i < count; i++) {
1655 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1656 dst[i].iov_len = ciov[i].iov_len;
1657 }
1658 return 0;
1659 }
1660#endif
1661
1662 if (count * sizeof(struct iovec) != transferred)
1663 return -EIO;
1664
1665 memcpy(dst, src, transferred);
1666 return 0;
1667}
1668
1669/* Make sure iov_length() won't overflow */
1670static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1671{
1672 size_t n;
1673 u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1674
1675 for (n = 0; n < count; n++) {
1676 if (iov->iov_len > (size_t) max)
1677 return -ENOMEM;
1678 max -= iov->iov_len;
1679 }
1680 return 0;
1681}
1682
1683/*
1621 * For ioctls, there is no generic way to determine how much memory 1684 * For ioctls, there is no generic way to determine how much memory
1622 * needs to be read and/or written. Furthermore, ioctls are allowed 1685 * needs to be read and/or written. Furthermore, ioctls are allowed
1623 * to dereference the passed pointer, so the parameter requires deep 1686 * to dereference the passed pointer, so the parameter requires deep
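Editor's note: the helpers added above disambiguate the iovec layout purely by size: a 32-bit CUSE/FUSE server replies with compat_iovec entries, a native one with struct iovec, and count times the size of the right type must equal the bytes transferred; fuse_verify_ioctl_iov() then bounds the total length to FUSE_MAX_PAGES_PER_REQ pages so iov_length() cannot overflow. A hedged sketch of the size-discrimination idea only, with an illustrative name:

#include <linux/types.h>
#include <linux/compat.h>
#include <linux/uio.h>

/* Sketch: decide which ABI the server used from the reply size alone. */
static bool reply_is_compat_iovec(size_t transferred, unsigned count)
{
#ifdef CONFIG_COMPAT
	if (count * sizeof(struct compat_iovec) == transferred)
		return true;	/* 32-bit server: smaller, 32-bit-field entries */
#endif
	return false;		/* otherwise expect count * sizeof(struct iovec) */
}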
@@ -1798,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1798 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 1861 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1799 goto out; 1862 goto out;
1800 1863
1801 err = -EIO;
1802 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1803 goto out;
1804
1805 /* okay, copy in iovs and retry */
1806 vaddr = kmap_atomic(pages[0], KM_USER0); 1864 vaddr = kmap_atomic(pages[0], KM_USER0);
1807 memcpy(page_address(iov_page), vaddr, transferred); 1865 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
1866 transferred, in_iovs + out_iovs,
1867 (flags & FUSE_IOCTL_COMPAT) != 0);
1808 kunmap_atomic(vaddr, KM_USER0); 1868 kunmap_atomic(vaddr, KM_USER0);
1869 if (err)
1870 goto out;
1809 1871
1810 in_iov = page_address(iov_page); 1872 in_iov = page_address(iov_page);
1811 out_iov = in_iov + in_iovs; 1873 out_iov = in_iov + in_iovs;
1812 1874
1875 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1876 if (err)
1877 goto out;
1878
1879 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1880 if (err)
1881 goto out;
1882
1813 goto retry; 1883 goto retry;
1814 } 1884 }
1815 1885
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index cfce3ad86a92..a8b31da19b93 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -99,6 +99,13 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
99 return inode; 99 return inode;
100} 100}
101 101
102static void fuse_i_callback(struct rcu_head *head)
103{
104 struct inode *inode = container_of(head, struct inode, i_rcu);
105 INIT_LIST_HEAD(&inode->i_dentry);
106 kmem_cache_free(fuse_inode_cachep, inode);
107}
108
102static void fuse_destroy_inode(struct inode *inode) 109static void fuse_destroy_inode(struct inode *inode)
103{ 110{
104 struct fuse_inode *fi = get_fuse_inode(inode); 111 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -106,7 +113,7 @@ static void fuse_destroy_inode(struct inode *inode)
106 BUG_ON(!list_empty(&fi->queued_writes)); 113 BUG_ON(!list_empty(&fi->queued_writes));
107 if (fi->forget_req) 114 if (fi->forget_req)
108 fuse_request_free(fi->forget_req); 115 fuse_request_free(fi->forget_req);
109 kmem_cache_free(fuse_inode_cachep, inode); 116 call_rcu(&inode->i_rcu, fuse_i_callback);
110} 117}
111 118
112void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 119void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
@@ -619,7 +626,7 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
619 626
620 entry = d_obtain_alias(inode); 627 entry = d_obtain_alias(inode);
621 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { 628 if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
622 entry->d_op = &fuse_dentry_operations; 629 d_set_d_op(entry, &fuse_dentry_operations);
623 fuse_invalidate_entry_cache(entry); 630 fuse_invalidate_entry_cache(entry);
624 } 631 }
625 632
@@ -721,7 +728,7 @@ static struct dentry *fuse_get_parent(struct dentry *child)
721 728
722 parent = d_obtain_alias(inode); 729 parent = d_obtain_alias(inode);
723 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { 730 if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
724 parent->d_op = &fuse_dentry_operations; 731 d_set_d_op(parent, &fuse_dentry_operations);
725 fuse_invalidate_entry_cache(parent); 732 fuse_invalidate_entry_cache(parent);
726 } 733 }
727 734
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 6bc9e3a5a693..06c48a891832 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -190,14 +190,20 @@ generic_acl_chmod(struct inode *inode)
190} 190}
191 191
192int 192int
193generic_check_acl(struct inode *inode, int mask) 193generic_check_acl(struct inode *inode, int mask, unsigned int flags)
194{ 194{
195 struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 195 if (flags & IPERM_FLAG_RCU) {
196 196 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
197 if (acl) { 197 return -ECHILD;
198 int error = posix_acl_permission(inode, acl, mask); 198 } else {
199 posix_acl_release(acl); 199 struct posix_acl *acl;
200 return error; 200
201 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
202 if (acl) {
203 int error = posix_acl_permission(inode, acl, mask);
204 posix_acl_release(acl);
205 return error;
206 }
201 } 207 }
202 return -EAGAIN; 208 return -EAGAIN;
203} 209}
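Editor's note: in rcu-walk mode the ACL cache may only be consulted, never filled or released with sleeping calls, so the patched generic_check_acl() proceeds only when the cache already records "no ACL" and otherwise returns -ECHILD to force a ref-walk retry. The hunk above, restated as a sketch with comments, using the same functions it calls:

static int example_check_acl(struct inode *inode, int mask, unsigned int flags)
{
	if (flags & IPERM_FLAG_RCU) {
		/* rcu-walk: only a cached "no ACL at all" lets us continue */
		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
			return -ECHILD;	/* unknown or present: drop to ref-walk */
	} else {
		struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

		if (acl) {
			int error = posix_acl_permission(inode, acl, mask);
			posix_acl_release(acl);
			return error;
		}
	}
	return -EAGAIN;		/* no ACL: fall back to ordinary mode bits */
}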
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 48171f4c943d..7118f1a780a9 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -75,11 +75,14 @@ static struct posix_acl *gfs2_acl_get(struct gfs2_inode *ip, int type)
75 * Returns: errno 75 * Returns: errno
76 */ 76 */
77 77
78int gfs2_check_acl(struct inode *inode, int mask) 78int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
79{ 79{
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU)
84 return -ECHILD;
85
83 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
84 if (IS_ERR(acl)) 87 if (IS_ERR(acl))
85 return PTR_ERR(acl); 88 return PTR_ERR(acl);
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index b522b0cb39ea..a93907c8159b 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@
16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" 16#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5476c066d4ee..3c4039d5eef1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
763 int metadata; 763 int metadata;
764 unsigned int revokes = 0; 764 unsigned int revokes = 0;
765 int x; 765 int x;
766 int error; 766 int error = 0;
767 767
768 if (!*top) 768 if (!*top)
769 sm->sm_first = 0; 769 sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
780 if (metadata) 780 if (metadata)
781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 781 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
782 782
783 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); 783 if (ip != GFS2_I(sdp->sd_rindex))
784 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
785 else if (!sdp->sd_rgrps)
786 error = gfs2_ri_update(ip);
787
784 if (error) 788 if (error)
785 return error; 789 return error;
786 790
@@ -879,7 +883,8 @@ out_rg_gunlock:
879out_rlist: 883out_rlist:
880 gfs2_rlist_free(&rlist); 884 gfs2_rlist_free(&rlist);
881out: 885out:
882 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); 886 if (ip != GFS2_I(sdp->sd_rindex))
887 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
883 return error; 888 return error;
884} 889}
885 890
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 6798755b3858..4a456338b873 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -11,6 +11,7 @@
11#include <linux/completion.h> 11#include <linux/completion.h>
12#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
13#include <linux/gfs2_ondisk.h> 13#include <linux/gfs2_ondisk.h>
14#include <linux/namei.h>
14#include <linux/crc32.h> 15#include <linux/crc32.h>
15 16
16#include "gfs2.h" 17#include "gfs2.h"
@@ -34,15 +35,23 @@
34 35
35static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) 36static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
36{ 37{
37 struct dentry *parent = dget_parent(dentry); 38 struct dentry *parent;
38 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); 39 struct gfs2_sbd *sdp;
39 struct gfs2_inode *dip = GFS2_I(parent->d_inode); 40 struct gfs2_inode *dip;
40 struct inode *inode = dentry->d_inode; 41 struct inode *inode;
41 struct gfs2_holder d_gh; 42 struct gfs2_holder d_gh;
42 struct gfs2_inode *ip = NULL; 43 struct gfs2_inode *ip = NULL;
43 int error; 44 int error;
44 int had_lock = 0; 45 int had_lock = 0;
45 46
47 if (nd->flags & LOOKUP_RCU)
48 return -ECHILD;
49
50 parent = dget_parent(dentry);
51 sdp = GFS2_SB(parent->d_inode);
52 dip = GFS2_I(parent->d_inode);
53 inode = dentry->d_inode;
54
46 if (inode) { 55 if (inode) {
47 if (is_bad_inode(inode)) 56 if (is_bad_inode(inode))
48 goto invalid; 57 goto invalid;
@@ -100,13 +109,14 @@ fail:
100 return 0; 109 return 0;
101} 110}
102 111
103static int gfs2_dhash(struct dentry *dentry, struct qstr *str) 112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
113 struct qstr *str)
104{ 114{
105 str->hash = gfs2_disk_hash(str->name, str->len); 115 str->hash = gfs2_disk_hash(str->name, str->len);
106 return 0; 116 return 0;
107} 117}
108 118
109static int gfs2_dentry_delete(struct dentry *dentry) 119static int gfs2_dentry_delete(const struct dentry *dentry)
110{ 120{
111 struct gfs2_inode *ginode; 121 struct gfs2_inode *ginode;
112 122
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 06d582732d34..97012ecff560 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -130,7 +130,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
130 130
131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
132 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
133 dentry->d_op = &gfs2_dops; 133 d_set_d_op(dentry, &gfs2_dops);
134 return dentry; 134 return dentry;
135} 135}
136 136
@@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
138 struct gfs2_inum_host *inum) 138 struct gfs2_inum_host *inum)
139{ 139{
140 struct gfs2_sbd *sdp = sb->s_fs_info; 140 struct gfs2_sbd *sdp = sb->s_fs_info;
141 struct gfs2_holder i_gh;
142 struct inode *inode; 141 struct inode *inode;
143 struct dentry *dentry; 142 struct dentry *dentry;
144 int error;
145 143
146 inode = gfs2_ilookup(sb, inum->no_addr); 144 inode = gfs2_ilookup(sb, inum->no_addr);
147 if (inode) { 145 if (inode) {
@@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
152 goto out_inode; 150 goto out_inode;
153 } 151 }
154 152
155 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, 153 inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino,
156 LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 154 GFS2_BLKST_DINODE);
157 if (error) 155 if (IS_ERR(inode))
158 return ERR_PTR(error); 156 return ERR_CAST(inode);
159
160 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
161 if (error)
162 goto fail;
163
164 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
165 if (IS_ERR(inode)) {
166 error = PTR_ERR(inode);
167 goto fail;
168 }
169
170 error = gfs2_inode_refresh(GFS2_I(inode));
171 if (error) {
172 iput(inode);
173 goto fail;
174 }
175
176 /* Pick up the works we bypass in gfs2_inode_lookup */
177 if (inode->i_state & I_NEW)
178 gfs2_set_iop(inode);
179
180 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
181 iput(inode);
182 goto fail;
183 }
184
185 error = -EIO;
186 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
187 iput(inode);
188 goto fail;
189 }
190
191 gfs2_glock_dq_uninit(&i_gh);
192 157
193out_inode: 158out_inode:
194 dentry = d_obtain_alias(inode); 159 dentry = d_obtain_alias(inode);
195 if (!IS_ERR(dentry)) 160 if (!IS_ERR(dentry))
196 dentry->d_op = &gfs2_dops; 161 d_set_d_op(dentry, &gfs2_dops);
197 return dentry; 162 return dentry;
198fail:
199 gfs2_glock_dq_uninit(&i_gh);
200 return ERR_PTR(error);
201} 163}
202 164
203static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, 165static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index aa996471ec5c..fca6689e12e6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -241,7 +241,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
241 !capable(CAP_LINUX_IMMUTABLE)) 241 !capable(CAP_LINUX_IMMUTABLE))
242 goto out; 242 goto out;
243 if (!IS_IMMUTABLE(inode)) { 243 if (!IS_IMMUTABLE(inode)) {
244 error = gfs2_permission(inode, MAY_WRITE); 244 error = gfs2_permission(inode, MAY_WRITE, 0);
245 if (error) 245 if (error)
246 goto out; 246 goto out;
247 } 247 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 87778857f099..08a8beb152e6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
541 spin_unlock(&gl->gl_spin); 541 spin_unlock(&gl->gl_spin);
542} 542}
543 543
544static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
545 unsigned int req_state,
546 unsigned int flags)
547{
548 int ret = LM_OUT_ERROR;
549
550 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
551 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
552
553 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
554 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
555 req_state, flags);
556 return ret;
557}
558
559/** 544/**
560 * do_xmote - Calls the DLM to change the state of a lock 545 * do_xmote - Calls the DLM to change the state of a lock
561 * @gl: The lock state 546 * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
575 560
576 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | 561 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
577 LM_FLAG_PRIORITY); 562 LM_FLAG_PRIORITY);
578 BUG_ON(gl->gl_state == target); 563 GLOCK_BUG_ON(gl, gl->gl_state == target);
579 BUG_ON(gl->gl_state == gl->gl_target); 564 GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
580 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && 565 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
581 glops->go_inval) { 566 glops->go_inval) {
582 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); 567 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
583 do_error(gl, 0); /* Fail queued try locks */ 568 do_error(gl, 0); /* Fail queued try locks */
584 } 569 }
570 gl->gl_req = target;
585 spin_unlock(&gl->gl_spin); 571 spin_unlock(&gl->gl_spin);
586 if (glops->go_xmote_th) 572 if (glops->go_xmote_th)
587 glops->go_xmote_th(gl); 573 glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
594 gl->gl_state == LM_ST_DEFERRED) && 580 gl->gl_state == LM_ST_DEFERRED) &&
595 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 581 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
596 lck_flags |= LM_FLAG_TRY_1CB; 582 lck_flags |= LM_FLAG_TRY_1CB;
597 ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
598 583
599 if (!(ret & LM_OUT_ASYNC)) { 584 if (sdp->sd_lockstruct.ls_ops->lm_lock) {
600 finish_xmote(gl, ret); 585 /* lock_dlm */
586 ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
587 GLOCK_BUG_ON(gl, ret);
588 } else { /* lock_nolock */
589 finish_xmote(gl, target);
601 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 590 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
602 gfs2_glock_put(gl); 591 gfs2_glock_put(gl);
603 } else {
604 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
605 } 592 }
593
606 spin_lock(&gl->gl_spin); 594 spin_lock(&gl->gl_spin);
607} 595}
608 596
@@ -686,21 +674,20 @@ static void delete_work_func(struct work_struct *work)
686{ 674{
687 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); 675 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
688 struct gfs2_sbd *sdp = gl->gl_sbd; 676 struct gfs2_sbd *sdp = gl->gl_sbd;
689 struct gfs2_inode *ip = NULL; 677 struct gfs2_inode *ip;
690 struct inode *inode; 678 struct inode *inode;
691 u64 no_addr = 0; 679 u64 no_addr = gl->gl_name.ln_number;
680
681 ip = gl->gl_object;
682 /* Note: Unsafe to dereference ip as we don't hold right refs/locks */
692 683
693 spin_lock(&gl->gl_spin);
694 ip = (struct gfs2_inode *)gl->gl_object;
695 if (ip) 684 if (ip)
696 no_addr = ip->i_no_addr;
697 spin_unlock(&gl->gl_spin);
698 if (ip) {
699 inode = gfs2_ilookup(sdp->sd_vfs, no_addr); 685 inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
700 if (inode) { 686 else
701 d_prune_aliases(inode); 687 inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
702 iput(inode); 688 if (inode && !IS_ERR(inode)) {
703 } 689 d_prune_aliases(inode);
690 iput(inode);
704 } 691 }
705 gfs2_glock_put(gl); 692 gfs2_glock_put(gl);
706} 693}
@@ -952,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
952 939
953void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) 940void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
954{ 941{
942 struct va_format vaf;
955 va_list args; 943 va_list args;
956 944
957 va_start(args, fmt); 945 va_start(args, fmt);
946
958 if (seq) { 947 if (seq) {
959 struct gfs2_glock_iter *gi = seq->private; 948 struct gfs2_glock_iter *gi = seq->private;
960 vsprintf(gi->string, fmt, args); 949 vsprintf(gi->string, fmt, args);
961 seq_printf(seq, gi->string); 950 seq_printf(seq, gi->string);
962 } else { 951 } else {
963 printk(KERN_ERR " "); 952 vaf.fmt = fmt;
964 vprintk(fmt, args); 953 vaf.va = &args;
954
955 printk(KERN_ERR " %pV", &vaf);
965 } 956 }
957
966 va_end(args); 958 va_end(args);
967} 959}
968 960
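Editor's note: gfs2_print_dbg() now forwards its varargs to a single printk via the %pV extension and a struct va_format instead of printing a bare prefix followed by vprintk(), which keeps the log level and message on one printk call; glock.h also adds a printf format attribute so callers get compile-time format checking. A standalone sketch of the %pV idiom, with an illustrative name:

#include <linux/kernel.h>

static void example_log(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_ERR "example: %pV", &vaf);	/* one printk, prefix preserved */
	va_end(args);
}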
@@ -1362,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
1362 * @gl: Pointer to the glock 1354 * @gl: Pointer to the glock
1363 * @ret: The return value from the dlm 1355 * @ret: The return value from the dlm
1364 * 1356 *
1357 * The gl_reply field is under the gl_spin lock so that it is ok
1358 * to use a bitfield shared with other glock state fields.
1365 */ 1359 */
1366 1360
1367void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1361void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1368{ 1362{
1369 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1363 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1370 1364
1365 spin_lock(&gl->gl_spin);
1371 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1372 1367
1373 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1374 spin_lock(&gl->gl_spin);
1375 if (gfs2_should_freeze(gl)) { 1369 if (gfs2_should_freeze(gl)) {
1376 set_bit(GLF_FROZEN, &gl->gl_flags); 1370 set_bit(GLF_FROZEN, &gl->gl_flags);
1377 spin_unlock(&gl->gl_spin); 1371 spin_unlock(&gl->gl_spin);
1378 return; 1372 return;
1379 } 1373 }
1380 spin_unlock(&gl->gl_spin);
1381 } 1374 }
1375
1376 spin_unlock(&gl->gl_spin);
1382 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1378 smp_wmb();
1383 gfs2_glock_hold(gl); 1379 gfs2_glock_hold(gl);
1384 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1380 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1385 gfs2_glock_put(gl); 1381 gfs2_glock_put(gl);
@@ -1627,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1627static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1623static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1628{ 1624{
1629 struct task_struct *gh_owner = NULL; 1625 struct task_struct *gh_owner = NULL;
1630 char buffer[KSYM_SYMBOL_LEN];
1631 char flags_buf[32]; 1626 char flags_buf[32];
1632 1627
1633 sprint_symbol(buffer, gh->gh_ip);
1634 if (gh->gh_owner_pid) 1628 if (gh->gh_owner_pid)
1635 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1629 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1636 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", 1630 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
1637 state2str(gh->gh_state), 1631 state2str(gh->gh_state),
1638 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), 1632 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1639 gh->gh_error, 1633 gh->gh_error,
1640 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1634 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1641 gh_owner ? gh_owner->comm : "(ended)", buffer); 1635 gh_owner ? gh_owner->comm : "(ended)",
1636 (void *)gh->gh_ip);
1642 return 0; 1637 return 0;
1643} 1638}
1644 1639
@@ -1783,12 +1778,13 @@ int __init gfs2_glock_init(void)
1783 } 1778 }
1784#endif 1779#endif
1785 1780
1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | 1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1788 if (IS_ERR(glock_workqueue)) 1783 if (IS_ERR(glock_workqueue))
1789 return PTR_ERR(glock_workqueue); 1784 return PTR_ERR(glock_workqueue);
1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | 1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1791 WQ_FREEZEABLE, 0); 1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE,
1787 0);
1792 if (IS_ERR(gfs2_delete_workqueue)) { 1788 if (IS_ERR(gfs2_delete_workqueue)) {
1793 destroy_workqueue(glock_workqueue); 1789 destroy_workqueue(glock_workqueue);
1794 return PTR_ERR(gfs2_delete_workqueue); 1790 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d6d220..691851ceb615 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
87#define GL_ASYNC 0x00000040 87#define GL_ASYNC 0x00000040
88#define GL_EXACT 0x00000080 88#define GL_EXACT 0x00000080
89#define GL_SKIP 0x00000100 89#define GL_SKIP 0x00000100
90#define GL_ATIME 0x00000200
91#define GL_NOCACHE 0x00000400 90#define GL_NOCACHE 0x00000400
92 91
93/* 92/*
94 * lm_lock() and lm_async_cb return flags 93 * lm_async_cb return flags
95 * 94 *
96 * LM_OUT_ST_MASK 95 * LM_OUT_ST_MASK
97 * Masks the lower two bits of lock state in the returned value. 96 * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
99 * LM_OUT_CANCELED 98 * LM_OUT_CANCELED
100 * The lock request was canceled. 99 * The lock request was canceled.
101 * 100 *
102 * LM_OUT_ASYNC
103 * The result of the request will be returned in an LM_CB_ASYNC callback.
104 *
105 */ 101 */
106 102
107#define LM_OUT_ST_MASK 0x00000003 103#define LM_OUT_ST_MASK 0x00000003
108#define LM_OUT_CANCELED 0x00000008 104#define LM_OUT_CANCELED 0x00000008
109#define LM_OUT_ASYNC 0x00000080 105#define LM_OUT_ERROR 0x00000004
110#define LM_OUT_ERROR 0x00000100
111 106
112/* 107/*
113 * lm_recovery_done() messages 108 * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
128 unsigned int req_state, unsigned int flags); 123 unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
130 const match_table_t *lm_tokens; 125 const match_table_t *lm_tokens;
131}; 126};
132 127
133#define LM_FLAG_TRY 0x00000001
134#define LM_FLAG_TRY_1CB 0x00000002
135#define LM_FLAG_NOEXP 0x00000004
136#define LM_FLAG_ANY 0x00000008
137#define LM_FLAG_PRIORITY 0x00000010
138
139#define GL_ASYNC 0x00000040
140#define GL_EXACT 0x00000080
141#define GL_SKIP 0x00000100
142#define GL_NOCACHE 0x00000400
143
144#define GLR_TRYFAILED 13
145
146extern struct workqueue_struct *gfs2_delete_workqueue; 128extern struct workqueue_struct *gfs2_delete_workqueue;
147static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 129static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
148{ 130{
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
212int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 194int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
213void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 195void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
214void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 196void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
197
198__attribute__ ((format(printf, 2, 3)))
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 199void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 200
217/** 201/**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0d149dcc04e5..263561bf1a50 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
325 325
326 if (gl->gl_state != LM_ST_UNLOCKED && 326 if (gl->gl_state != LM_ST_UNLOCKED &&
327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 327 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
328 flush_workqueue(gfs2_delete_workqueue);
329 gfs2_meta_syncfs(sdp); 328 gfs2_meta_syncfs(sdp);
330 gfs2_log_shutdown(sdp); 329 gfs2_log_shutdown(sdp);
331 } 330 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 764fbb49efc8..8d3d2b4a0a7d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
207 207
208 spinlock_t gl_spin; 208 spinlock_t gl_spin;
209 209
210 unsigned int gl_state; 210 /* State fields protected by gl_spin */
211 unsigned int gl_target; 211 unsigned int gl_state:2, /* Current state */
212 unsigned int gl_reply; 212 gl_target:2, /* Target state */
213 gl_demote_state:2, /* State requested by remote node */
214 gl_req:2, /* State in last dlm request */
215 gl_reply:8; /* Last reply from the dlm */
216
213 unsigned int gl_hash; 217 unsigned int gl_hash;
214 unsigned int gl_req;
215 unsigned int gl_demote_state; /* state requested by remote node */
216 unsigned long gl_demote_time; /* time of first demote request */ 218 unsigned long gl_demote_time; /* time of first demote request */
217 struct list_head gl_holders; 219 struct list_head gl_holders;
218 220
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06370f8bd8cf..2232b3c780bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 73 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
74} 74}
75 75
76struct gfs2_skip_data {
77 u64 no_addr;
78 int skipped;
79};
80
81static int iget_skip_test(struct inode *inode, void *opaque)
82{
83 struct gfs2_inode *ip = GFS2_I(inode);
84 struct gfs2_skip_data *data = opaque;
85
86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1;
89 return 0;
90 }
91 return 1;
92 }
93 return 0;
94}
95
96static int iget_skip_set(struct inode *inode, void *opaque)
97{
98 struct gfs2_inode *ip = GFS2_I(inode);
99 struct gfs2_skip_data *data = opaque;
100
101 if (data->skipped)
102 return 1;
103 inode->i_ino = (unsigned long)(data->no_addr);
104 ip->i_no_addr = data->no_addr;
105 return 0;
106}
107
108static struct inode *gfs2_iget_skip(struct super_block *sb,
109 u64 no_addr)
110{
111 struct gfs2_skip_data data;
112 unsigned long hash = (unsigned long)no_addr;
113
114 data.no_addr = no_addr;
115 data.skipped = 0;
116 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
117}
118
119/** 76/**
120 * GFS2 lookup code fills in vfs inode contents based on info obtained 77 * GFS2 lookup code fills in vfs inode contents based on info obtained
121 * from directory entry inside gfs2_inode_lookup(). This has caused issues 78 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -243,93 +200,54 @@ fail:
243 return ERR_PTR(error); 200 return ERR_PTR(error);
244} 201}
245 202
246/** 203struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation 204 u64 *no_formal_ino, unsigned int blktype)
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{ 205{
259 struct gfs2_sbd *sdp; 206 struct super_block *sb = sdp->sd_vfs;
260 struct gfs2_inode *ip; 207 struct gfs2_holder i_gh;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode; 208 struct inode *inode;
209 int error;
265 210
266 inode = gfs2_iget_skip(sb, no_addr); 211 error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops,
267 212 LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
268 if (!inode) 213 if (error)
269 return; 214 return ERR_PTR(error);
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280 215
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 216 error = gfs2_check_blk_type(sdp, no_addr, blktype);
282 if (unlikely(error)) 217 if (error)
283 goto fail; 218 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289 219
290 set_bit(GIF_INVALID, &ip->i_flags); 220 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, 221 if (IS_ERR(inode))
292 &ip->i_iopen_gh); 222 goto fail;
293 if (unlikely(error))
294 goto fail_iopen;
295 223
296 ip->i_iopen_gh.gh_gl->gl_object = ip; 224 error = gfs2_inode_refresh(GFS2_I(inode));
297 gfs2_glock_put(io_gl); 225 if (error)
298 io_gl = NULL; 226 goto fail_iput;
299 227
300 inode->i_mode = DT2IF(DT_UNKNOWN); 228 /* Pick up the works we bypass in gfs2_inode_lookup */
229 if (inode->i_state & I_NEW)
230 gfs2_set_iop(inode);
301 231
302 /* 232 /* Two extra checks for NFS only */
303 * We must read the inode in order to work out its type in 233 if (no_formal_ino) {
304 * this case. Note that this doesn't happen often as we normally 234 error = -ESTALE;
305 * know the type beforehand. This code path only occurs during 235 if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino)
306 * unlinked inode recovery (where it is safe to do this glock, 236 goto fail_iput;
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313 237
314 /* Inode is now uptodate */ 238 error = -EIO;
315 gfs2_glock_dq_uninit(&gh); 239 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM)
316 gfs2_set_iop(inode); 240 goto fail_iput;
317 241
318 /* The iput will cause it to be deleted. */ 242 error = 0;
319 iput(inode); 243 }
320 return;
321 244
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
327fail_put:
328 ip->i_gl->gl_object = NULL;
329 gfs2_glock_put(ip->i_gl);
330fail: 245fail:
331 iget_failed(inode); 246 gfs2_glock_dq_uninit(&i_gh);
332 return; 247 return error ? ERR_PTR(error) : inode;
248fail_iput:
249 iput(inode);
250 goto fail;
333} 251}
334 252
335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 253static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
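The new gfs2_lookup_by_inum keeps its success path falling through into the shared exit label and jumps forward (and back) only for the extra iput. A generic, standalone sketch of that unwind shape, with hypothetical acquire_a/acquire_b/do_check helpers standing in for the glock holder, the inode lookup and the NFS checks:

#include <stdio.h>

static int acquire_a(void)  { puts("acquire a"); return 0; }
static void release_a(void) { puts("release a"); }
static int acquire_b(void)  { puts("acquire b"); return 0; }
static void release_b(void) { puts("release b"); }
static int do_check(void)   { return -1; }   /* simulate a failed check */

static int lookup(void)
{
        int error;

        error = acquire_a();            /* e.g. take the shared glock */
        if (error)
                return error;

        error = acquire_b();            /* e.g. look the inode up */
        if (error)
                goto out;

        error = do_check();             /* e.g. the extra NFS-only checks */
        if (error)
                goto out_b;
out:
        release_a();                    /* success and failure share this exit */
        return error;
out_b:
        release_b();
        goto out;
}

int main(void)
{
        printf("lookup() = %d\n", lookup());
        return 0;
}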
@@ -591,7 +509,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
591 } 509 }
592 510
593 if (!is_root) { 511 if (!is_root) {
594 error = gfs2_permission(dir, MAY_EXEC); 512 error = gfs2_permission(dir, MAY_EXEC, 0);
595 if (error) 513 if (error)
596 goto out; 514 goto out;
597 } 515 }
@@ -621,7 +539,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
621{ 539{
622 int error; 540 int error;
623 541
624 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 542 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
625 if (error) 543 if (error)
626 return error; 544 return error;
627 545
@@ -998,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
998 if (error) 916 if (error)
999 return error; 917 return error;
1000 918
1001 if ((attr->ia_valid & ATTR_SIZE) &&
1002 attr->ia_size != i_size_read(inode)) {
1003 error = vmtruncate(inode, attr->ia_size);
1004 if (error)
1005 return error;
1006 }
1007
1008 setattr_copy(inode, attr); 919 setattr_copy(inode, attr);
1009 mark_inode_dirty(inode); 920 mark_inode_dirty(inode);
1010
1011 gfs2_assert_warn(GFS2_SB(inode), !error);
1012 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 921 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1013 gfs2_dinode_out(ip, dibh->b_data); 922 gfs2_dinode_out(ip, dibh->b_data);
1014 brelse(dibh); 923 brelse(dibh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6720d7d5fbc6..732a183efdb3 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,9 @@ err:
99extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
101 u64 no_addr, u64 no_formal_ino); 101 u64 no_addr, u64 no_formal_ino);
102extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); 102extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
103 u64 *no_formal_ino,
104 unsigned int blktype);
103extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 105extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
104 106
105extern int gfs2_inode_refresh(struct gfs2_inode *ip); 107extern int gfs2_inode_refresh(struct gfs2_inode *ip);
@@ -111,7 +113,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
111extern struct inode *gfs2_createi(struct gfs2_holder *ghs, 113extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
112 const struct qstr *name, 114 const struct qstr *name,
113 unsigned int mode, dev_t dev); 115 unsigned int mode, dev_t dev);
114extern int gfs2_permission(struct inode *inode, int mask); 116extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
115extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 117extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
116extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 118extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
117extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 119extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 1c09425b45fd..6e493aee28f8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
146 return lkf; 146 return lkf;
147} 147}
148 148
149static unsigned int gdlm_lock(struct gfs2_glock *gl, 149static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
150 unsigned int req_state, unsigned int flags) 150 unsigned int flags)
151{ 151{
152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 152 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
153 int error;
154 int req; 153 int req;
155 u32 lkf; 154 u32 lkf;
156 155
157 gl->gl_req = req_state;
158 req = make_mode(req_state); 156 req = make_mode(req_state);
159 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); 157 lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
160 158
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
162 * Submit the actual lock request. 160 * Submit the actual lock request.
163 */ 161 */
164 162
165 error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, 163 return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
166 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
167 if (error == -EAGAIN)
168 return 0;
169 if (error)
170 return LM_OUT_ERROR;
171 return LM_OUT_ASYNC;
172} 165}
173 166
174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b81..2aeabd4218cc 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
440 iput(inode); 440 iput(inode);
441 return -ENOMEM; 441 return -ENOMEM;
442 } 442 }
443 dentry->d_op = &gfs2_dops; 443 d_set_d_op(dentry, &gfs2_dops);
444 *dptr = dentry; 444 *dptr = dentry;
445 return 0; 445 return 0;
446} 446}
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 12cbea7502c2..1501db4f0e6d 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -106,7 +106,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
106{ 106{
107 struct inode *inode = NULL; 107 struct inode *inode = NULL;
108 108
109 dentry->d_op = &gfs2_dops; 109 d_set_d_op(dentry, &gfs2_dops);
110 110
111 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 111 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
112 if (inode && IS_ERR(inode)) 112 if (inode && IS_ERR(inode))
@@ -166,7 +166,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
166 if (error) 166 if (error)
167 goto out_child; 167 goto out_child;
168 168
169 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 169 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
170 if (error) 170 if (error)
171 goto out_gunlock; 171 goto out_gunlock;
172 172
@@ -289,7 +289,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
289 if (IS_APPEND(&dip->i_inode)) 289 if (IS_APPEND(&dip->i_inode))
290 return -EPERM; 290 return -EPERM;
291 291
292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); 292 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
293 if (error) 293 if (error)
294 return error; 294 return error;
295 295
@@ -822,7 +822,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
822 } 822 }
823 } 823 }
824 } else { 824 } else {
825 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); 825 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
826 if (error) 826 if (error)
827 goto out_gunlock; 827 goto out_gunlock;
828 828
@@ -857,7 +857,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
857 /* Check out the dir to be renamed */ 857 /* Check out the dir to be renamed */
858 858
859 if (dir_rename) { 859 if (dir_rename) {
860 error = gfs2_permission(odentry->d_inode, MAY_WRITE); 860 error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
861 if (error) 861 if (error)
862 goto out_gunlock; 862 goto out_gunlock;
863 } 863 }
@@ -1041,13 +1041,17 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1041 * Returns: errno 1041 * Returns: errno
1042 */ 1042 */
1043 1043
1044int gfs2_permission(struct inode *inode, int mask) 1044int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1045{ 1045{
1046 struct gfs2_inode *ip = GFS2_I(inode); 1046 struct gfs2_inode *ip;
1047 struct gfs2_holder i_gh; 1047 struct gfs2_holder i_gh;
1048 int error; 1048 int error;
1049 int unlock = 0; 1049 int unlock = 0;
1050 1050
1051 if (flags & IPERM_FLAG_RCU)
1052 return -ECHILD;
1053
1054 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1055 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1056 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1057 if (error)
@@ -1058,7 +1062,7 @@ int gfs2_permission(struct inode *inode, int mask)
1058 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) 1062 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
1059 error = -EACCES; 1063 error = -EACCES;
1060 else 1064 else
1061 error = generic_permission(inode, mask, gfs2_check_acl); 1065 error = generic_permission(inode, mask, flags, gfs2_check_acl);
1062 if (unlock) 1066 if (unlock)
1063 gfs2_glock_dq_uninit(&i_gh); 1067 gfs2_glock_dq_uninit(&i_gh);
1064 1068
@@ -1069,7 +1073,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1069{ 1073{
1070 struct gfs2_inode *ip = GFS2_I(inode); 1074 struct gfs2_inode *ip = GFS2_I(inode);
1071 struct gfs2_sbd *sdp = GFS2_SB(inode); 1075 struct gfs2_sbd *sdp = GFS2_SB(inode);
1072 struct buffer_head *dibh;
1073 u32 ouid, ogid, nuid, ngid; 1076 u32 ouid, ogid, nuid, ngid;
1074 int error; 1077 int error;
1075 1078
@@ -1100,25 +1103,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1100 if (error) 1103 if (error)
1101 goto out_gunlock_q; 1104 goto out_gunlock_q;
1102 1105
1103 error = gfs2_meta_inode_buffer(ip, &dibh); 1106 error = gfs2_setattr_simple(ip, attr);
1104 if (error) 1107 if (error)
1105 goto out_end_trans; 1108 goto out_end_trans;
1106 1109
1107 if ((attr->ia_valid & ATTR_SIZE) &&
1108 attr->ia_size != i_size_read(inode)) {
1109 int error;
1110
1111 error = vmtruncate(inode, attr->ia_size);
1112 gfs2_assert_warn(sdp, !error);
1113 }
1114
1115 setattr_copy(inode, attr);
1116 mark_inode_dirty(inode);
1117
1118 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1119 gfs2_dinode_out(ip, dibh->b_data);
1120 brelse(dibh);
1121
1122 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 1110 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
1123 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); 1111 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
1124 gfs2_quota_change(ip, -blocks, ouid, ogid); 1112 gfs2_quota_change(ip, -blocks, ouid, ogid);
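The signature change above threads the new flags argument through to generic_permission() and bails out of RCU-walk before any blocking work. A minimal kernel-style sketch of that guard for a hypothetical examplefs (not the real GFS2 code, which also takes its glock; generic_permission's four-argument form is the one visible in these hunks):

#include <linux/fs.h>

static int examplefs_permission(struct inode *inode, int mask, unsigned int flags)
{
        /* In rcu-walk mode we may not sleep or take references; tell the
         * VFS to retry the lookup in ordinary ref-walk mode instead. */
        if (flags & IPERM_FLAG_RCU)
                return -ECHILD;

        /* Blocking work (cluster locks, ACL reads, ...) is safe from here on. */
        return generic_permission(inode, mask, flags, NULL);
}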
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58a9b9998b42..a689901963de 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
631 struct fs_disk_quota *fdq) 631 struct fs_disk_quota *fdq)
632{ 632{
633 struct inode *inode = &ip->i_inode; 633 struct inode *inode = &ip->i_inode;
634 struct gfs2_sbd *sdp = GFS2_SB(inode);
634 struct address_space *mapping = inode->i_mapping; 635 struct address_space *mapping = inode->i_mapping;
635 unsigned long index = loc >> PAGE_CACHE_SHIFT; 636 unsigned long index = loc >> PAGE_CACHE_SHIFT;
636 unsigned offset = loc & (PAGE_CACHE_SIZE - 1); 637 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
@@ -658,13 +659,17 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
658 qd->qd_qb.qb_value = qp->qu_value; 659 qd->qd_qb.qb_value = qp->qu_value;
659 if (fdq) { 660 if (fdq) {
660 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 661 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
661 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); 662 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
662 qd->qd_qb.qb_warn = qp->qu_warn; 663 qd->qd_qb.qb_warn = qp->qu_warn;
663 } 664 }
664 if (fdq->d_fieldmask & FS_DQ_BHARD) { 665 if (fdq->d_fieldmask & FS_DQ_BHARD) {
665 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); 666 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
666 qd->qd_qb.qb_limit = qp->qu_limit; 667 qd->qd_qb.qb_limit = qp->qu_limit;
667 } 668 }
669 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
670 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
671 qd->qd_qb.qb_value = qp->qu_value;
672 }
668 } 673 }
669 674
670 /* Write the quota into the quota file on disk */ 675 /* Write the quota into the quota file on disk */
@@ -1497,9 +1502,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1497 fdq->d_version = FS_DQUOT_VERSION; 1502 fdq->d_version = FS_DQUOT_VERSION;
1498 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1503 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1499 fdq->d_id = id; 1504 fdq->d_id = id;
1500 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1505 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1501 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1506 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1502 fdq->d_bcount = be64_to_cpu(qlvb->qb_value); 1507 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1503 1508
1504 gfs2_glock_dq_uninit(&q_gh); 1509 gfs2_glock_dq_uninit(&q_gh);
1505out: 1510out:
@@ -1508,7 +1513,7 @@ out:
1508} 1513}
1509 1514
1510/* GFS2 only supports a subset of the XFS fields */ 1515/* GFS2 only supports a subset of the XFS fields */
1511#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1516#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
1512 1517
1513static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, 1518static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1514 struct fs_disk_quota *fdq) 1519 struct fs_disk_quota *fdq)
@@ -1566,11 +1571,17 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1566 1571
1567 /* If nothing has changed, this is a no-op */ 1572 /* If nothing has changed, this is a no-op */
1568 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1573 if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
1569 (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) 1574 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1570 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1575 fdq->d_fieldmask ^= FS_DQ_BSOFT;
1576
1571 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1577 if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
1572 (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) 1578 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1573 fdq->d_fieldmask ^= FS_DQ_BHARD; 1579 fdq->d_fieldmask ^= FS_DQ_BHARD;
1580
1581 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
1582 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1583 fdq->d_fieldmask ^= FS_DQ_BCOUNT;
1584
1574 if (fdq->d_fieldmask == 0) 1585 if (fdq->d_fieldmask == 0)
1575 goto out_i; 1586 goto out_i;
1576 1587
@@ -1619,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1619 .get_dqblk = gfs2_get_dqblk, 1630 .get_dqblk = gfs2_get_dqblk,
1620 .set_dqblk = gfs2_set_dqblk, 1631 .set_dqblk = gfs2_set_dqblk,
1621}; 1632};
1622
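The quota hunks above convert between the 512-byte basic blocks used by the XFS-style quota interface and GFS2's own filesystem blocks: values are shifted right by sd_fsb2bb_shift when stored and left when reported, and the no-op test compares in on-disk units. A standalone arithmetic sketch of that round trip (the 4 KiB block size is an assumption for illustration only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int block_size = 4096;          /* assumed fs block size */
        unsigned int fsb2bb_shift = 0;

        while ((512u << fsb2bb_shift) < block_size)
                fsb2bb_shift++;                  /* 4096 / 512 = 8 -> shift of 3 */

        uint64_t soft_bb  = 1000000;             /* limit given in 512-byte blocks */
        uint64_t on_disk  = soft_bb >> fsb2bb_shift;   /* stored in fs blocks */
        uint64_t reported = on_disk << fsb2bb_shift;   /* reported back in 512-byte blocks */

        printf("shift=%u on_disk=%llu reported=%llu\n", fsb2bb_shift,
               (unsigned long long)on_disk, (unsigned long long)reported);
        return 0;
}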
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6cf5c1..7293ea27020c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) 503 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
583 * Returns: 0 on successful update, error code otherwise 583 * Returns: 0 on successful update, error code otherwise
584 */ 584 */
585 585
586static int gfs2_ri_update(struct gfs2_inode *ip) 586int gfs2_ri_update(struct gfs2_inode *ip)
587{ 587{
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
614} 614}
615 615
616/** 616/**
617 * gfs2_ri_update_special - Pull in a new resource index from the disk
618 *
619 * This is a special version that's safe to call from gfs2_inplace_reserve_i.
620 * In this case we know that we don't have any resource groups in memory yet.
621 *
622 * @ip: pointer to the rindex inode
623 *
624 * Returns: 0 on successful update, error code otherwise
625 */
626static int gfs2_ri_update_special(struct gfs2_inode *ip)
627{
628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
629 struct inode *inode = &ip->i_inode;
630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
633 int error;
634
635 file_ra_state_init(&ra_state, inode->i_mapping);
636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
637 /* Ignore partials */
638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
639 i_size_read(inode))
640 break;
641 error = read_rindex_entry(ip, &ra_state);
642 if (error) {
643 clear_rgrpdi(sdp);
644 return error;
645 }
646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
651
652 sdp->sd_rindex_uptodate = 1;
653 return 0;
654}
655
656/**
657 * gfs2_rindex_hold - Grab a lock on the rindex 617 * gfs2_rindex_hold - Grab a lock on the rindex
658 * @sdp: The GFS2 superblock 618 * @sdp: The GFS2 superblock
659 * @ri_gh: the glock holder 619 * @ri_gh: the glock holder
@@ -963,17 +923,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
963 * The inode, if one has been found, in inode. 923 * The inode, if one has been found, in inode.
964 */ 924 */
965 925
966static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 926static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
967 u64 skip)
968{ 927{
969 u32 goal = 0, block; 928 u32 goal = 0, block;
970 u64 no_addr; 929 u64 no_addr;
971 struct gfs2_sbd *sdp = rgd->rd_sbd; 930 struct gfs2_sbd *sdp = rgd->rd_sbd;
972 unsigned int n; 931 unsigned int n;
932 struct gfs2_glock *gl;
933 struct gfs2_inode *ip;
934 int error;
935 int found = 0;
973 936
974 for(;;) { 937 while (goal < rgd->rd_data) {
975 if (goal >= rgd->rd_data)
976 break;
977 down_write(&sdp->sd_log_flush_lock); 938 down_write(&sdp->sd_log_flush_lock);
978 n = 1; 939 n = 1;
979 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 940 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +951,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
990 if (no_addr == skip) 951 if (no_addr == skip)
991 continue; 952 continue;
992 *last_unlinked = no_addr; 953 *last_unlinked = no_addr;
993 return no_addr; 954
955 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
956 if (error)
957 continue;
958
959 /* If the inode is already in cache, we can ignore it here
960 * because the existing inode disposal code will deal with
961 * it when all refs have gone away. Accessing gl_object like
962 * this is not safe in general. Here it is ok because we do
963 * not dereference the pointer, and we only need an approx
964 * answer to whether it is NULL or not.
965 */
966 ip = gl->gl_object;
967
968 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
969 gfs2_glock_put(gl);
970 else
971 found++;
972
973 /* Limit reclaim to sensible number of tasks */
974 if (found > 2*NR_CPUS)
975 return;
994 } 976 }
995 977
996 rgd->rd_flags &= ~GFS2_RDF_CHECK; 978 rgd->rd_flags &= ~GFS2_RDF_CHECK;
997 return 0; 979 return;
998} 980}
999 981
1000/** 982/**
@@ -1075,11 +1057,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1075 * Try to acquire rgrp in way which avoids contending with others. 1057 * Try to acquire rgrp in way which avoids contending with others.
1076 * 1058 *
1077 * Returns: errno 1059 * Returns: errno
1078 * unlinked: the block address of an unlinked block to be reclaimed
1079 */ 1060 */
1080 1061
1081static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, 1062static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1082 u64 *last_unlinked)
1083{ 1063{
1084 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1064 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1085 struct gfs2_rgrpd *rgd, *begin = NULL; 1065 struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1069,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1089 int loops = 0; 1069 int loops = 0;
1090 int error, rg_locked; 1070 int error, rg_locked;
1091 1071
1092 *unlinked = 0;
1093 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1072 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1094 1073
1095 while (rgd) { 1074 while (rgd) {
@@ -1106,17 +1085,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1106 case 0: 1085 case 0:
1107 if (try_rgrp_fit(rgd, al)) 1086 if (try_rgrp_fit(rgd, al))
1108 goto out; 1087 goto out;
1109 /* If the rg came in already locked, there's no 1088 if (rgd->rd_flags & GFS2_RDF_CHECK)
1110 way we can recover from a failed try_rgrp_unlink 1089 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1111 because that would require an iput which can only
1112 happen after the rgrp is unlocked. */
1113 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1114 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1115 ip->i_no_addr);
1116 if (!rg_locked) 1090 if (!rg_locked)
1117 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1091 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1118 if (*unlinked)
1119 return -EAGAIN;
1120 /* fall through */ 1092 /* fall through */
1121 case GLR_TRYFAILED: 1093 case GLR_TRYFAILED:
1122 rgd = recent_rgrp_next(rgd); 1094 rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1117,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1145 case 0: 1117 case 0:
1146 if (try_rgrp_fit(rgd, al)) 1118 if (try_rgrp_fit(rgd, al))
1147 goto out; 1119 goto out;
1148 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) 1120 if (rgd->rd_flags & GFS2_RDF_CHECK)
1149 *unlinked = try_rgrp_unlink(rgd, last_unlinked, 1121 try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
1150 ip->i_no_addr);
1151 if (!rg_locked) 1122 if (!rg_locked)
1152 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1123 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1153 if (*unlinked)
1154 return -EAGAIN;
1155 break; 1124 break;
1156 1125
1157 case GLR_TRYFAILED: 1126 case GLR_TRYFAILED:
@@ -1204,12 +1173,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1173 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1205 struct gfs2_alloc *al = ip->i_alloc; 1174 struct gfs2_alloc *al = ip->i_alloc;
1206 int error = 0; 1175 int error = 0;
1207 u64 last_unlinked = NO_BLOCK, unlinked; 1176 u64 last_unlinked = NO_BLOCK;
1177 int tries = 0;
1208 1178
1209 if (gfs2_assert_warn(sdp, al->al_requested)) 1179 if (gfs2_assert_warn(sdp, al->al_requested))
1210 return -EINVAL; 1180 return -EINVAL;
1211 1181
1212try_again:
1213 if (hold_rindex) { 1182 if (hold_rindex) {
1214 /* We need to hold the rindex unless the inode we're using is 1183 /* We need to hold the rindex unless the inode we're using is
1215 the rindex itself, in which case it's already held. */ 1184 the rindex itself, in which case it's already held. */
@@ -1217,32 +1186,33 @@ try_again:
1217 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1186 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1218 else if (!sdp->sd_rgrps) /* We may not have the rindex read 1187 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1219 in, so: */ 1188 in, so: */
1220 error = gfs2_ri_update_special(ip); 1189 error = gfs2_ri_update(ip);
1190 if (error)
1191 return error;
1221 } 1192 }
1222 1193
1223 if (error) 1194try_again:
1224 return error; 1195 do {
1196 error = get_local_rgrp(ip, &last_unlinked);
1197 /* If there is no space, flushing the log may release some */
1198 if (error) {
1199 if (ip == GFS2_I(sdp->sd_rindex) &&
1200 !sdp->sd_rindex_uptodate) {
1201 error = gfs2_ri_update(ip);
1202 if (error)
1203 return error;
1204 goto try_again;
1205 }
1206 gfs2_log_flush(sdp, NULL);
1207 }
1208 } while (error && tries++ < 3);
1225 1209
1226 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1227 dinodes along the way, error will equal -EAGAIN and unlinked will
1228 contains it block address. We then need to look up that inode and
1229 try to free it, and try the allocation again. */
1230 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1231 if (error) { 1210 if (error) {
1232 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) 1211 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1233 gfs2_glock_dq_uninit(&al->al_ri_gh); 1212 gfs2_glock_dq_uninit(&al->al_ri_gh);
1234 if (error != -EAGAIN) 1213 return error;
1235 return error;
1236
1237 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1238 /* regardless of whether or not gfs2_process_unlinked_inode
1239 was successful, we don't want to repeat it again. */
1240 last_unlinked = unlinked;
1241 gfs2_log_flush(sdp, NULL);
1242 error = 0;
1243
1244 goto try_again;
1245 } 1214 }
1215
1246 /* no error, so we have the rgrp set in the inode's allocation. */ 1216 /* no error, so we have the rgrp set in the inode's allocation. */
1247 al->al_file = file; 1217 al->al_file = file;
1248 al->al_line = line; 1218 al->al_line = line;
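The reworked gfs2_inplace_reserve_i above replaces the -EAGAIN/goto dance with a bounded retry loop that flushes the log between attempts. Because tries is post-incremented inside the loop condition, the body runs at most four times: one initial attempt plus three retries, as this standalone sketch of the same loop shape shows:

#include <stdio.h>

int main(void)
{
        int tries = 0, attempts = 0, error;

        do {
                attempts++;
                error = -1;      /* pretend every allocation attempt fails */
                /* the real code would flush the log here before retrying */
        } while (error && tries++ < 3);

        printf("attempts = %d\n", attempts);   /* prints 4 */
        return 0;
}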
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 0e35c0466f9a..50c2bb04369c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
48 48
49extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
50 50
51extern int gfs2_ri_update(struct gfs2_inode *ip);
51extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
52extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
53 54
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2b2c4997430b..16c2ecac7eb7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1405,11 +1405,18 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
1405 return &ip->i_inode; 1405 return &ip->i_inode;
1406} 1406}
1407 1407
1408static void gfs2_destroy_inode(struct inode *inode) 1408static void gfs2_i_callback(struct rcu_head *head)
1409{ 1409{
1410 struct inode *inode = container_of(head, struct inode, i_rcu);
1411 INIT_LIST_HEAD(&inode->i_dentry);
1410 kmem_cache_free(gfs2_inode_cachep, inode); 1412 kmem_cache_free(gfs2_inode_cachep, inode);
1411} 1413}
1412 1414
1415static void gfs2_destroy_inode(struct inode *inode)
1416{
1417 call_rcu(&inode->i_rcu, gfs2_i_callback);
1418}
1419
1413const struct super_operations gfs2_super_ops = { 1420const struct super_operations gfs2_super_ops = {
1414 .alloc_inode = gfs2_alloc_inode, 1421 .alloc_inode = gfs2_alloc_inode,
1415 .destroy_inode = gfs2_destroy_inode, 1422 .destroy_inode = gfs2_destroy_inode,
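This destroy_inode-to-call_rcu conversion is repeated almost verbatim for hfs, hfsplus, hostfs, hpfs, hppfs and hugetlbfs further down, and for the default path in fs/inode.c. A minimal kernel-style sketch of the shared pattern (examplefs_* names and the slab cache are hypothetical placeholders; the struct inode i_rcu head is what the new code relies on):

#include <linux/fs.h>
#include <linux/slab.h>

static struct kmem_cache *examplefs_inode_cachep;   /* hypothetical slab cache */

static void examplefs_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        /* every i_callback in this series resets the alias list head
         * before handing the memory back to the slab */
        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(examplefs_inode_cachep, inode);
}

static void examplefs_destroy_inode(struct inode *inode)
{
        /* defer the free past an RCU grace period, so lock-free
         * (rcu-walk) lookups can still dereference the inode safely */
        call_rcu(&inode->i_rcu, examplefs_i_callback);
}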
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 30b58f07c8a6..439b61c03262 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
1296 1296
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1298{
1299 struct inode *inode = &ip->i_inode;
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_ea_location el; 1300 struct gfs2_ea_location el;
1302 struct buffer_head *dibh;
1303 int error; 1301 int error;
1304 1302
1305 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); 1303 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1321 if (error) 1319 if (error)
1322 return error; 1320 return error;
1323 1321
1324 error = gfs2_meta_inode_buffer(ip, &dibh); 1322 error = gfs2_setattr_simple(ip, attr);
1325 if (error)
1326 goto out_trans_end;
1327
1328 if ((attr->ia_valid & ATTR_SIZE) &&
1329 attr->ia_size != i_size_read(inode)) {
1330 int error;
1331
1332 error = vmtruncate(inode, attr->ia_size);
1333 gfs2_assert_warn(GFS2_SB(inode), !error);
1334 }
1335
1336 setattr_copy(inode, attr);
1337 mark_inode_dirty(inode);
1338
1339 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1340 gfs2_dinode_out(ip, dibh->b_data);
1341 brelse(dibh);
1342
1343out_trans_end:
1344 gfs2_trans_end(sdp); 1323 gfs2_trans_end(sdp);
1345 return error; 1324 return error;
1346} 1325}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 2b3b8611b41b..ea4aefe7c652 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,7 +25,7 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
25 struct inode *inode = NULL; 25 struct inode *inode = NULL;
26 int res; 26 int res;
27 27
28 dentry->d_op = &hfs_dentry_operations; 28 d_set_d_op(dentry, &hfs_dentry_operations);
29 29
30 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); 30 hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
31 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); 31 hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index c8cffb81e849..ad97c2d58287 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,10 +213,14 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
213/* string.c */ 213/* string.c */
214extern const struct dentry_operations hfs_dentry_operations; 214extern const struct dentry_operations hfs_dentry_operations;
215 215
216extern int hfs_hash_dentry(struct dentry *, struct qstr *); 216extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
217 struct qstr *);
217extern int hfs_strcmp(const unsigned char *, unsigned int, 218extern int hfs_strcmp(const unsigned char *, unsigned int,
218 const unsigned char *, unsigned int); 219 const unsigned char *, unsigned int);
219extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); 220extern int hfs_compare_dentry(const struct dentry *parent,
221 const struct inode *pinode,
222 const struct dentry *dentry, const struct inode *inode,
223 unsigned int len, const char *str, const struct qstr *name);
220 224
221/* trans.c */ 225/* trans.c */
222extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); 226extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 927a5af79428..495a976a3cc9 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,7 +51,8 @@ static unsigned char caseorder[256] = {
51/* 51/*
52 * Hash a string to an integer in a case-independent way 52 * Hash a string to an integer in a case-independent way
53 */ 53 */
54int hfs_hash_dentry(struct dentry *dentry, struct qstr *this) 54int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
55 struct qstr *this)
55{ 56{
56 const unsigned char *name = this->name; 57 const unsigned char *name = this->name;
57 unsigned int hash, len = this->len; 58 unsigned int hash, len = this->len;
@@ -92,21 +93,21 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
92 * Test for equality of two strings in the HFS filename character ordering. 93 * Test for equality of two strings in the HFS filename character ordering.
93 * return 1 on failure and 0 on success 94 * return 1 on failure and 0 on success
94 */ 95 */
95int hfs_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *s2) 96int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
97 const struct dentry *dentry, const struct inode *inode,
98 unsigned int len, const char *str, const struct qstr *name)
96{ 99{
97 const unsigned char *n1, *n2; 100 const unsigned char *n1, *n2;
98 int len;
99 101
100 len = s1->len;
101 if (len >= HFS_NAMELEN) { 102 if (len >= HFS_NAMELEN) {
102 if (s2->len < HFS_NAMELEN) 103 if (name->len < HFS_NAMELEN)
103 return 1; 104 return 1;
104 len = HFS_NAMELEN; 105 len = HFS_NAMELEN;
105 } else if (len != s2->len) 106 } else if (len != name->len)
106 return 1; 107 return 1;
107 108
108 n1 = s1->name; 109 n1 = str;
109 n2 = s2->name; 110 n2 = name->name;
110 while (len--) { 111 while (len--) {
111 if (caseorder[*n1++] != caseorder[*n2++]) 112 if (caseorder[*n1++] != caseorder[*n2++])
112 return 1; 113 return 1;
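The new d_hash/d_compare prototypes above take const dentry and inode pointers and receive the candidate name as an explicit (len, str) pair so they can run during rcu-walk. A hypothetical examplefs wiring that uses the same signatures; the exact-match compare is only for illustration, hfs applies its own case ordering:

#include <linux/dcache.h>
#include <linux/string.h>

static int examplefs_d_hash(const struct dentry *dentry,
                            const struct inode *inode, struct qstr *this)
{
        /* a case-insensitive fs would fold 'this' into a canonical hash here */
        return 0;
}

static int examplefs_d_compare(const struct dentry *parent,
                               const struct inode *pinode,
                               const struct dentry *dentry,
                               const struct inode *inode,
                               unsigned int len, const char *str,
                               const struct qstr *name)
{
        if (len != name->len)
                return 1;                       /* 1 means "does not match" */
        return memcmp(str, name->name, len) ? 1 : 0;
}

static const struct dentry_operations examplefs_dentry_operations = {
        .d_hash    = examplefs_d_hash,
        .d_compare = examplefs_d_compare,
};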
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4824c27cebb8..0bef62aa4f42 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -167,11 +167,18 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
167 return i ? &i->vfs_inode : NULL; 167 return i ? &i->vfs_inode : NULL;
168} 168}
169 169
170static void hfs_destroy_inode(struct inode *inode) 170static void hfs_i_callback(struct rcu_head *head)
171{ 171{
172 struct inode *inode = container_of(head, struct inode, i_rcu);
173 INIT_LIST_HEAD(&inode->i_dentry);
172 kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); 174 kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
173} 175}
174 176
177static void hfs_destroy_inode(struct inode *inode)
178{
179 call_rcu(&inode->i_rcu, hfs_i_callback);
180}
181
175static const struct super_operations hfs_super_operations = { 182static const struct super_operations hfs_super_operations = {
176 .alloc_inode = hfs_alloc_inode, 183 .alloc_inode = hfs_alloc_inode,
177 .destroy_inode = hfs_destroy_inode, 184 .destroy_inode = hfs_destroy_inode,
@@ -427,7 +434,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
427 if (!sb->s_root) 434 if (!sb->s_root)
428 goto bail_iput; 435 goto bail_iput;
429 436
430 sb->s_root->d_op = &hfs_dentry_operations; 437 d_set_d_op(sb->s_root, &hfs_dentry_operations);
431 438
432 /* everything's okay */ 439 /* everything's okay */
433 return 0; 440 return 0;
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 7478f5c219aa..19cf291eb91f 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -8,15 +8,20 @@
8 * This file contains the code to do various system dependent things. 8 * This file contains the code to do various system dependent things.
9 */ 9 */
10 10
11#include <linux/namei.h>
11#include "hfs_fs.h" 12#include "hfs_fs.h"
12 13
13/* dentry case-handling: just lowercase everything */ 14/* dentry case-handling: just lowercase everything */
14 15
15static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) 16static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
16{ 17{
17 struct inode *inode = dentry->d_inode; 18 struct inode *inode;
18 int diff; 19 int diff;
19 20
21 if (nd->flags & LOOKUP_RCU)
22 return -ECHILD;
23
24 inode = dentry->d_inode;
20 if(!inode) 25 if(!inode)
21 return 1; 26 return 1;
22 27
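d_revalidate gets the same treatment as ->permission(): when invoked in rcu-walk mode it must not block, so it returns -ECHILD and lets the VFS retry in ref-walk mode. A minimal hypothetical sketch of that guard:

#include <linux/namei.h>

static int examplefs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;         /* cannot sleep here; retry in ref-walk */

        /* blocking revalidation work goes below this point */
        return 1;                       /* 1 == dentry is still valid */
}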
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f611d55c9f5e..f896dc843026 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
37 37
38 sb = dir->i_sb; 38 sb = dir->i_sb;
39 39
40 dentry->d_op = &hfsplus_dentry_operations; 40 d_set_d_op(dentry, &hfsplus_dentry_operations);
41 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f7cbdf89ac9b..d6857523336d 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -426,9 +426,12 @@ int hfsplus_uni2asc(struct super_block *,
426 const struct hfsplus_unistr *, char *, int *); 426 const struct hfsplus_unistr *, char *, int *);
427int hfsplus_asc2uni(struct super_block *, 427int hfsplus_asc2uni(struct super_block *,
428 struct hfsplus_unistr *, const char *, int); 428 struct hfsplus_unistr *, const char *, int);
429int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str); 429int hfsplus_hash_dentry(const struct dentry *dentry,
430int hfsplus_compare_dentry(struct dentry *dentry, 430 const struct inode *inode, struct qstr *str);
431 struct qstr *s1, struct qstr *s2); 431int hfsplus_compare_dentry(const struct dentry *parent,
432 const struct inode *pinode,
433 const struct dentry *dentry, const struct inode *inode,
434 unsigned int len, const char *str, const struct qstr *name);
432 435
433/* wrapper.c */ 436/* wrapper.c */
434int hfsplus_read_wrapper(struct super_block *); 437int hfsplus_read_wrapper(struct super_block *);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3c9f30e9cd36..6ee6ad20acf2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -450,7 +450,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
450 err = -ENOMEM; 450 err = -ENOMEM;
451 goto cleanup; 451 goto cleanup;
452 } 452 }
453 sb->s_root->d_op = &hfsplus_dentry_operations; 453 d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
454 454
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
456 str.name = HFSP_HIDDENDIR_NAME; 456 str.name = HFSP_HIDDENDIR_NAME;
@@ -516,11 +516,19 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
516 return i ? &i->vfs_inode : NULL; 516 return i ? &i->vfs_inode : NULL;
517} 517}
518 518
519static void hfsplus_destroy_inode(struct inode *inode) 519static void hfsplus_i_callback(struct rcu_head *head)
520{ 520{
521 struct inode *inode = container_of(head, struct inode, i_rcu);
522
523 INIT_LIST_HEAD(&inode->i_dentry);
521 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); 524 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
522} 525}
523 526
527static void hfsplus_destroy_inode(struct inode *inode)
528{
529 call_rcu(&inode->i_rcu, hfsplus_i_callback);
530}
531
524#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 532#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
525 533
526static struct dentry *hfsplus_mount(struct file_system_type *fs_type, 534static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 7dd90a540546..a3f0bfcc881e 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -324,7 +324,8 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
324 * Composed unicode characters are decomposed and case-folding is performed 324 * Composed unicode characters are decomposed and case-folding is performed
325 * if the appropriate bits are (un)set on the superblock. 325 * if the appropriate bits are (un)set on the superblock.
326 */ 326 */
327int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) 327int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
328 struct qstr *str)
328{ 329{
329 struct super_block *sb = dentry->d_sb; 330 struct super_block *sb = dentry->d_sb;
330 const char *astr; 331 const char *astr;
@@ -367,10 +368,12 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
367 * Composed unicode characters are decomposed and case-folding is performed 368 * Composed unicode characters are decomposed and case-folding is performed
368 * if the appropriate bits are (un)set on the superblock. 369 * if the appropriate bits are (un)set on the superblock.
369 */ 370 */
370int hfsplus_compare_dentry(struct dentry *dentry, 371int hfsplus_compare_dentry(const struct dentry *parent,
371 struct qstr *s1, struct qstr *s2) 372 const struct inode *pinode,
373 const struct dentry *dentry, const struct inode *inode,
374 unsigned int len, const char *str, const struct qstr *name)
372{ 375{
373 struct super_block *sb = dentry->d_sb; 376 struct super_block *sb = parent->d_sb;
374 int casefold, decompose, size; 377 int casefold, decompose, size;
375 int dsize1, dsize2, len1, len2; 378 int dsize1, dsize2, len1, len2;
376 const u16 *dstr1, *dstr2; 379 const u16 *dstr1, *dstr2;
@@ -380,10 +383,10 @@ int hfsplus_compare_dentry(struct dentry *dentry,
380 383
381 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); 384 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
382 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); 385 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
383 astr1 = s1->name; 386 astr1 = str;
384 len1 = s1->len; 387 len1 = len;
385 astr2 = s2->name; 388 astr2 = name->name;
386 len2 = s2->len; 389 len2 = name->len;
387 dsize1 = dsize2 = 0; 390 dsize1 = dsize2 = 0;
388 dstr1 = dstr2 = NULL; 391 dstr1 = dstr2 = NULL;
389 392
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2c0f148a49e6..d3244d949a4e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -32,7 +32,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
32 32
33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode) 33#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
34 34
35static int hostfs_d_delete(struct dentry *dentry) 35static int hostfs_d_delete(const struct dentry *dentry)
36{ 36{
37 return 1; 37 return 1;
38} 38}
@@ -92,12 +92,10 @@ __uml_setup("hostfs=", hostfs_args,
92 92
93static char *__dentry_name(struct dentry *dentry, char *name) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 char *p = __dentry_path(dentry, name, PATH_MAX); 95 char *p = dentry_path_raw(dentry, name, PATH_MAX);
96 char *root; 96 char *root;
97 size_t len; 97 size_t len;
98 98
99 spin_unlock(&dcache_lock);
100
101 root = dentry->d_sb->s_fs_info; 99 root = dentry->d_sb->s_fs_info;
102 len = strlen(root); 100 len = strlen(root);
103 if (IS_ERR(p)) { 101 if (IS_ERR(p)) {
@@ -123,25 +121,23 @@ static char *dentry_name(struct dentry *dentry)
123 if (!name) 121 if (!name)
124 return NULL; 122 return NULL;
125 123
126 spin_lock(&dcache_lock);
127 return __dentry_name(dentry, name); /* will unlock */ 124 return __dentry_name(dentry, name); /* will unlock */
128} 125}
129 126
130static char *inode_name(struct inode *ino) 127static char *inode_name(struct inode *ino)
131{ 128{
132 struct dentry *dentry; 129 struct dentry *dentry;
133 char *name = __getname(); 130 char *name;
134 if (!name)
135 return NULL;
136 131
137 spin_lock(&dcache_lock); 132 dentry = d_find_alias(ino);
138 if (list_empty(&ino->i_dentry)) { 133 if (!dentry)
139 spin_unlock(&dcache_lock);
140 __putname(name);
141 return NULL; 134 return NULL;
142 } 135
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias); 136 name = dentry_name(dentry);
144 return __dentry_name(dentry, name); /* will unlock */ 137
138 dput(dentry);
139
140 return name;
145} 141}
146 142
147static char *follow_link(char *link) 143static char *follow_link(char *link)
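Instead of peeking at ino->i_dentry under dcache_lock, the new inode_name() borrows an alias with d_find_alias() and drops it with dput(). A kernel-style sketch of that borrow pattern in isolation (dentry_name is hostfs's own helper from the hunk above):

static char *name_for(struct inode *inode)
{
        struct dentry *alias;
        char *name;

        alias = d_find_alias(inode);    /* takes a reference on one alias, or NULL */
        if (!alias)
                return NULL;

        name = dentry_name(alias);      /* safe to use the dentry while we hold it */
        dput(alias);                    /* drop the reference; no dcache_lock needed */

        return name;
}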
@@ -251,11 +247,18 @@ static void hostfs_evict_inode(struct inode *inode)
251 } 247 }
252} 248}
253 249
254static void hostfs_destroy_inode(struct inode *inode) 250static void hostfs_i_callback(struct rcu_head *head)
255{ 251{
252 struct inode *inode = container_of(head, struct inode, i_rcu);
253 INIT_LIST_HEAD(&inode->i_dentry);
256 kfree(HOSTFS_I(inode)); 254 kfree(HOSTFS_I(inode));
257} 255}
258 256
257static void hostfs_destroy_inode(struct inode *inode)
258{
259 call_rcu(&inode->i_rcu, hostfs_i_callback);
260}
261
259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 262static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
260{ 263{
261 const char *root_path = vfs->mnt_sb->s_fs_info; 264 const char *root_path = vfs->mnt_sb->s_fs_info;
@@ -609,7 +612,7 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
609 goto out_put; 612 goto out_put;
610 613
611 d_add(dentry, inode); 614 d_add(dentry, inode);
612 dentry->d_op = &hostfs_dentry_ops; 615 d_set_d_op(dentry, &hostfs_dentry_ops);
613 return NULL; 616 return NULL;
614 617
615 out_put: 618 out_put:
@@ -746,11 +749,14 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
746 return err; 749 return err;
747} 750}
748 751
749int hostfs_permission(struct inode *ino, int desired) 752int hostfs_permission(struct inode *ino, int desired, unsigned int flags)
750{ 753{
751 char *name; 754 char *name;
752 int r = 0, w = 0, x = 0, err; 755 int r = 0, w = 0, x = 0, err;
753 756
757 if (flags & IPERM_FLAG_RCU)
758 return -ECHILD;
759
754 if (desired & MAY_READ) r = 1; 760 if (desired & MAY_READ) r = 1;
755 if (desired & MAY_WRITE) w = 1; 761 if (desired & MAY_WRITE) w = 1;
756 if (desired & MAY_EXEC) x = 1; 762 if (desired & MAY_EXEC) x = 1;
@@ -765,7 +771,7 @@ int hostfs_permission(struct inode *ino, int desired)
765 err = access_file(name, r, w, x); 771 err = access_file(name, r, w, x);
766 __putname(name); 772 __putname(name);
767 if (!err) 773 if (!err)
768 err = generic_permission(ino, desired, NULL); 774 err = generic_permission(ino, desired, flags, NULL);
769 return err; 775 return err;
770} 776}
771 777
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 67d9d36b3d5f..32c13a94e1e9 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,7 +12,8 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) 15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
16 struct qstr *qstr)
16{ 17{
17 unsigned long hash; 18 unsigned long hash;
18 int i; 19 int i;
@@ -34,19 +35,25 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
34 return 0; 35 return 0;
35} 36}
36 37
37static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 38static int hpfs_compare_dentry(const struct dentry *parent,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name)
38{ 42{
39 unsigned al=a->len; 43 unsigned al = len;
40 unsigned bl=b->len; 44 unsigned bl = name->len;
41 hpfs_adjust_length(a->name, &al); 45
46 hpfs_adjust_length(str, &al);
42 /*hpfs_adjust_length(b->name, &bl);*/ 47 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 48
44 * must be valid. 'b' must be validated first. 49 /*
50 * 'str' is the nane of an already existing dentry, so the name
51 * must be valid. 'name' must be validated first.
45 */ 52 */
46 53
47 if (hpfs_chk_name(b->name, &bl)) 54 if (hpfs_chk_name(name->name, &bl))
48 return 1; 55 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0)) 56 if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0))
50 return 1; 57 return 1;
51 return 0; 58 return 0;
52} 59}
@@ -58,5 +65,5 @@ static const struct dentry_operations hpfs_dentry_operations = {
58 65
59void hpfs_set_dentry_operations(struct dentry *dentry) 66void hpfs_set_dentry_operations(struct dentry *dentry)
60{ 67{
61 dentry->d_op = &hpfs_dentry_operations; 68 d_set_d_op(dentry, &hpfs_dentry_operations);
62} 69}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 11c2b4080f65..f4ad9e31ddc4 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -419,7 +419,7 @@ again:
419 unlock_kernel(); 419 unlock_kernel();
420 return -ENOSPC; 420 return -ENOSPC;
421 } 421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) || 422 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
423 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
424 get_write_access(inode)) { 424 get_write_access(inode)) {
425 d_rehash(dentry); 425 d_rehash(dentry);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6c5f01597c3a..49935ba78db8 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -177,11 +177,18 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb)
177 return &ei->vfs_inode; 177 return &ei->vfs_inode;
178} 178}
179 179
180static void hpfs_destroy_inode(struct inode *inode) 180static void hpfs_i_callback(struct rcu_head *head)
181{ 181{
182 struct inode *inode = container_of(head, struct inode, i_rcu);
183 INIT_LIST_HEAD(&inode->i_dentry);
182 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); 184 kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
183} 185}
184 186
187static void hpfs_destroy_inode(struct inode *inode)
188{
189 call_rcu(&inode->i_rcu, hpfs_i_callback);
190}
191
185static void init_once(void *foo) 192static void init_once(void *foo)
186{ 193{
187 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; 194 struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index f702b5f713fc..87ed48e0343d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -632,11 +632,18 @@ void hppfs_evict_inode(struct inode *ino)
632 mntput(ino->i_sb->s_fs_info); 632 mntput(ino->i_sb->s_fs_info);
633} 633}
634 634
635static void hppfs_destroy_inode(struct inode *inode) 635static void hppfs_i_callback(struct rcu_head *head)
636{ 636{
637 struct inode *inode = container_of(head, struct inode, i_rcu);
638 INIT_LIST_HEAD(&inode->i_dentry);
637 kfree(HPPFS_I(inode)); 639 kfree(HPPFS_I(inode));
638} 640}
639 641
642static void hppfs_destroy_inode(struct inode *inode)
643{
644 call_rcu(&inode->i_rcu, hppfs_i_callback);
645}
646
640static const struct super_operations hppfs_sbops = { 647static const struct super_operations hppfs_sbops = {
641 .alloc_inode = hppfs_alloc_inode, 648 .alloc_inode = hppfs_alloc_inode,
642 .destroy_inode = hppfs_destroy_inode, 649 .destroy_inode = hppfs_destroy_inode,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d6cfac1f0a40..9885082b470f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -663,11 +663,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
663 return &p->vfs_inode; 663 return &p->vfs_inode;
664} 664}
665 665
666static void hugetlbfs_i_callback(struct rcu_head *head)
667{
668 struct inode *inode = container_of(head, struct inode, i_rcu);
669 INIT_LIST_HEAD(&inode->i_dentry);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
671}
672
666static void hugetlbfs_destroy_inode(struct inode *inode) 673static void hugetlbfs_destroy_inode(struct inode *inode)
667{ 674{
668 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 675 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
669 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 676 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
670 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 677 call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
671} 678}
672 679
673static const struct address_space_operations hugetlbfs_aops = { 680static const struct address_space_operations hugetlbfs_aops = {
@@ -932,8 +939,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
932 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 939 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
933 *user = current_user(); 940 *user = current_user();
934 if (user_shm_lock(size, *user)) { 941 if (user_shm_lock(size, *user)) {
935 WARN_ONCE(1, 942 printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
936 "Using mlock ulimits for SHM_HUGETLB deprecated\n");
937 } else { 943 } else {
938 *user = NULL; 944 *user = NULL;
939 return ERR_PTR(-EPERM); 945 return ERR_PTR(-EPERM);
diff --git a/fs/inode.c b/fs/inode.c
index ae2727ab0c3a..da85e56378f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,26 +102,29 @@ static DECLARE_RWSEM(iprune_sem);
102 */ 102 */
103struct inodes_stat_t inodes_stat; 103struct inodes_stat_t inodes_stat;
104 104
105static struct percpu_counter nr_inodes __cacheline_aligned_in_smp; 105static DEFINE_PER_CPU(unsigned int, nr_inodes);
106static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
107 106
108static struct kmem_cache *inode_cachep __read_mostly; 107static struct kmem_cache *inode_cachep __read_mostly;
109 108
110static inline int get_nr_inodes(void) 109static int get_nr_inodes(void)
111{ 110{
112 return percpu_counter_sum_positive(&nr_inodes); 111 int i;
112 int sum = 0;
113 for_each_possible_cpu(i)
114 sum += per_cpu(nr_inodes, i);
115 return sum < 0 ? 0 : sum;
113} 116}
114 117
115static inline int get_nr_inodes_unused(void) 118static inline int get_nr_inodes_unused(void)
116{ 119{
117 return percpu_counter_sum_positive(&nr_inodes_unused); 120 return inodes_stat.nr_unused;
118} 121}
119 122
120int get_nr_dirty_inodes(void) 123int get_nr_dirty_inodes(void)
121{ 124{
125 /* not actually dirty inodes, but a wild approximation */
122 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); 126 int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
123 return nr_dirty > 0 ? nr_dirty : 0; 127 return nr_dirty > 0 ? nr_dirty : 0;
124
125} 128}
126 129
127/* 130/*
@@ -132,7 +135,6 @@ int proc_nr_inodes(ctl_table *table, int write,
132 void __user *buffer, size_t *lenp, loff_t *ppos) 135 void __user *buffer, size_t *lenp, loff_t *ppos)
133{ 136{
134 inodes_stat.nr_inodes = get_nr_inodes(); 137 inodes_stat.nr_inodes = get_nr_inodes();
135 inodes_stat.nr_unused = get_nr_inodes_unused();
136 return proc_dointvec(table, write, buffer, lenp, ppos); 138 return proc_dointvec(table, write, buffer, lenp, ppos);
137} 139}
138#endif 140#endif
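The hunk above replaces the percpu_counter for nr_inodes with a plain per-CPU variable that each CPU increments and decrements locally; readers sum all slots without synchronisation, so the total is only approximate and is clamped at zero. A standalone userspace model of that summation (NR_CPUS and the sample values are assumptions for illustration):

#include <stdio.h>

#define NR_CPUS 4

/* Each "CPU" only touches its own slot; a slot can go negative when a CPU
 * frees inodes it did not allocate, and an unsynchronised sum taken while
 * updates are in flight is only approximate, hence the clamp to zero. */
static int nr_inodes[NR_CPUS];

static int get_nr_inodes(void)
{
        int i, sum = 0;

        for (i = 0; i < NR_CPUS; i++)
                sum += nr_inodes[i];
        return sum < 0 ? 0 : sum;
}

int main(void)
{
        nr_inodes[0] += 2;   /* two inodes accounted on CPU 0 */
        nr_inodes[3] -= 3;   /* three freed on CPU 3, allocations counted elsewhere */
        printf("approximate nr_inodes = %d\n", get_nr_inodes());
        return 0;
}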
@@ -224,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
224 inode->i_fsnotify_mask = 0; 226 inode->i_fsnotify_mask = 0;
225#endif 227#endif
226 228
227 percpu_counter_inc(&nr_inodes); 229 this_cpu_inc(nr_inodes);
228 230
229 return 0; 231 return 0;
230out: 232out:
@@ -255,6 +257,12 @@ static struct inode *alloc_inode(struct super_block *sb)
255 return inode; 257 return inode;
256} 258}
257 259
260void free_inode_nonrcu(struct inode *inode)
261{
262 kmem_cache_free(inode_cachep, inode);
263}
264EXPORT_SYMBOL(free_inode_nonrcu);
265
258void __destroy_inode(struct inode *inode) 266void __destroy_inode(struct inode *inode)
259{ 267{
260 BUG_ON(inode_has_buffers(inode)); 268 BUG_ON(inode_has_buffers(inode));
@@ -266,10 +274,17 @@ void __destroy_inode(struct inode *inode)
266 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) 274 if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
267 posix_acl_release(inode->i_default_acl); 275 posix_acl_release(inode->i_default_acl);
268#endif 276#endif
269 percpu_counter_dec(&nr_inodes); 277 this_cpu_dec(nr_inodes);
270} 278}
271EXPORT_SYMBOL(__destroy_inode); 279EXPORT_SYMBOL(__destroy_inode);
272 280
281static void i_callback(struct rcu_head *head)
282{
283 struct inode *inode = container_of(head, struct inode, i_rcu);
284 INIT_LIST_HEAD(&inode->i_dentry);
285 kmem_cache_free(inode_cachep, inode);
286}
287
273static void destroy_inode(struct inode *inode) 288static void destroy_inode(struct inode *inode)
274{ 289{
275 BUG_ON(!list_empty(&inode->i_lru)); 290 BUG_ON(!list_empty(&inode->i_lru));
@@ -277,7 +292,7 @@ static void destroy_inode(struct inode *inode)
277 if (inode->i_sb->s_op->destroy_inode) 292 if (inode->i_sb->s_op->destroy_inode)
278 inode->i_sb->s_op->destroy_inode(inode); 293 inode->i_sb->s_op->destroy_inode(inode);
279 else 294 else
280 kmem_cache_free(inode_cachep, (inode)); 295 call_rcu(&inode->i_rcu, i_callback);
281} 296}
282 297
283/* 298/*
@@ -335,7 +350,7 @@ static void inode_lru_list_add(struct inode *inode)
335{ 350{
336 if (list_empty(&inode->i_lru)) { 351 if (list_empty(&inode->i_lru)) {
337 list_add(&inode->i_lru, &inode_lru); 352 list_add(&inode->i_lru, &inode_lru);
338 percpu_counter_inc(&nr_inodes_unused); 353 inodes_stat.nr_unused++;
339 } 354 }
340} 355}
341 356
@@ -343,7 +358,7 @@ static void inode_lru_list_del(struct inode *inode)
343{ 358{
344 if (!list_empty(&inode->i_lru)) { 359 if (!list_empty(&inode->i_lru)) {
345 list_del_init(&inode->i_lru); 360 list_del_init(&inode->i_lru);
346 percpu_counter_dec(&nr_inodes_unused); 361 inodes_stat.nr_unused--;
347 } 362 }
348} 363}
349 364
@@ -430,6 +445,7 @@ void end_writeback(struct inode *inode)
430 BUG_ON(!(inode->i_state & I_FREEING)); 445 BUG_ON(!(inode->i_state & I_FREEING));
431 BUG_ON(inode->i_state & I_CLEAR); 446 BUG_ON(inode->i_state & I_CLEAR);
432 inode_sync_wait(inode); 447 inode_sync_wait(inode);
448 /* don't need i_lock here, no concurrent mods to i_state */
433 inode->i_state = I_FREEING | I_CLEAR; 449 inode->i_state = I_FREEING | I_CLEAR;
434} 450}
435EXPORT_SYMBOL(end_writeback); 451EXPORT_SYMBOL(end_writeback);
@@ -513,7 +529,7 @@ void evict_inodes(struct super_block *sb)
513 list_move(&inode->i_lru, &dispose); 529 list_move(&inode->i_lru, &dispose);
514 list_del_init(&inode->i_wb_list); 530 list_del_init(&inode->i_wb_list);
515 if (!(inode->i_state & (I_DIRTY | I_SYNC))) 531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
516 percpu_counter_dec(&nr_inodes_unused); 532 inodes_stat.nr_unused--;
517 } 533 }
518 spin_unlock(&inode_lock); 534 spin_unlock(&inode_lock);
519 535
@@ -554,7 +570,7 @@ int invalidate_inodes(struct super_block *sb)
554 list_move(&inode->i_lru, &dispose); 570 list_move(&inode->i_lru, &dispose);
555 list_del_init(&inode->i_wb_list); 571 list_del_init(&inode->i_wb_list);
556 if (!(inode->i_state & (I_DIRTY | I_SYNC))) 572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
557 percpu_counter_dec(&nr_inodes_unused); 573 inodes_stat.nr_unused--;
558 } 574 }
559 spin_unlock(&inode_lock); 575 spin_unlock(&inode_lock);
560 576
@@ -616,7 +632,7 @@ static void prune_icache(int nr_to_scan)
616 if (atomic_read(&inode->i_count) || 632 if (atomic_read(&inode->i_count) ||
617 (inode->i_state & ~I_REFERENCED)) { 633 (inode->i_state & ~I_REFERENCED)) {
618 list_del_init(&inode->i_lru); 634 list_del_init(&inode->i_lru);
619 percpu_counter_dec(&nr_inodes_unused); 635 inodes_stat.nr_unused--;
620 continue; 636 continue;
621 } 637 }
622 638
@@ -650,7 +666,7 @@ static void prune_icache(int nr_to_scan)
650 */ 666 */
651 list_move(&inode->i_lru, &freeable); 667 list_move(&inode->i_lru, &freeable);
652 list_del_init(&inode->i_wb_list); 668 list_del_init(&inode->i_wb_list);
653 percpu_counter_dec(&nr_inodes_unused); 669 inodes_stat.nr_unused--;
654 } 670 }
655 if (current_is_kswapd()) 671 if (current_is_kswapd())
656 __count_vm_events(KSWAPD_INODESTEAL, reap); 672 __count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1664,6 @@ void __init inode_init(void)
1648 SLAB_MEM_SPREAD), 1664 SLAB_MEM_SPREAD),
1649 init_once); 1665 init_once);
1650 register_shrinker(&icache_shrinker); 1666 register_shrinker(&icache_shrinker);
1651 percpu_counter_init(&nr_inodes, 0);
1652 percpu_counter_init(&nr_inodes_unused, 0);
1653 1667
1654 /* Hash may have been set up in inode_init_early */ 1668 /* Hash may have been set up in inode_init_early */
1655 if (!hashdist) 1669 if (!hashdist)
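
The fs/inode.c changes above combine two patterns that recur throughout this series: the global percpu_counter for nr_inodes becomes a plain per-CPU variable that is only summed on the slow path, and the final kmem_cache_free() of an inode is deferred to an RCU callback so that lock-free (rcu-walk) path lookup can still dereference inodes it finds in the dcache. A minimal sketch of both, using invented names (my_counter, my_inode_cachep) rather than anything from the patch:

#include <linux/fs.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(unsigned int, my_counter);
static struct kmem_cache *my_inode_cachep;	/* assumed to be set up elsewhere */

/* fast path: no shared cacheline, no lock */
static inline void my_counter_inc(void) { this_cpu_inc(my_counter); }
static inline void my_counter_dec(void) { this_cpu_dec(my_counter); }

/* slow path: sum every CPU's contribution; clamp, it can transiently go negative */
static int my_counter_read(void)
{
	int cpu, sum = 0;

	for_each_possible_cpu(cpu)
		sum += per_cpu(my_counter, cpu);
	return sum < 0 ? 0 : sum;
}

/* free the inode only after a grace period, so rcu-walk readers never see freed memory */
static void my_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(my_inode_cachep, inode);
}

static void my_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, my_i_callback);
}

The per-filesystem hunks further down (isofs, jffs2, jfs, logfs, minix) are the same destroy_inode conversion applied to each filesystem's private inode cache.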
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4e..9687c2ee2735 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
63 63
64extern void free_vfsmnt(struct vfsmount *); 64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *); 65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt);
66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
68 struct vfsmount *); 69 struct vfsmount *);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb3bc3a..d6cc16476620 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/syscalls.h> 7#include <linux/syscalls.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/smp_lock.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/file.h> 10#include <linux/file.h>
12#include <linux/fs.h> 11#include <linux/fs.h>
@@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp)
530 return thaw_super(sb); 529 return thaw_super(sb);
531} 530}
532 531
533static int ioctl_fstrim(struct file *filp, void __user *argp)
534{
535 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
536 struct fstrim_range range;
537 int ret = 0;
538
539 if (!capable(CAP_SYS_ADMIN))
540 return -EPERM;
541
542 /* If filesystem doesn't support trim feature, return. */
543 if (sb->s_op->trim_fs == NULL)
544 return -EOPNOTSUPP;
545
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 if (argp == NULL) {
551 range.start = 0;
552 range.len = ULLONG_MAX;
553 range.minlen = 0;
554 } else if (copy_from_user(&range, argp, sizeof(range)))
555 return -EFAULT;
556
557 ret = sb->s_op->trim_fs(sb, &range);
558 if (ret < 0)
559 return ret;
560
561 if ((argp != NULL) &&
562 (copy_to_user(argp, &range, sizeof(range))))
563 return -EFAULT;
564
565 return 0;
566}
567
568/* 532/*
569 * When you add any new common ioctls to the switches above and below 533 * When you add any new common ioctls to the switches above and below
570 * please update compat_sys_ioctl() too. 534 * please update compat_sys_ioctl() too.
@@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
615 error = ioctl_fsthaw(filp); 579 error = ioctl_fsthaw(filp);
616 break; 580 break;
617 581
618 case FITRIM:
619 error = ioctl_fstrim(filp, argp);
620 break;
621
622 case FS_IOC_FIEMAP: 582 case FS_IOC_FIEMAP:
623 return ioctl_fiemap(filp, arg); 583 return ioctl_fiemap(filp, arg);
624 584
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 748cfb92dcc6..7da2a06508e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
103 } 103 }
104 104
105 ret = -ESRCH; 105 ret = -ESRCH;
106 /* 106 rcu_read_lock();
107 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
108 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
109 * in copy_process().
110 */
111 read_lock(&tasklist_lock);
112 switch (which) { 107 switch (which) {
113 case IOPRIO_WHO_PROCESS: 108 case IOPRIO_WHO_PROCESS:
114 if (!who) 109 if (!who)
@@ -153,7 +148,7 @@ free_uid:
153 ret = -EINVAL; 148 ret = -EINVAL;
154 } 149 }
155 150
156 read_unlock(&tasklist_lock); 151 rcu_read_unlock();
157 return ret; 152 return ret;
158} 153}
159 154
@@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
197 int ret = -ESRCH; 192 int ret = -ESRCH;
198 int tmpio; 193 int tmpio;
199 194
200 read_lock(&tasklist_lock); 195 rcu_read_lock();
201 switch (which) { 196 switch (which) {
202 case IOPRIO_WHO_PROCESS: 197 case IOPRIO_WHO_PROCESS:
203 if (!who) 198 if (!who)
@@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
250 ret = -EINVAL; 245 ret = -EINVAL;
251 } 246 }
252 247
253 read_unlock(&tasklist_lock); 248 rcu_read_unlock();
254 return ret; 249 return ret;
255} 250}
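
The ioprio hunks drop the tasklist_lock read lock in favour of an RCU read-side critical section; looking a task up by PID and reading a field that tolerates slight staleness needs no stronger protection. A sketch of that pattern with a made-up helper (read_task_nice) rather than the syscall bodies themselves:

#include <linux/errno.h>
#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Look up a task by PID and read one field without taking tasklist_lock. */
static int read_task_nice(pid_t pid, int *nice)
{
	struct task_struct *p;
	int ret = -ESRCH;

	rcu_read_lock();
	p = pid ? find_task_by_vpid(pid) : current;
	if (p) {
		*nice = task_nice(p);
		ret = 0;
	}
	rcu_read_unlock();	/* p must not be used past this point */
	return ret;
}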
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bfdeb82a53be..844a7903c72f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -26,16 +26,32 @@
26 26
27#define BEQUIET 27#define BEQUIET
28 28
29static int isofs_hashi(struct dentry *parent, struct qstr *qstr); 29static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
30static int isofs_hash(struct dentry *parent, struct qstr *qstr); 30 struct qstr *qstr);
31static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); 31static int isofs_hash(const struct dentry *parent, const struct inode *inode,
32static int isofs_dentry_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); 32 struct qstr *qstr);
33static int isofs_dentry_cmpi(const struct dentry *parent,
34 const struct inode *pinode,
35 const struct dentry *dentry, const struct inode *inode,
36 unsigned int len, const char *str, const struct qstr *name);
37static int isofs_dentry_cmp(const struct dentry *parent,
38 const struct inode *pinode,
39 const struct dentry *dentry, const struct inode *inode,
40 unsigned int len, const char *str, const struct qstr *name);
33 41
34#ifdef CONFIG_JOLIET 42#ifdef CONFIG_JOLIET
35static int isofs_hashi_ms(struct dentry *parent, struct qstr *qstr); 43static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
36static int isofs_hash_ms(struct dentry *parent, struct qstr *qstr); 44 struct qstr *qstr);
37static int isofs_dentry_cmpi_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 45static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
38static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qstr *b); 46 struct qstr *qstr);
47static int isofs_dentry_cmpi_ms(const struct dentry *parent,
48 const struct inode *pinode,
49 const struct dentry *dentry, const struct inode *inode,
50 unsigned int len, const char *str, const struct qstr *name);
51static int isofs_dentry_cmp_ms(const struct dentry *parent,
52 const struct inode *pinode,
53 const struct dentry *dentry, const struct inode *inode,
54 unsigned int len, const char *str, const struct qstr *name);
39#endif 55#endif
40 56
41static void isofs_put_super(struct super_block *sb) 57static void isofs_put_super(struct super_block *sb)
@@ -65,11 +81,18 @@ static struct inode *isofs_alloc_inode(struct super_block *sb)
65 return &ei->vfs_inode; 81 return &ei->vfs_inode;
66} 82}
67 83
68static void isofs_destroy_inode(struct inode *inode) 84static void isofs_i_callback(struct rcu_head *head)
69{ 85{
86 struct inode *inode = container_of(head, struct inode, i_rcu);
87 INIT_LIST_HEAD(&inode->i_dentry);
70 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); 88 kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
71} 89}
72 90
91static void isofs_destroy_inode(struct inode *inode)
92{
93 call_rcu(&inode->i_rcu, isofs_i_callback);
94}
95
73static void init_once(void *foo) 96static void init_once(void *foo)
74{ 97{
75 struct iso_inode_info *ei = foo; 98 struct iso_inode_info *ei = foo;
@@ -160,7 +183,7 @@ struct iso9660_options{
160 * Compute the hash for the isofs name corresponding to the dentry. 183 * Compute the hash for the isofs name corresponding to the dentry.
161 */ 184 */
162static int 185static int
163isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms) 186isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
164{ 187{
165 const char *name; 188 const char *name;
166 int len; 189 int len;
@@ -181,7 +204,7 @@ isofs_hash_common(struct dentry *dentry, struct qstr *qstr, int ms)
181 * Compute the hash for the isofs name corresponding to the dentry. 204 * Compute the hash for the isofs name corresponding to the dentry.
182 */ 205 */
183static int 206static int
184isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) 207isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
185{ 208{
186 const char *name; 209 const char *name;
187 int len; 210 int len;
@@ -206,100 +229,94 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms)
206} 229}
207 230
208/* 231/*
209 * Case insensitive compare of two isofs names. 232 * Comparison of two isofs names.
210 */
211static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a,
212 struct qstr *b, int ms)
213{
214 int alen, blen;
215
216 /* A filename cannot end in '.' or we treat it like it has none */
217 alen = a->len;
218 blen = b->len;
219 if (ms) {
220 while (alen && a->name[alen-1] == '.')
221 alen--;
222 while (blen && b->name[blen-1] == '.')
223 blen--;
224 }
225 if (alen == blen) {
226 if (strnicmp(a->name, b->name, alen) == 0)
227 return 0;
228 }
229 return 1;
230}
231
232/*
233 * Case sensitive compare of two isofs names.
234 */ 233 */
235static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, 234static int isofs_dentry_cmp_common(
236 struct qstr *b, int ms) 235 unsigned int len, const char *str,
236 const struct qstr *name, int ms, int ci)
237{ 237{
238 int alen, blen; 238 int alen, blen;
239 239
240 /* A filename cannot end in '.' or we treat it like it has none */ 240 /* A filename cannot end in '.' or we treat it like it has none */
241 alen = a->len; 241 alen = name->len;
242 blen = b->len; 242 blen = len;
243 if (ms) { 243 if (ms) {
244 while (alen && a->name[alen-1] == '.') 244 while (alen && name->name[alen-1] == '.')
245 alen--; 245 alen--;
246 while (blen && b->name[blen-1] == '.') 246 while (blen && str[blen-1] == '.')
247 blen--; 247 blen--;
248 } 248 }
249 if (alen == blen) { 249 if (alen == blen) {
250 if (strncmp(a->name, b->name, alen) == 0) 250 if (ci) {
251 return 0; 251 if (strnicmp(name->name, str, alen) == 0)
252 return 0;
253 } else {
254 if (strncmp(name->name, str, alen) == 0)
255 return 0;
256 }
252 } 257 }
253 return 1; 258 return 1;
254} 259}
255 260
256static int 261static int
257isofs_hash(struct dentry *dentry, struct qstr *qstr) 262isofs_hash(const struct dentry *dentry, const struct inode *inode,
263 struct qstr *qstr)
258{ 264{
259 return isofs_hash_common(dentry, qstr, 0); 265 return isofs_hash_common(dentry, qstr, 0);
260} 266}
261 267
262static int 268static int
263isofs_hashi(struct dentry *dentry, struct qstr *qstr) 269isofs_hashi(const struct dentry *dentry, const struct inode *inode,
270 struct qstr *qstr)
264{ 271{
265 return isofs_hashi_common(dentry, qstr, 0); 272 return isofs_hashi_common(dentry, qstr, 0);
266} 273}
267 274
268static int 275static int
269isofs_dentry_cmp(struct dentry *dentry,struct qstr *a,struct qstr *b) 276isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
277 const struct dentry *dentry, const struct inode *inode,
278 unsigned int len, const char *str, const struct qstr *name)
270{ 279{
271 return isofs_dentry_cmp_common(dentry, a, b, 0); 280 return isofs_dentry_cmp_common(len, str, name, 0, 0);
272} 281}
273 282
274static int 283static int
275isofs_dentry_cmpi(struct dentry *dentry,struct qstr *a,struct qstr *b) 284isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
285 const struct dentry *dentry, const struct inode *inode,
286 unsigned int len, const char *str, const struct qstr *name)
276{ 287{
277 return isofs_dentry_cmpi_common(dentry, a, b, 0); 288 return isofs_dentry_cmp_common(len, str, name, 0, 1);
278} 289}
279 290
280#ifdef CONFIG_JOLIET 291#ifdef CONFIG_JOLIET
281static int 292static int
282isofs_hash_ms(struct dentry *dentry, struct qstr *qstr) 293isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
294 struct qstr *qstr)
283{ 295{
284 return isofs_hash_common(dentry, qstr, 1); 296 return isofs_hash_common(dentry, qstr, 1);
285} 297}
286 298
287static int 299static int
288isofs_hashi_ms(struct dentry *dentry, struct qstr *qstr) 300isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
301 struct qstr *qstr)
289{ 302{
290 return isofs_hashi_common(dentry, qstr, 1); 303 return isofs_hashi_common(dentry, qstr, 1);
291} 304}
292 305
293static int 306static int
294isofs_dentry_cmp_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 307isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
308 const struct dentry *dentry, const struct inode *inode,
309 unsigned int len, const char *str, const struct qstr *name)
295{ 310{
296 return isofs_dentry_cmp_common(dentry, a, b, 1); 311 return isofs_dentry_cmp_common(len, str, name, 1, 0);
297} 312}
298 313
299static int 314static int
300isofs_dentry_cmpi_ms(struct dentry *dentry,struct qstr *a,struct qstr *b) 315isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
316 const struct dentry *dentry, const struct inode *inode,
317 unsigned int len, const char *str, const struct qstr *name)
301{ 318{
302 return isofs_dentry_cmpi_common(dentry, a, b, 1); 319 return isofs_dentry_cmp_common(len, str, name, 1, 1);
303} 320}
304#endif 321#endif
305 322
@@ -932,7 +949,7 @@ root_found:
932 table += 2; 949 table += 2;
933 if (opt.check == 'r') 950 if (opt.check == 'r')
934 table++; 951 table++;
935 s->s_root->d_op = &isofs_dentry_ops[table]; 952 d_set_d_op(s->s_root, &isofs_dentry_ops[table]);
936 953
937 kfree(opt.iocharset); 954 kfree(opt.iocharset);
938 955
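
The isofs conversion shows the new dentry-operation prototypes this series introduces: ->d_hash and ->d_compare receive const dentry/inode pointers, and the name being compared arrives as an explicit (len, str) pair so the routines can run under rcu-walk without touching the dentry itself. A hypothetical case-insensitive filesystem would wire up the same shape like this (the ci_* names are illustrative only):

#include <linux/ctype.h>
#include <linux/dcache.h>
#include <linux/string.h>

static int ci_hash(const struct dentry *dentry, const struct inode *inode,
		   struct qstr *q)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	for (i = 0; i < q->len; i++)
		hash = partial_name_hash(tolower(q->name[i]), hash);
	q->hash = end_name_hash(hash);
	return 0;
}

static int ci_compare(const struct dentry *parent, const struct inode *pinode,
		      const struct dentry *dentry, const struct inode *inode,
		      unsigned int len, const char *str, const struct qstr *name)
{
	/* (len, str) is the dentry's stored name; "name" is the one being looked up */
	if (len != name->len)
		return 1;
	return strnicmp(str, name->name, len) ? 1 : 0;
}

static const struct dentry_operations ci_dentry_ops = {
	.d_hash		= ci_hash,
	.d_compare	= ci_compare,
};

A filesystem installs these per dentry with d_set_d_op(), as the ->lookup hunk in fs/isofs/namei.c below does.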
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 0d23abfd4280..679a849c3b27 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,7 +37,8 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(dentry, &dentry->d_name, &qstr); 40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
41 dentry->d_name.len, dentry->d_name.name, &qstr);
41} 42}
42 43
43/* 44/*
@@ -171,7 +172,7 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
171 struct inode *inode; 172 struct inode *inode;
172 struct page *page; 173 struct page *page;
173 174
174 dentry->d_op = dir->i_sb->s_root->d_op; 175 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
175 176
176 page = alloc_page(GFP_USER); 177 page = alloc_page(GFP_USER);
177 if (!page) 178 if (!page)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c590d155c095..f837ba953529 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
899 899
900 /* journal descriptor can store up to n blocks -bzzz */ 900 /* journal descriptor can store up to n blocks -bzzz */
901 journal->j_blocksize = blocksize; 901 journal->j_blocksize = blocksize;
902 journal->j_dev = bdev;
903 journal->j_fs_dev = fs_dev;
904 journal->j_blk_offset = start;
905 journal->j_maxlen = len;
906 bdevname(journal->j_dev, journal->j_devname);
907 p = journal->j_devname;
908 while ((p = strchr(p, '/')))
909 *p = '!';
902 jbd2_stats_proc_init(journal); 910 jbd2_stats_proc_init(journal);
903 n = journal->j_blocksize / sizeof(journal_block_tag_t); 911 n = journal->j_blocksize / sizeof(journal_block_tag_t);
904 journal->j_wbufsize = n; 912 journal->j_wbufsize = n;
@@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
908 __func__); 916 __func__);
909 goto out_err; 917 goto out_err;
910 } 918 }
911 journal->j_dev = bdev;
912 journal->j_fs_dev = fs_dev;
913 journal->j_blk_offset = start;
914 journal->j_maxlen = len;
915 bdevname(journal->j_dev, journal->j_devname);
916 p = journal->j_devname;
917 while ((p = strchr(p, '/')))
918 *p = '!';
919 919
920 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 920 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
921 if (!bh) { 921 if (!bh) {
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 54a92fd02bbd..95b79672150a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -259,11 +259,14 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_check_acl(struct inode *inode, int mask) 262int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
266 266
267 if (flags & IPERM_FLAG_RCU)
268 return -ECHILD;
269
267 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); 270 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
268 if (IS_ERR(acl)) 271 if (IS_ERR(acl))
269 return PTR_ERR(acl); 272 return PTR_ERR(acl);
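
Every ->check_acl conversion in this series has the same shape: the hook gains a flags argument and bails out with -ECHILD when invoked in rcu-walk context, because fetching an ACL may block; the VFS then retries in ref-walk mode. Sketched generically (myfs_get_acl is a stand-in for the filesystem's own ACL fetch, as in jffs2 above):

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>

struct posix_acl *myfs_get_acl(struct inode *inode, int type);	/* fs-specific, may block */

static int myfs_check_acl(struct inode *inode, int mask, unsigned int flags)
{
	struct posix_acl *acl;
	int rc;

	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;		/* cannot sleep under rcu-walk */

	acl = myfs_get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		rc = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return rc;
	}
	return -EAGAIN;			/* no ACL: fall back to the mode bits */
}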
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 5e42de8d9541..3119f59253d3 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_check_acl(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int, unsigned int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index c86041b866a4..853b8e300084 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -40,11 +40,18 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
40 return &f->vfs_inode; 40 return &f->vfs_inode;
41} 41}
42 42
43static void jffs2_destroy_inode(struct inode *inode) 43static void jffs2_i_callback(struct rcu_head *head)
44{ 44{
45 struct inode *inode = container_of(head, struct inode, i_rcu);
46 INIT_LIST_HEAD(&inode->i_dentry);
45 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); 47 kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
46} 48}
47 49
50static void jffs2_destroy_inode(struct inode *inode)
51{
52 call_rcu(&inode->i_rcu, jffs2_i_callback);
53}
54
48static void jffs2_i_init_once(void *foo) 55static void jffs2_i_init_once(void *foo)
49{ 56{
50 struct jffs2_inode_info *f = foo; 57 struct jffs2_inode_info *f = foo;
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 1057a4998e4e..e5de9422fa32 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,10 +114,14 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask, unsigned int flags)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl;
120
121 if (flags & IPERM_FLAG_RCU)
122 return -ECHILD;
120 123
124 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
121 if (IS_ERR(acl)) 125 if (IS_ERR(acl))
122 return PTR_ERR(acl); 126 return PTR_ERR(acl);
123 if (acl) { 127 if (acl) {
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 54e07559878d..f9285c4900fa 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int, unsigned int flags);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode); 25int jfs_acl_chmod(struct inode *inode);
26 26
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 231ca4af9bce..4414e3a42264 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include <linux/exportfs.h> 24#include <linux/exportfs.h>
@@ -1465,7 +1466,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1465 jfs_info("jfs_lookup: name = %s", name); 1466 jfs_info("jfs_lookup: name = %s", name);
1466 1467
1467 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2) 1468 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1468 dentry->d_op = &jfs_ci_dentry_operations; 1469 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1469 1470
1470 if ((name[0] == '.') && (len == 1)) 1471 if ((name[0] == '.') && (len == 1))
1471 inum = dip->i_ino; 1472 inum = dip->i_ino;
@@ -1494,7 +1495,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1494 dentry = d_splice_alias(ip, dentry); 1495 dentry = d_splice_alias(ip, dentry);
1495 1496
1496 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)) 1497 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1497 dentry->d_op = &jfs_ci_dentry_operations; 1498 d_set_d_op(dentry, &jfs_ci_dentry_operations);
1498 1499
1499 return dentry; 1500 return dentry;
1500} 1501}
@@ -1573,7 +1574,8 @@ const struct file_operations jfs_dir_operations = {
1573 .llseek = generic_file_llseek, 1574 .llseek = generic_file_llseek,
1574}; 1575};
1575 1576
1576static int jfs_ci_hash(struct dentry *dir, struct qstr *this) 1577static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
1578 struct qstr *this)
1577{ 1579{
1578 unsigned long hash; 1580 unsigned long hash;
1579 int i; 1581 int i;
@@ -1586,32 +1588,63 @@ static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
1586 return 0; 1588 return 0;
1587} 1589}
1588 1590
1589static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) 1591static int jfs_ci_compare(const struct dentry *parent,
1592 const struct inode *pinode,
1593 const struct dentry *dentry, const struct inode *inode,
1594 unsigned int len, const char *str, const struct qstr *name)
1590{ 1595{
1591 int i, result = 1; 1596 int i, result = 1;
1592 1597
1593 if (a->len != b->len) 1598 if (len != name->len)
1594 goto out; 1599 goto out;
1595 for (i=0; i < a->len; i++) { 1600 for (i=0; i < len; i++) {
1596 if (tolower(a->name[i]) != tolower(b->name[i])) 1601 if (tolower(str[i]) != tolower(name->name[i]))
1597 goto out; 1602 goto out;
1598 } 1603 }
1599 result = 0; 1604 result = 0;
1605out:
1606 return result;
1607}
1600 1608
1609static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1610{
1611 if (nd->flags & LOOKUP_RCU)
1612 return -ECHILD;
1601 /* 1613 /*
1602 * We want creates to preserve case. A negative dentry, a, that 1614 * This is not negative dentry. Always valid.
1603 * has a different case than b may cause a new entry to be created 1615 *
1604 * with the wrong case. Since we can't tell if a comes from a negative 1616 * Note, rename() to existing directory entry will have ->d_inode,
1605 * dentry, we blindly replace it with b. This should be harmless if 1617 * and will use existing name which isn't specified name by user.
1606 * a is not a negative dentry. 1618 *
1619 * We may be able to drop this positive dentry here. But dropping
1620 * positive dentry isn't good idea. So it's unsupported like
1621 * rename("filename", "FILENAME") for now.
1607 */ 1622 */
1608 memcpy((unsigned char *)a->name, b->name, a->len); 1623 if (dentry->d_inode)
1609out: 1624 return 1;
1610 return result; 1625
1626 /*
1627 * This may be nfsd (or something), anyway, we can't see the
1628 * intent of this. So, since this can be for creation, drop it.
1629 */
1630 if (!nd)
1631 return 0;
1632
1633 /*
1634 * Drop the negative dentry, in order to make sure to use the
1635 * case sensitive name which is specified by user if this is
1636 * for creation.
1637 */
1638 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
1639 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
1640 return 0;
1641 }
1642 return 1;
1611} 1643}
1612 1644
1613const struct dentry_operations jfs_ci_dentry_operations = 1645const struct dentry_operations jfs_ci_dentry_operations =
1614{ 1646{
1615 .d_hash = jfs_ci_hash, 1647 .d_hash = jfs_ci_hash,
1616 .d_compare = jfs_ci_compare, 1648 .d_compare = jfs_ci_compare,
1649 .d_revalidate = jfs_ci_revalidate,
1617}; 1650};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0669fc1cc3bf..3150d766e0d4 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -115,6 +115,14 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
115 return &jfs_inode->vfs_inode; 115 return &jfs_inode->vfs_inode;
116} 116}
117 117
118static void jfs_i_callback(struct rcu_head *head)
119{
120 struct inode *inode = container_of(head, struct inode, i_rcu);
121 struct jfs_inode_info *ji = JFS_IP(inode);
122 INIT_LIST_HEAD(&inode->i_dentry);
123 kmem_cache_free(jfs_inode_cachep, ji);
124}
125
118static void jfs_destroy_inode(struct inode *inode) 126static void jfs_destroy_inode(struct inode *inode)
119{ 127{
120 struct jfs_inode_info *ji = JFS_IP(inode); 128 struct jfs_inode_info *ji = JFS_IP(inode);
@@ -128,7 +136,7 @@ static void jfs_destroy_inode(struct inode *inode)
128 ji->active_ag = -1; 136 ji->active_ag = -1;
129 } 137 }
130 spin_unlock_irq(&ji->ag_lock); 138 spin_unlock_irq(&ji->ag_lock);
131 kmem_cache_free(jfs_inode_cachep, ji); 139 call_rcu(&inode->i_rcu, jfs_i_callback);
132} 140}
133 141
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 142static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -517,7 +525,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
517 goto out_no_root; 525 goto out_no_root;
518 526
519 if (sbi->mntflag & JFS_OS2) 527 if (sbi->mntflag & JFS_OS2)
520 sb->s_root->d_op = &jfs_ci_dentry_operations; 528 d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
521 529
522 /* logical blocks are represented by 40 bits in pxd_t, etc. */ 530 /* logical blocks are represented by 40 bits in pxd_t, etc. */
523 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; 531 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
diff --git a/fs/libfs.c b/fs/libfs.c
index a3accdf528ad..889311e3d06b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,6 +16,11 @@
16 16
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19static inline int simple_positive(struct dentry *dentry)
20{
21 return dentry->d_inode && !d_unhashed(dentry);
22}
23
19int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, 24int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
20 struct kstat *stat) 25 struct kstat *stat)
21{ 26{
@@ -37,7 +42,7 @@ int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
37 * Retaining negative dentries for an in-memory filesystem just wastes 42 * Retaining negative dentries for an in-memory filesystem just wastes
38 * memory and lookup time: arrange for them to be deleted immediately. 43 * memory and lookup time: arrange for them to be deleted immediately.
39 */ 44 */
40static int simple_delete_dentry(struct dentry *dentry) 45static int simple_delete_dentry(const struct dentry *dentry)
41{ 46{
42 return 1; 47 return 1;
43} 48}
@@ -54,7 +59,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
54 59
55 if (dentry->d_name.len > NAME_MAX) 60 if (dentry->d_name.len > NAME_MAX)
56 return ERR_PTR(-ENAMETOOLONG); 61 return ERR_PTR(-ENAMETOOLONG);
57 dentry->d_op = &simple_dentry_operations; 62 d_set_d_op(dentry, &simple_dentry_operations);
58 d_add(dentry, NULL); 63 d_add(dentry, NULL);
59 return NULL; 64 return NULL;
60} 65}
@@ -76,7 +81,8 @@ int dcache_dir_close(struct inode *inode, struct file *file)
76 81
77loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 82loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
78{ 83{
79 mutex_lock(&file->f_path.dentry->d_inode->i_mutex); 84 struct dentry *dentry = file->f_path.dentry;
85 mutex_lock(&dentry->d_inode->i_mutex);
80 switch (origin) { 86 switch (origin) {
81 case 1: 87 case 1:
82 offset += file->f_pos; 88 offset += file->f_pos;
@@ -84,7 +90,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
84 if (offset >= 0) 90 if (offset >= 0)
85 break; 91 break;
86 default: 92 default:
87 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 93 mutex_unlock(&dentry->d_inode->i_mutex);
88 return -EINVAL; 94 return -EINVAL;
89 } 95 }
90 if (offset != file->f_pos) { 96 if (offset != file->f_pos) {
@@ -94,21 +100,24 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin)
94 struct dentry *cursor = file->private_data; 100 struct dentry *cursor = file->private_data;
95 loff_t n = file->f_pos - 2; 101 loff_t n = file->f_pos - 2;
96 102
97 spin_lock(&dcache_lock); 103 spin_lock(&dentry->d_lock);
104 /* d_lock not required for cursor */
98 list_del(&cursor->d_u.d_child); 105 list_del(&cursor->d_u.d_child);
99 p = file->f_path.dentry->d_subdirs.next; 106 p = dentry->d_subdirs.next;
100 while (n && p != &file->f_path.dentry->d_subdirs) { 107 while (n && p != &dentry->d_subdirs) {
101 struct dentry *next; 108 struct dentry *next;
102 next = list_entry(p, struct dentry, d_u.d_child); 109 next = list_entry(p, struct dentry, d_u.d_child);
103 if (!d_unhashed(next) && next->d_inode) 110 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
111 if (simple_positive(next))
104 n--; 112 n--;
113 spin_unlock(&next->d_lock);
105 p = p->next; 114 p = p->next;
106 } 115 }
107 list_add_tail(&cursor->d_u.d_child, p); 116 list_add_tail(&cursor->d_u.d_child, p);
108 spin_unlock(&dcache_lock); 117 spin_unlock(&dentry->d_lock);
109 } 118 }
110 } 119 }
111 mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); 120 mutex_unlock(&dentry->d_inode->i_mutex);
112 return offset; 121 return offset;
113} 122}
114 123
@@ -148,29 +157,35 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
148 i++; 157 i++;
149 /* fallthrough */ 158 /* fallthrough */
150 default: 159 default:
151 spin_lock(&dcache_lock); 160 spin_lock(&dentry->d_lock);
152 if (filp->f_pos == 2) 161 if (filp->f_pos == 2)
153 list_move(q, &dentry->d_subdirs); 162 list_move(q, &dentry->d_subdirs);
154 163
155 for (p=q->next; p != &dentry->d_subdirs; p=p->next) { 164 for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
156 struct dentry *next; 165 struct dentry *next;
157 next = list_entry(p, struct dentry, d_u.d_child); 166 next = list_entry(p, struct dentry, d_u.d_child);
158 if (d_unhashed(next) || !next->d_inode) 167 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 if (!simple_positive(next)) {
169 spin_unlock(&next->d_lock);
159 continue; 170 continue;
171 }
160 172
161 spin_unlock(&dcache_lock); 173 spin_unlock(&next->d_lock);
174 spin_unlock(&dentry->d_lock);
162 if (filldir(dirent, next->d_name.name, 175 if (filldir(dirent, next->d_name.name,
163 next->d_name.len, filp->f_pos, 176 next->d_name.len, filp->f_pos,
164 next->d_inode->i_ino, 177 next->d_inode->i_ino,
165 dt_type(next->d_inode)) < 0) 178 dt_type(next->d_inode)) < 0)
166 return 0; 179 return 0;
167 spin_lock(&dcache_lock); 180 spin_lock(&dentry->d_lock);
181 spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
168 /* next is still alive */ 182 /* next is still alive */
169 list_move(q, p); 183 list_move(q, p);
184 spin_unlock(&next->d_lock);
170 p = q; 185 p = q;
171 filp->f_pos++; 186 filp->f_pos++;
172 } 187 }
173 spin_unlock(&dcache_lock); 188 spin_unlock(&dentry->d_lock);
174 } 189 }
175 return 0; 190 return 0;
176} 191}
@@ -259,23 +274,23 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
259 return 0; 274 return 0;
260} 275}
261 276
262static inline int simple_positive(struct dentry *dentry)
263{
264 return dentry->d_inode && !d_unhashed(dentry);
265}
266
267int simple_empty(struct dentry *dentry) 277int simple_empty(struct dentry *dentry)
268{ 278{
269 struct dentry *child; 279 struct dentry *child;
270 int ret = 0; 280 int ret = 0;
271 281
272 spin_lock(&dcache_lock); 282 spin_lock(&dentry->d_lock);
273 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) 283 list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) {
274 if (simple_positive(child)) 284 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
285 if (simple_positive(child)) {
286 spin_unlock(&child->d_lock);
275 goto out; 287 goto out;
288 }
289 spin_unlock(&child->d_lock);
290 }
276 ret = 1; 291 ret = 1;
277out: 292out:
278 spin_unlock(&dcache_lock); 293 spin_unlock(&dentry->d_lock);
279 return ret; 294 return ret;
280} 295}
281 296
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index d5bb86866e6c..25509eb28fd7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,7 +14,6 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 16#include <linux/lockd/lockd.h>
17#include <linux/smp_lock.h>
18#include <linux/kthread.h> 17#include <linux/kthread.h>
19 18
20#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 47ea1e1925b8..332c54cf75e0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -7,7 +7,6 @@
7 */ 7 */
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 25e21e4023b2..ed0c59fe23ce 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
124 continue; 124 continue;
125 if (host->h_server != ni->server) 125 if (host->h_server != ni->server)
126 continue; 126 continue;
127 if (ni->server && 127 if (ni->server && ni->src_len != 0 &&
128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) 128 !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
129 continue; 129 continue;
130 130
@@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
167 host->h_addrlen = ni->salen; 167 host->h_addrlen = ni->salen;
168 rpc_set_port(nlm_addr(host), 0); 168 rpc_set_port(nlm_addr(host), 0);
169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
170 host->h_srcaddrlen = ni->src_len;
170 host->h_version = ni->version; 171 host->h_version = ni->version;
171 host->h_proto = ni->protocol; 172 host->h_proto = ni->protocol;
172 host->h_rpcclnt = NULL; 173 host->h_rpcclnt = NULL;
@@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
238 const char *hostname, 239 const char *hostname,
239 int noresvport) 240 int noresvport)
240{ 241{
241 const struct sockaddr source = {
242 .sa_family = AF_UNSPEC,
243 };
244 struct nlm_lookup_host_info ni = { 242 struct nlm_lookup_host_info ni = {
245 .server = 0, 243 .server = 0,
246 .sap = sap, 244 .sap = sap,
@@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
249 .version = version, 247 .version = version,
250 .hostname = hostname, 248 .hostname = hostname,
251 .hostname_len = strlen(hostname), 249 .hostname_len = strlen(hostname),
252 .src_sap = &source,
253 .src_len = sizeof(source),
254 .noresvport = noresvport, 250 .noresvport = noresvport,
255 }; 251 };
256 252
@@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host)
357 .protocol = host->h_proto, 353 .protocol = host->h_proto,
358 .address = nlm_addr(host), 354 .address = nlm_addr(host),
359 .addrsize = host->h_addrlen, 355 .addrsize = host->h_addrlen,
360 .saddress = nlm_srcaddr(host),
361 .timeout = &timeparms, 356 .timeout = &timeparms,
362 .servername = host->h_name, 357 .servername = host->h_name,
363 .program = &nlm_program, 358 .program = &nlm_program,
@@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host)
376 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 371 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
377 if (host->h_noresvport) 372 if (host->h_noresvport)
378 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 373 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
374 if (host->h_srcaddrlen)
375 args.saddress = nlm_srcaddr(host);
379 376
380 clnt = rpc_create(&args); 377 clnt = rpc_create(&args);
381 if (!IS_ERR(clnt)) 378 if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a336e832475d..38d261192453 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c462d346acbd..ef5659b211e9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/smp_lock.h>
29#include <linux/sunrpc/clnt.h> 28#include <linux/sunrpc/clnt.h>
30#include <linux/sunrpc/svc.h> 29#include <linux/sunrpc/svc.h>
31#include <linux/lockd/nlm.h> 30#include <linux/lockd/nlm.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index c3069f38d602..0caea5310ac3 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/smp_lock.h>
13#include <linux/lockd/lockd.h> 12#include <linux/lockd/lockd.h>
14#include <linux/lockd/share.h> 13#include <linux/lockd/share.h>
15 14
diff --git a/fs/locks.c b/fs/locks.c
index 65765cb6afed..08415b2a6d36 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -122,7 +122,6 @@
122#include <linux/module.h> 122#include <linux/module.h>
123#include <linux/security.h> 123#include <linux/security.h>
124#include <linux/slab.h> 124#include <linux/slab.h>
125#include <linux/smp_lock.h>
126#include <linux/syscalls.h> 125#include <linux/syscalls.h>
127#include <linux/time.h> 126#include <linux/time.h>
128#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
@@ -1390,7 +1389,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1390 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1389 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1391 goto out; 1390 goto out;
1392 if ((arg == F_WRLCK) 1391 if ((arg == F_WRLCK)
1393 && ((atomic_read(&dentry->d_count) > 1) 1392 && ((dentry->d_count > 1)
1394 || (atomic_read(&inode->i_count) > 1))) 1393 || (atomic_read(&inode->i_count) > 1)))
1395 goto out; 1394 goto out;
1396 } 1395 }
@@ -1504,9 +1503,8 @@ static int do_fcntl_delete_lease(struct file *filp)
1504 1503
1505static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) 1504static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1506{ 1505{
1507 struct file_lock *fl; 1506 struct file_lock *fl, *ret;
1508 struct fasync_struct *new; 1507 struct fasync_struct *new;
1509 struct inode *inode = filp->f_path.dentry->d_inode;
1510 int error; 1508 int error;
1511 1509
1512 fl = lease_alloc(filp, arg); 1510 fl = lease_alloc(filp, arg);
@@ -1518,13 +1516,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1518 locks_free_lock(fl); 1516 locks_free_lock(fl);
1519 return -ENOMEM; 1517 return -ENOMEM;
1520 } 1518 }
1519 ret = fl;
1521 lock_flocks(); 1520 lock_flocks();
1522 error = __vfs_setlease(filp, arg, &fl); 1521 error = __vfs_setlease(filp, arg, &ret);
1523 if (error) { 1522 if (error) {
1524 unlock_flocks(); 1523 unlock_flocks();
1525 locks_free_lock(fl); 1524 locks_free_lock(fl);
1526 goto out_free_fasync; 1525 goto out_free_fasync;
1527 } 1526 }
1527 if (ret != fl)
1528 locks_free_lock(fl);
1528 1529
1529 /* 1530 /*
1530 * fasync_insert_entry() returns the old entry if any. 1531 * fasync_insert_entry() returns the old entry if any.
@@ -1532,17 +1533,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1532 * inserted it into the fasync list. Clear new so that 1533 * inserted it into the fasync list. Clear new so that
1533 * we don't release it here. 1534 * we don't release it here.
1534 */ 1535 */
1535 if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) 1536 if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
1536 new = NULL; 1537 new = NULL;
1537 1538
1538 if (error < 0) { 1539 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1539 /* remove lease just inserted by setlease */
1540 fl->fl_type = F_UNLCK | F_INPROGRESS;
1541 fl->fl_break_time = jiffies - 10;
1542 time_out_leases(inode);
1543 } else {
1544 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1545 }
1546 unlock_flocks(); 1540 unlock_flocks();
1547 1541
1548out_free_fasync: 1542out_free_fasync:
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 409dfd65e9a1..f9ddf0c388c8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -555,9 +555,11 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
555 return __logfs_create(dir, dentry, inode, target, destlen); 555 return __logfs_create(dir, dentry, inode, target, destlen);
556} 556}
557 557
558static int logfs_permission(struct inode *inode, int mask) 558static int logfs_permission(struct inode *inode, int mask, unsigned int flags)
559{ 559{
560 return generic_permission(inode, mask, NULL); 560 if (flags & IPERM_FLAG_RCU)
561 return -ECHILD;
562 return generic_permission(inode, mask, flags, NULL);
561} 563}
562 564
563static int logfs_link(struct dentry *old_dentry, struct inode *dir, 565static int logfs_link(struct dentry *old_dentry, struct inode *dir,
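
logfs takes the simplest correct route for the new ->permission(inode, mask, flags) signature: refuse rcu-walk outright, let the VFS call back in ref-walk mode, and defer to generic_permission() with the extra flags argument. A hedged sketch of that minimal form (myfs_permission is an invented name); a filesystem whose check never sleeps could instead keep servicing the request under rcu-walk:

#include <linux/errno.h>
#include <linux/fs.h>

static int myfs_permission(struct inode *inode, int mask, unsigned int flags)
{
	/*
	 * Anything that might sleep (reading ACL blocks, contacting a
	 * server, ...) is forbidden under rcu-walk; returning -ECHILD
	 * makes the VFS retry this permission check in ref-walk mode.
	 */
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;

	/* ref-walk: plain POSIX mode-bit checking, no private ACL callback */
	return generic_permission(inode, mask, flags, NULL);
}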
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index d8c71ece098f..03b8c240aeda 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -141,13 +141,20 @@ struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
141 return __logfs_iget(sb, ino); 141 return __logfs_iget(sb, ino);
142} 142}
143 143
144static void logfs_i_callback(struct rcu_head *head)
145{
146 struct inode *inode = container_of(head, struct inode, i_rcu);
147 INIT_LIST_HEAD(&inode->i_dentry);
148 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
149}
150
144static void __logfs_destroy_inode(struct inode *inode) 151static void __logfs_destroy_inode(struct inode *inode)
145{ 152{
146 struct logfs_inode *li = logfs_inode(inode); 153 struct logfs_inode *li = logfs_inode(inode);
147 154
148 BUG_ON(li->li_block); 155 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list); 156 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li); 157 call_rcu(&inode->i_rcu, logfs_i_callback);
151} 158}
152 159
153static void logfs_destroy_inode(struct inode *inode) 160static void logfs_destroy_inode(struct inode *inode)
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index f46ee8b0e135..9da29706f91c 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
828 super->s_journal_seg[i] = segno; 828 super->s_journal_seg[i] = segno;
829 super->s_journal_ec[i] = ec; 829 super->s_journal_ec[i] = ec;
830 logfs_set_segment_reserved(sb, segno); 830 logfs_set_segment_reserved(sb, segno);
831 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); 831 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
832 BUG_ON(err); /* mempool should prevent this */ 832 BUG_ON(err); /* mempool should prevent this */
833 err = logfs_erase_segment(sb, segno, 1); 833 err = logfs_erase_segment(sb, segno, 1);
834 BUG_ON(err); /* FIXME: remount-ro would be nicer */ 834 BUG_ON(err); /* FIXME: remount-ro would be nicer */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 6127baf0e188..ee99a9f5dfd3 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode)
1994 1994
1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */ 1995 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1996 err = logfs_write_buf(master_inode, page, 0); 1996 err = logfs_write_buf(master_inode, page, 0);
1997 if (err)
1998 move_page_to_inode(inode, page);
1999
1997 logfs_put_write_page(page); 2000 logfs_put_write_page(page);
1998 return err; 2001 return err;
1999} 2002}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index fb2020858a34..ae0b83f476a6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,11 +68,18 @@ static struct inode *minix_alloc_inode(struct super_block *sb)
68 return &ei->vfs_inode; 68 return &ei->vfs_inode;
69} 69}
70 70
71static void minix_destroy_inode(struct inode *inode) 71static void minix_i_callback(struct rcu_head *head)
72{ 72{
73 struct inode *inode = container_of(head, struct inode, i_rcu);
74 INIT_LIST_HEAD(&inode->i_dentry);
73 kmem_cache_free(minix_inode_cachep, minix_i(inode)); 75 kmem_cache_free(minix_inode_cachep, minix_i(inode));
74} 76}
75 77
78static void minix_destroy_inode(struct inode *inode)
79{
80 call_rcu(&inode->i_rcu, minix_i_callback);
81}
82
76static void init_once(void *foo) 83static void init_once(void *foo)
77{ 84{
78 struct minix_inode_info *ei = (struct minix_inode_info *) foo; 85 struct minix_inode_info *ei = (struct minix_inode_info *) foo;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index c0d35a3accef..1b9e07728a9f 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,7 +23,7 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
23 struct inode * inode = NULL; 23 struct inode * inode = NULL;
24 ino_t ino; 24 ino_t ino;
25 25
26 dentry->d_op = dir->i_sb->s_root->d_op; 26 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
27 27
28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) 28 if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
29 return ERR_PTR(-ENAMETOOLONG); 29 return ERR_PTR(-ENAMETOOLONG);
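
All of the open-coded dentry->d_op assignments in this series (isofs, jfs, minix, libfs) become d_set_d_op() calls, which besides setting the pointer let the dcache note in the dentry's flags which operations exist, so the lookup fast path can test flags instead of dereferencing d_op. In a ->lookup the converted pattern looks roughly like this (the myfs_* names are placeholders):

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/namei.h>

extern const struct dentry_operations myfs_dentry_ops;	/* placeholder */

static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct inode *inode = NULL;

	d_set_d_op(dentry, &myfs_dentry_ops);	/* was: dentry->d_op = &myfs_dentry_ops */

	/* ... resolve dentry->d_name in @dir, setting @inode if the name exists ... */

	return d_splice_alias(inode, dentry);
}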
diff --git a/fs/namei.c b/fs/namei.c
index 5362af9b7372..19433cdba011 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,8 +169,8 @@ EXPORT_SYMBOL(putname);
169/* 169/*
170 * This does basic POSIX ACL permission checking 170 * This does basic POSIX ACL permission checking
171 */ 171 */
172static int acl_permission_check(struct inode *inode, int mask, 172static int acl_permission_check(struct inode *inode, int mask, unsigned int flags,
173 int (*check_acl)(struct inode *inode, int mask)) 173 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
174{ 174{
175 umode_t mode = inode->i_mode; 175 umode_t mode = inode->i_mode;
176 176
@@ -180,7 +180,7 @@ static int acl_permission_check(struct inode *inode, int mask,
180 mode >>= 6; 180 mode >>= 6;
181 else { 181 else {
182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 182 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183 int error = check_acl(inode, mask); 183 int error = check_acl(inode, mask, flags);
184 if (error != -EAGAIN) 184 if (error != -EAGAIN)
185 return error; 185 return error;
186 } 186 }
@@ -198,25 +198,30 @@ static int acl_permission_check(struct inode *inode, int mask,
198} 198}
199 199
200/** 200/**
201 * generic_permission - check for access rights on a Posix-like filesystem 201 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for 202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs 204 * @check_acl: optional callback to check for Posix ACLs
205 * @flags IPERM_FLAG_ flags.
205 * 206 *
206 * Used to check for read/write/execute permissions on a file. 207 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions 208 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which 209 * for filesystem access without changing the "normal" uids which
209 * are used for other things.. 210 * are used for other things.
211 *
212 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 213 * request cannot be satisfied (e.g. requires blocking or too much complexity).
214 * It would then be called again in ref-walk mode.
210 */ 215 */
211int generic_permission(struct inode *inode, int mask, 216int generic_permission(struct inode *inode, int mask, unsigned int flags,
212 int (*check_acl)(struct inode *inode, int mask)) 217 int (*check_acl)(struct inode *inode, int mask, unsigned int flags))
213{ 218{
214 int ret; 219 int ret;
215 220
216 /* 221 /*
217 * Do the basic POSIX ACL permission checks. 222 * Do the basic POSIX ACL permission checks.
218 */ 223 */
219 ret = acl_permission_check(inode, mask, check_acl); 224 ret = acl_permission_check(inode, mask, flags, check_acl);
220 if (ret != -EACCES) 225 if (ret != -EACCES)
221 return ret; 226 return ret;
222 227
@@ -271,9 +276,10 @@ int inode_permission(struct inode *inode, int mask)
271 } 276 }
272 277
273 if (inode->i_op->permission) 278 if (inode->i_op->permission)
274 retval = inode->i_op->permission(inode, mask); 279 retval = inode->i_op->permission(inode, mask, 0);
275 else 280 else
276 retval = generic_permission(inode, mask, inode->i_op->check_acl); 281 retval = generic_permission(inode, mask, 0,
282 inode->i_op->check_acl);
277 283
278 if (retval) 284 if (retval)
279 return retval; 285 return retval;
@@ -362,6 +368,18 @@ void path_get(struct path *path)
362EXPORT_SYMBOL(path_get); 368EXPORT_SYMBOL(path_get);
363 369
364/** 370/**
371 * path_get_long - get a long reference to a path
372 * @path: path to get the reference to
373 *
374 * Given a path increment the reference count to the dentry and the vfsmount.
375 */
376void path_get_long(struct path *path)
377{
378 mntget_long(path->mnt);
379 dget(path->dentry);
380}
381
382/**
365 * path_put - put a reference to a path 383 * path_put - put a reference to a path
366 * @path: path to put the reference to 384 * @path: path to put the reference to
367 * 385 *
@@ -375,6 +393,185 @@ void path_put(struct path *path)
375EXPORT_SYMBOL(path_put); 393EXPORT_SYMBOL(path_put);
376 394
377/** 395/**
396 * path_put_long - put a long reference to a path
397 * @path: path to put the reference to
398 *
399 * Given a path decrement the reference count to the dentry and the vfsmount.
400 */
401void path_put_long(struct path *path)
402{
403 dput(path->dentry);
404 mntput_long(path->mnt);
405}
406
407/**
408 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
409 * @nd: nameidata pathwalk data to drop
 410 * @Returns: 0 on success, -ECHILD on failure
411 *
412 * Path walking has 2 modes, rcu-walk and ref-walk (see
413 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
414 * to drop out of rcu-walk mode and take normal reference counts on dentries
415 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
416 * refcounts at the last known good point before rcu-walk got stuck, so
417 * ref-walk may continue from there. If this is not successful (eg. a seqcount
418 * has changed), then failure is returned and path walk restarts from the
419 * beginning in ref-walk mode.
420 *
421 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
422 * ref-walk. Must be called from rcu-walk context.
423 */
424static int nameidata_drop_rcu(struct nameidata *nd)
425{
426 struct fs_struct *fs = current->fs;
427 struct dentry *dentry = nd->path.dentry;
428
429 BUG_ON(!(nd->flags & LOOKUP_RCU));
430 if (nd->root.mnt) {
431 spin_lock(&fs->lock);
432 if (nd->root.mnt != fs->root.mnt ||
433 nd->root.dentry != fs->root.dentry)
434 goto err_root;
435 }
436 spin_lock(&dentry->d_lock);
437 if (!__d_rcu_to_refcount(dentry, nd->seq))
438 goto err;
439 BUG_ON(nd->inode != dentry->d_inode);
440 spin_unlock(&dentry->d_lock);
441 if (nd->root.mnt) {
442 path_get(&nd->root);
443 spin_unlock(&fs->lock);
444 }
445 mntget(nd->path.mnt);
446
447 rcu_read_unlock();
448 br_read_unlock(vfsmount_lock);
449 nd->flags &= ~LOOKUP_RCU;
450 return 0;
451err:
452 spin_unlock(&dentry->d_lock);
453err_root:
454 if (nd->root.mnt)
455 spin_unlock(&fs->lock);
456 return -ECHILD;
457}
458
459/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
460static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
461{
462 if (nd->flags & LOOKUP_RCU)
463 return nameidata_drop_rcu(nd);
464 return 0;
465}
466
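A note on the pattern above: nameidata_drop_rcu() validates the sampled sequence number under the dentry lock, takes a real reference, and reports -ECHILD so the caller restarts in ref-walk if validation fails. The upgrade step can be modelled outside the kernel; the following is only a minimal user-space sketch of that shape, using a pthread mutex in place of d_lock and a plain counter in place of d_seq (all names here are hypothetical, not the kernel API):

    #include <pthread.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a dentry: a lock, a change counter, a refcount. */
    struct node {
            pthread_mutex_t lock;   /* plays the role of d_lock     */
            unsigned int seq;       /* bumped on every modification */
            int refcount;           /* "ref-walk" style reference   */
    };

    /*
     * Upgrade a lockless traversal to a counted reference.  Returns 0 on
     * success; -1 means the node changed since @seen was sampled and the
     * caller must restart in the slow, reference-counted mode.
     */
    static int upgrade_to_ref(struct node *n, unsigned int seen)
    {
            int ret = -1;

            pthread_mutex_lock(&n->lock);
            if (n->seq == seen) {           /* nothing changed underneath us */
                    n->refcount++;
                    ret = 0;
            }
            pthread_mutex_unlock(&n->lock);
            return ret;
    }

    int main(void)
    {
            struct node n = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
            unsigned int seen = n.seq;      /* sampled during the "fast" walk */

            if (upgrade_to_ref(&n, seen))
                    puts("restart in slow mode");
            else
                    printf("upgraded, refcount=%d\n", n.refcount);
            return 0;
    }

The point of doing the check and the increment under one lock is that the upgrade is atomic with respect to concurrent modification, which is what the kernel code gets from holding d_lock.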
467/**
468 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
469 * @nd: nameidata pathwalk data to drop
470 * @dentry: dentry to drop
471 * @Returns: 0 on success, -ECHILD on failure
472 *
473 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
474 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
475 * @nd. Must be called from rcu-walk context.
476 */
477static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
478{
479 struct fs_struct *fs = current->fs;
480 struct dentry *parent = nd->path.dentry;
481
482 BUG_ON(!(nd->flags & LOOKUP_RCU));
483 if (nd->root.mnt) {
484 spin_lock(&fs->lock);
485 if (nd->root.mnt != fs->root.mnt ||
486 nd->root.dentry != fs->root.dentry)
487 goto err_root;
488 }
489 spin_lock(&parent->d_lock);
490 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
491 if (!__d_rcu_to_refcount(dentry, nd->seq))
492 goto err;
493 /*
494 * If the sequence check on the child dentry passed, then the child has
495 * not been removed from its parent. This means the parent dentry must
496 * be valid and able to take a reference at this point.
497 */
498 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
499 BUG_ON(!parent->d_count);
500 parent->d_count++;
501 spin_unlock(&dentry->d_lock);
502 spin_unlock(&parent->d_lock);
503 if (nd->root.mnt) {
504 path_get(&nd->root);
505 spin_unlock(&fs->lock);
506 }
507 mntget(nd->path.mnt);
508
509 rcu_read_unlock();
510 br_read_unlock(vfsmount_lock);
511 nd->flags &= ~LOOKUP_RCU;
512 return 0;
513err:
514 spin_unlock(&dentry->d_lock);
515 spin_unlock(&parent->d_lock);
516err_root:
517 if (nd->root.mnt)
518 spin_unlock(&fs->lock);
519 return -ECHILD;
520}
521
522/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
523static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
524{
525 if (nd->flags & LOOKUP_RCU)
526 return nameidata_dentry_drop_rcu(nd, dentry);
527 return 0;
528}
529
530/**
531 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
532 * @nd: nameidata pathwalk data to drop
533 * @Returns: 0 on success, -ECHILD on failure
534 *
535 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
536 * nd->path should be the final element of the lookup, so nd->root is discarded.
537 * Must be called from rcu-walk context.
538 */
539static int nameidata_drop_rcu_last(struct nameidata *nd)
540{
541 struct dentry *dentry = nd->path.dentry;
542
543 BUG_ON(!(nd->flags & LOOKUP_RCU));
544 nd->flags &= ~LOOKUP_RCU;
545 nd->root.mnt = NULL;
546 spin_lock(&dentry->d_lock);
547 if (!__d_rcu_to_refcount(dentry, nd->seq))
548 goto err_unlock;
549 BUG_ON(nd->inode != dentry->d_inode);
550 spin_unlock(&dentry->d_lock);
551
552 mntget(nd->path.mnt);
553
554 rcu_read_unlock();
555 br_read_unlock(vfsmount_lock);
556
557 return 0;
558
559err_unlock:
560 spin_unlock(&dentry->d_lock);
561 rcu_read_unlock();
562 br_read_unlock(vfsmount_lock);
563 return -ECHILD;
564}
565
566/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
567static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
568{
569 if (likely(nd->flags & LOOKUP_RCU))
570 return nameidata_drop_rcu_last(nd);
571 return 0;
572}
573
574/**
378 * release_open_intent - free up open intent resources 575 * release_open_intent - free up open intent resources
379 * @nd: pointer to nameidata 576 * @nd: pointer to nameidata
380 */ 577 */
@@ -386,10 +583,26 @@ void release_open_intent(struct nameidata *nd)
386 fput(nd->intent.open.file); 583 fput(nd->intent.open.file);
387} 584}
388 585
586static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
587{
588 int status;
589
590 status = dentry->d_op->d_revalidate(dentry, nd);
591 if (status == -ECHILD) {
592 if (nameidata_dentry_drop_rcu(nd, dentry))
593 return status;
594 status = dentry->d_op->d_revalidate(dentry, nd);
595 }
596
597 return status;
598}
599
389static inline struct dentry * 600static inline struct dentry *
390do_revalidate(struct dentry *dentry, struct nameidata *nd) 601do_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 602{
392 int status = dentry->d_op->d_revalidate(dentry, nd); 603 int status;
604
605 status = d_revalidate(dentry, nd);
393 if (unlikely(status <= 0)) { 606 if (unlikely(status <= 0)) {
394 /* 607 /*
395 * The dentry failed validation. 608 * The dentry failed validation.
@@ -397,19 +610,36 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
397 * the dentry otherwise d_revalidate is asking us 610 * the dentry otherwise d_revalidate is asking us
398 * to return a fail status. 611 * to return a fail status.
399 */ 612 */
400 if (!status) { 613 if (status < 0) {
614 /* If we're in rcu-walk, we don't have a ref */
615 if (!(nd->flags & LOOKUP_RCU))
616 dput(dentry);
617 dentry = ERR_PTR(status);
618
619 } else {
620 /* Don't d_invalidate in rcu-walk mode */
621 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
622 return ERR_PTR(-ECHILD);
401 if (!d_invalidate(dentry)) { 623 if (!d_invalidate(dentry)) {
402 dput(dentry); 624 dput(dentry);
403 dentry = NULL; 625 dentry = NULL;
404 } 626 }
405 } else {
406 dput(dentry);
407 dentry = ERR_PTR(status);
408 } 627 }
409 } 628 }
410 return dentry; 629 return dentry;
411} 630}
412 631
632static inline int need_reval_dot(struct dentry *dentry)
633{
634 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
635 return 0;
636
637 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
638 return 0;
639
640 return 1;
641}
642
413/* 643/*
414 * force_reval_path - force revalidation of a dentry 644 * force_reval_path - force revalidation of a dentry
415 * 645 *
@@ -433,13 +663,12 @@ force_reval_path(struct path *path, struct nameidata *nd)
433 663
434 /* 664 /*
435 * only check on filesystems where it's possible for the dentry to 665 * only check on filesystems where it's possible for the dentry to
436 * become stale. It's assumed that if this flag is set then the 666 * become stale.
437 * d_revalidate op will also be defined.
438 */ 667 */
439 if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) 668 if (!need_reval_dot(dentry))
440 return 0; 669 return 0;
441 670
442 status = dentry->d_op->d_revalidate(dentry, nd); 671 status = d_revalidate(dentry, nd);
443 if (status > 0) 672 if (status > 0)
444 return 0; 673 return 0;
445 674
@@ -459,26 +688,27 @@ force_reval_path(struct path *path, struct nameidata *nd)
459 * short-cut DAC fails, then call ->permission() to do more 688 * short-cut DAC fails, then call ->permission() to do more
460 * complete permission check. 689 * complete permission check.
461 */ 690 */
462static int exec_permission(struct inode *inode) 691static inline int exec_permission(struct inode *inode, unsigned int flags)
463{ 692{
464 int ret; 693 int ret;
465 694
466 if (inode->i_op->permission) { 695 if (inode->i_op->permission) {
467 ret = inode->i_op->permission(inode, MAY_EXEC); 696 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
468 if (!ret) 697 } else {
469 goto ok; 698 ret = acl_permission_check(inode, MAY_EXEC, flags,
470 return ret; 699 inode->i_op->check_acl);
471 } 700 }
472 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); 701 if (likely(!ret))
473 if (!ret)
474 goto ok; 702 goto ok;
703 if (ret == -ECHILD)
704 return ret;
475 705
476 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 706 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
477 goto ok; 707 goto ok;
478 708
479 return ret; 709 return ret;
480ok: 710ok:
481 return security_inode_permission(inode, MAY_EXEC); 711 return security_inode_exec_permission(inode, flags);
482} 712}
483 713
484static __always_inline void set_root(struct nameidata *nd) 714static __always_inline void set_root(struct nameidata *nd)
@@ -489,8 +719,23 @@ static __always_inline void set_root(struct nameidata *nd)
489 719
490static int link_path_walk(const char *, struct nameidata *); 720static int link_path_walk(const char *, struct nameidata *);
491 721
722static __always_inline void set_root_rcu(struct nameidata *nd)
723{
724 if (!nd->root.mnt) {
725 struct fs_struct *fs = current->fs;
726 unsigned seq;
727
728 do {
729 seq = read_seqcount_begin(&fs->seq);
730 nd->root = fs->root;
731 } while (read_seqcount_retry(&fs->seq, seq));
732 }
733}
734
492static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 735static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
493{ 736{
737 int ret;
738
494 if (IS_ERR(link)) 739 if (IS_ERR(link))
495 goto fail; 740 goto fail;
496 741
@@ -500,8 +745,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
500 nd->path = nd->root; 745 nd->path = nd->root;
501 path_get(&nd->root); 746 path_get(&nd->root);
502 } 747 }
748 nd->inode = nd->path.dentry->d_inode;
503 749
504 return link_path_walk(link, nd); 750 ret = link_path_walk(link, nd);
751 return ret;
505fail: 752fail:
506 path_put(&nd->path); 753 path_put(&nd->path);
507 return PTR_ERR(link); 754 return PTR_ERR(link);
@@ -516,11 +763,12 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
516 763
517static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 764static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
518{ 765{
519 dput(nd->path.dentry); 766 if (!(nd->flags & LOOKUP_RCU)) {
520 if (nd->path.mnt != path->mnt) { 767 dput(nd->path.dentry);
521 mntput(nd->path.mnt); 768 if (nd->path.mnt != path->mnt)
522 nd->path.mnt = path->mnt; 769 mntput(nd->path.mnt);
523 } 770 }
771 nd->path.mnt = path->mnt;
524 nd->path.dentry = path->dentry; 772 nd->path.dentry = path->dentry;
525} 773}
526 774
@@ -535,9 +783,11 @@ __do_follow_link(struct path *path, struct nameidata *nd, void **p)
535 783
536 if (path->mnt != nd->path.mnt) { 784 if (path->mnt != nd->path.mnt) {
537 path_to_nameidata(path, nd); 785 path_to_nameidata(path, nd);
786 nd->inode = nd->path.dentry->d_inode;
538 dget(dentry); 787 dget(dentry);
539 } 788 }
540 mntget(path->mnt); 789 mntget(path->mnt);
790
541 nd->last_type = LAST_BIND; 791 nd->last_type = LAST_BIND;
542 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 792 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
543 error = PTR_ERR(*p); 793 error = PTR_ERR(*p);
@@ -591,6 +841,20 @@ loop:
591 return err; 841 return err;
592} 842}
593 843
844static int follow_up_rcu(struct path *path)
845{
846 struct vfsmount *parent;
847 struct dentry *mountpoint;
848
849 parent = path->mnt->mnt_parent;
850 if (parent == path->mnt)
851 return 0;
852 mountpoint = path->mnt->mnt_mountpoint;
853 path->dentry = mountpoint;
854 path->mnt = parent;
855 return 1;
856}
857
594int follow_up(struct path *path) 858int follow_up(struct path *path)
595{ 859{
596 struct vfsmount *parent; 860 struct vfsmount *parent;
@@ -612,9 +876,24 @@ int follow_up(struct path *path)
612 return 1; 876 return 1;
613} 877}
614 878
615/* no need for dcache_lock, as serialization is taken care in 879/*
616 * namespace.c 880 * serialization is taken care of in namespace.c
617 */ 881 */
882static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
883 struct inode **inode)
884{
885 while (d_mountpoint(path->dentry)) {
886 struct vfsmount *mounted;
887 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
888 if (!mounted)
889 return;
890 path->mnt = mounted;
891 path->dentry = mounted->mnt_root;
892 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
893 *inode = path->dentry->d_inode;
894 }
895}
896
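The loop in __follow_mount_rcu() keeps hopping from a mountpoint dentry to the root of whatever is mounted on it until nothing further is mounted there. The shape of that descent is easy to model in isolation; below is a small sketch with a flat lookup table standing in for the mount hash (hypothetical types and names, not the kernel structures):

    #include <stddef.h>
    #include <stdio.h>

    struct node;

    /* Hypothetical "mount": something mounted on top of a node. */
    struct mount {
            struct node *mountpoint;        /* node it is mounted on  */
            struct node *root;              /* node it exposes on top */
    };

    struct node {
            const char *name;
            int mounted;                    /* like DCACHE_MOUNTED    */
    };

    /* Toy mount table standing in for the kernel's mount hash. */
    static struct mount *lookup_mount(struct mount *tbl, size_t n, struct node *at)
    {
            for (size_t i = 0; i < n; i++)
                    if (tbl[i].mountpoint == at)
                            return &tbl[i];
            return NULL;
    }

    /* Follow stacked mounts: keep descending while something is mounted here. */
    static struct node *follow_mounts(struct mount *tbl, size_t n, struct node *at)
    {
            while (at->mounted) {
                    struct mount *m = lookup_mount(tbl, n, at);
                    if (!m)
                            break;
                    at = m->root;
            }
            return at;
    }

    int main(void)
    {
            struct node a = { "mntpoint", 1 }, b = { "fs-root", 0 };
            struct mount tbl[] = { { &a, &b } };

            printf("%s\n", follow_mounts(tbl, 1, &a)->name);  /* prints fs-root */
            return 0;
    }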
618static int __follow_mount(struct path *path) 897static int __follow_mount(struct path *path)
619{ 898{
620 int res = 0; 899 int res = 0;
@@ -645,9 +924,6 @@ static void follow_mount(struct path *path)
645 } 924 }
646} 925}
647 926
648/* no need for dcache_lock, as serialization is taken care in
649 * namespace.c
650 */
651int follow_down(struct path *path) 927int follow_down(struct path *path)
652{ 928{
653 struct vfsmount *mounted; 929 struct vfsmount *mounted;
@@ -663,7 +939,42 @@ int follow_down(struct path *path)
663 return 0; 939 return 0;
664} 940}
665 941
666static __always_inline void follow_dotdot(struct nameidata *nd) 942static int follow_dotdot_rcu(struct nameidata *nd)
943{
944 struct inode *inode = nd->inode;
945
946 set_root_rcu(nd);
947
948 while (1) {
949 if (nd->path.dentry == nd->root.dentry &&
950 nd->path.mnt == nd->root.mnt) {
951 break;
952 }
953 if (nd->path.dentry != nd->path.mnt->mnt_root) {
954 struct dentry *old = nd->path.dentry;
955 struct dentry *parent = old->d_parent;
956 unsigned seq;
957
958 seq = read_seqcount_begin(&parent->d_seq);
959 if (read_seqcount_retry(&old->d_seq, nd->seq))
960 return -ECHILD;
961 inode = parent->d_inode;
962 nd->path.dentry = parent;
963 nd->seq = seq;
964 break;
965 }
966 if (!follow_up_rcu(&nd->path))
967 break;
968 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
969 inode = nd->path.dentry->d_inode;
970 }
971 __follow_mount_rcu(nd, &nd->path, &inode);
972 nd->inode = inode;
973
974 return 0;
975}
976
977static void follow_dotdot(struct nameidata *nd)
667{ 978{
668 set_root(nd); 979 set_root(nd);
669 980
@@ -684,6 +995,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
684 break; 995 break;
685 } 996 }
686 follow_mount(&nd->path); 997 follow_mount(&nd->path);
998 nd->inode = nd->path.dentry->d_inode;
687} 999}
688 1000
689/* 1001/*
@@ -721,17 +1033,17 @@ static struct dentry *d_alloc_and_lookup(struct dentry *parent,
721 * It _is_ time-critical. 1033 * It _is_ time-critical.
722 */ 1034 */
723static int do_lookup(struct nameidata *nd, struct qstr *name, 1035static int do_lookup(struct nameidata *nd, struct qstr *name,
724 struct path *path) 1036 struct path *path, struct inode **inode)
725{ 1037{
726 struct vfsmount *mnt = nd->path.mnt; 1038 struct vfsmount *mnt = nd->path.mnt;
727 struct dentry *dentry, *parent; 1039 struct dentry *dentry, *parent = nd->path.dentry;
728 struct inode *dir; 1040 struct inode *dir;
729 /* 1041 /*
730 * See if the low-level filesystem might want 1042 * See if the low-level filesystem might want
731 * to use its own hash.. 1043 * to use its own hash..
732 */ 1044 */
733 if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) { 1045 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
734 int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name); 1046 int err = parent->d_op->d_hash(parent, nd->inode, name);
735 if (err < 0) 1047 if (err < 0)
736 return err; 1048 return err;
737 } 1049 }
@@ -741,21 +1053,44 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
741 * of a false negative due to a concurrent rename, we're going to 1053 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below. 1054 * do the non-racy lookup, below.
743 */ 1055 */
744 dentry = __d_lookup(nd->path.dentry, name); 1056 if (nd->flags & LOOKUP_RCU) {
745 if (!dentry) 1057 unsigned seq;
746 goto need_lookup; 1058
1059 *inode = nd->inode;
1060 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1061 if (!dentry) {
1062 if (nameidata_drop_rcu(nd))
1063 return -ECHILD;
1064 goto need_lookup;
1065 }
1066 /* Memory barrier in read_seqcount_begin of child is enough */
1067 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1068 return -ECHILD;
1069
1070 nd->seq = seq;
1071 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
1072 goto need_revalidate;
1073 path->mnt = mnt;
1074 path->dentry = dentry;
1075 __follow_mount_rcu(nd, path, inode);
1076 } else {
1077 dentry = __d_lookup(parent, name);
1078 if (!dentry)
1079 goto need_lookup;
747found: 1080found:
748 if (dentry->d_op && dentry->d_op->d_revalidate) 1081 if (dentry->d_flags & DCACHE_OP_REVALIDATE)
749 goto need_revalidate; 1082 goto need_revalidate;
750done: 1083done:
751 path->mnt = mnt; 1084 path->mnt = mnt;
752 path->dentry = dentry; 1085 path->dentry = dentry;
753 __follow_mount(path); 1086 __follow_mount(path);
1087 *inode = path->dentry->d_inode;
1088 }
754 return 0; 1089 return 0;
755 1090
756need_lookup: 1091need_lookup:
757 parent = nd->path.dentry;
758 dir = parent->d_inode; 1092 dir = parent->d_inode;
1093 BUG_ON(nd->inode != dir);
759 1094
760 mutex_lock(&dir->i_mutex); 1095 mutex_lock(&dir->i_mutex);
761 /* 1096 /*
@@ -817,7 +1152,6 @@ static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
817static int link_path_walk(const char *name, struct nameidata *nd) 1152static int link_path_walk(const char *name, struct nameidata *nd)
818{ 1153{
819 struct path next; 1154 struct path next;
820 struct inode *inode;
821 int err; 1155 int err;
822 unsigned int lookup_flags = nd->flags; 1156 unsigned int lookup_flags = nd->flags;
823 1157
@@ -826,18 +1160,28 @@ static int link_path_walk(const char *name, struct nameidata *nd)
826 if (!*name) 1160 if (!*name)
827 goto return_reval; 1161 goto return_reval;
828 1162
829 inode = nd->path.dentry->d_inode;
830 if (nd->depth) 1163 if (nd->depth)
831 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); 1164 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
832 1165
833 /* At this point we know we have a real path component. */ 1166 /* At this point we know we have a real path component. */
834 for(;;) { 1167 for(;;) {
1168 struct inode *inode;
835 unsigned long hash; 1169 unsigned long hash;
836 struct qstr this; 1170 struct qstr this;
837 unsigned int c; 1171 unsigned int c;
838 1172
839 nd->flags |= LOOKUP_CONTINUE; 1173 nd->flags |= LOOKUP_CONTINUE;
840 err = exec_permission(inode); 1174 if (nd->flags & LOOKUP_RCU) {
1175 err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1176 if (err == -ECHILD) {
1177 if (nameidata_drop_rcu(nd))
1178 return -ECHILD;
1179 goto exec_again;
1180 }
1181 } else {
1182exec_again:
1183 err = exec_permission(nd->inode, 0);
1184 }
841 if (err) 1185 if (err)
842 break; 1186 break;
843 1187
@@ -868,37 +1212,44 @@ static int link_path_walk(const char *name, struct nameidata *nd)
868 if (this.name[0] == '.') switch (this.len) { 1212 if (this.name[0] == '.') switch (this.len) {
869 default: 1213 default:
870 break; 1214 break;
871 case 2: 1215 case 2:
872 if (this.name[1] != '.') 1216 if (this.name[1] != '.')
873 break; 1217 break;
874 follow_dotdot(nd); 1218 if (nd->flags & LOOKUP_RCU) {
875 inode = nd->path.dentry->d_inode; 1219 if (follow_dotdot_rcu(nd))
1220 return -ECHILD;
1221 } else
1222 follow_dotdot(nd);
876 /* fallthrough */ 1223 /* fallthrough */
877 case 1: 1224 case 1:
878 continue; 1225 continue;
879 } 1226 }
880 /* This does the actual lookups.. */ 1227 /* This does the actual lookups.. */
881 err = do_lookup(nd, &this, &next); 1228 err = do_lookup(nd, &this, &next, &inode);
882 if (err) 1229 if (err)
883 break; 1230 break;
884
885 err = -ENOENT; 1231 err = -ENOENT;
886 inode = next.dentry->d_inode;
887 if (!inode) 1232 if (!inode)
888 goto out_dput; 1233 goto out_dput;
889 1234
890 if (inode->i_op->follow_link) { 1235 if (inode->i_op->follow_link) {
1236 /* We commonly drop rcu-walk here */
1237 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1238 return -ECHILD;
1239 BUG_ON(inode != next.dentry->d_inode);
891 err = do_follow_link(&next, nd); 1240 err = do_follow_link(&next, nd);
892 if (err) 1241 if (err)
893 goto return_err; 1242 goto return_err;
1243 nd->inode = nd->path.dentry->d_inode;
894 err = -ENOENT; 1244 err = -ENOENT;
895 inode = nd->path.dentry->d_inode; 1245 if (!nd->inode)
896 if (!inode)
897 break; 1246 break;
898 } else 1247 } else {
899 path_to_nameidata(&next, nd); 1248 path_to_nameidata(&next, nd);
1249 nd->inode = inode;
1250 }
900 err = -ENOTDIR; 1251 err = -ENOTDIR;
901 if (!inode->i_op->lookup) 1252 if (!nd->inode->i_op->lookup)
902 break; 1253 break;
903 continue; 1254 continue;
904 /* here ends the main loop */ 1255 /* here ends the main loop */
@@ -913,32 +1264,39 @@ last_component:
913 if (this.name[0] == '.') switch (this.len) { 1264 if (this.name[0] == '.') switch (this.len) {
914 default: 1265 default:
915 break; 1266 break;
916 case 2: 1267 case 2:
917 if (this.name[1] != '.') 1268 if (this.name[1] != '.')
918 break; 1269 break;
919 follow_dotdot(nd); 1270 if (nd->flags & LOOKUP_RCU) {
920 inode = nd->path.dentry->d_inode; 1271 if (follow_dotdot_rcu(nd))
1272 return -ECHILD;
1273 } else
1274 follow_dotdot(nd);
921 /* fallthrough */ 1275 /* fallthrough */
922 case 1: 1276 case 1:
923 goto return_reval; 1277 goto return_reval;
924 } 1278 }
925 err = do_lookup(nd, &this, &next); 1279 err = do_lookup(nd, &this, &next, &inode);
926 if (err) 1280 if (err)
927 break; 1281 break;
928 inode = next.dentry->d_inode;
929 if (follow_on_final(inode, lookup_flags)) { 1282 if (follow_on_final(inode, lookup_flags)) {
1283 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1284 return -ECHILD;
1285 BUG_ON(inode != next.dentry->d_inode);
930 err = do_follow_link(&next, nd); 1286 err = do_follow_link(&next, nd);
931 if (err) 1287 if (err)
932 goto return_err; 1288 goto return_err;
933 inode = nd->path.dentry->d_inode; 1289 nd->inode = nd->path.dentry->d_inode;
934 } else 1290 } else {
935 path_to_nameidata(&next, nd); 1291 path_to_nameidata(&next, nd);
1292 nd->inode = inode;
1293 }
936 err = -ENOENT; 1294 err = -ENOENT;
937 if (!inode) 1295 if (!nd->inode)
938 break; 1296 break;
939 if (lookup_flags & LOOKUP_DIRECTORY) { 1297 if (lookup_flags & LOOKUP_DIRECTORY) {
940 err = -ENOTDIR; 1298 err = -ENOTDIR;
941 if (!inode->i_op->lookup) 1299 if (!nd->inode->i_op->lookup)
942 break; 1300 break;
943 } 1301 }
944 goto return_base; 1302 goto return_base;
@@ -958,25 +1316,43 @@ return_reval:
958 * We bypassed the ordinary revalidation routines. 1316 * We bypassed the ordinary revalidation routines.
959 * We may need to check the cached dentry for staleness. 1317 * We may need to check the cached dentry for staleness.
960 */ 1318 */
961 if (nd->path.dentry && nd->path.dentry->d_sb && 1319 if (need_reval_dot(nd->path.dentry)) {
962 (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
963 err = -ESTALE;
964 /* Note: we do not d_invalidate() */ 1320 /* Note: we do not d_invalidate() */
965 if (!nd->path.dentry->d_op->d_revalidate( 1321 err = d_revalidate(nd->path.dentry, nd);
966 nd->path.dentry, nd)) 1322 if (!err)
1323 err = -ESTALE;
1324 if (err < 0)
967 break; 1325 break;
968 } 1326 }
969return_base: 1327return_base:
1328 if (nameidata_drop_rcu_last_maybe(nd))
1329 return -ECHILD;
970 return 0; 1330 return 0;
971out_dput: 1331out_dput:
972 path_put_conditional(&next, nd); 1332 if (!(nd->flags & LOOKUP_RCU))
1333 path_put_conditional(&next, nd);
973 break; 1334 break;
974 } 1335 }
975 path_put(&nd->path); 1336 if (!(nd->flags & LOOKUP_RCU))
1337 path_put(&nd->path);
976return_err: 1338return_err:
977 return err; 1339 return err;
978} 1340}
979 1341
1342static inline int path_walk_rcu(const char *name, struct nameidata *nd)
1343{
1344 current->total_link_count = 0;
1345
1346 return link_path_walk(name, nd);
1347}
1348
1349static inline int path_walk_simple(const char *name, struct nameidata *nd)
1350{
1351 current->total_link_count = 0;
1352
1353 return link_path_walk(name, nd);
1354}
1355
980static int path_walk(const char *name, struct nameidata *nd) 1356static int path_walk(const char *name, struct nameidata *nd)
981{ 1357{
982 struct path save = nd->path; 1358 struct path save = nd->path;
@@ -1002,6 +1378,93 @@ static int path_walk(const char *name, struct nameidata *nd)
1002 return result; 1378 return result;
1003} 1379}
1004 1380
1381static void path_finish_rcu(struct nameidata *nd)
1382{
1383 if (nd->flags & LOOKUP_RCU) {
1384 /* RCU dangling. Cancel it. */
1385 nd->flags &= ~LOOKUP_RCU;
1386 nd->root.mnt = NULL;
1387 rcu_read_unlock();
1388 br_read_unlock(vfsmount_lock);
1389 }
1390 if (nd->file)
1391 fput(nd->file);
1392}
1393
1394static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1395{
1396 int retval = 0;
1397 int fput_needed;
1398 struct file *file;
1399
1400 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1401 nd->flags = flags | LOOKUP_RCU;
1402 nd->depth = 0;
1403 nd->root.mnt = NULL;
1404 nd->file = NULL;
1405
1406 if (*name=='/') {
1407 struct fs_struct *fs = current->fs;
1408 unsigned seq;
1409
1410 br_read_lock(vfsmount_lock);
1411 rcu_read_lock();
1412
1413 do {
1414 seq = read_seqcount_begin(&fs->seq);
1415 nd->root = fs->root;
1416 nd->path = nd->root;
1417 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1418 } while (read_seqcount_retry(&fs->seq, seq));
1419
1420 } else if (dfd == AT_FDCWD) {
1421 struct fs_struct *fs = current->fs;
1422 unsigned seq;
1423
1424 br_read_lock(vfsmount_lock);
1425 rcu_read_lock();
1426
1427 do {
1428 seq = read_seqcount_begin(&fs->seq);
1429 nd->path = fs->pwd;
1430 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1431 } while (read_seqcount_retry(&fs->seq, seq));
1432
1433 } else {
1434 struct dentry *dentry;
1435
1436 file = fget_light(dfd, &fput_needed);
1437 retval = -EBADF;
1438 if (!file)
1439 goto out_fail;
1440
1441 dentry = file->f_path.dentry;
1442
1443 retval = -ENOTDIR;
1444 if (!S_ISDIR(dentry->d_inode->i_mode))
1445 goto fput_fail;
1446
1447 retval = file_permission(file, MAY_EXEC);
1448 if (retval)
1449 goto fput_fail;
1450
1451 nd->path = file->f_path;
1452 if (fput_needed)
1453 nd->file = file;
1454
1455 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1456 br_read_lock(vfsmount_lock);
1457 rcu_read_lock();
1458 }
1459 nd->inode = nd->path.dentry->d_inode;
1460 return 0;
1461
1462fput_fail:
1463 fput_light(file, fput_needed);
1464out_fail:
1465 return retval;
1466}
1467
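path_init_rcu() snapshots fs->root or fs->pwd by sampling fs->seq, copying the value, and retrying if the sequence count moved. The retry shape can be sketched with C11 atomics; this is only a user-space model of the loop, not the kernel's seqcount API (which also encodes "write in progress" in the low bit and relies on its own barriers):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Hypothetical versioned cell; 'value' stands in for fs->root / fs->pwd. */
    struct snap {
            atomic_uint version;
            atomic_int  value;
    };

    /* Retry until the version did not change across the copy. */
    static int read_snapshot(struct snap *s)
    {
            unsigned int v;
            int val;

            do {
                    v = atomic_load_explicit(&s->version, memory_order_acquire);
                    val = atomic_load_explicit(&s->value, memory_order_relaxed);
                    atomic_thread_fence(memory_order_acquire);
            } while (atomic_load_explicit(&s->version, memory_order_relaxed) != v);

            return val;
    }

    int main(void)
    {
            struct snap s;

            atomic_init(&s.version, 0);
            atomic_init(&s.value, 42);
            printf("%d\n", read_snapshot(&s));
            return 0;
    }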
1005static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1468static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1006{ 1469{
1007 int retval = 0; 1470 int retval = 0;
@@ -1042,6 +1505,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1042 1505
1043 fput_light(file, fput_needed); 1506 fput_light(file, fput_needed);
1044 } 1507 }
1508 nd->inode = nd->path.dentry->d_inode;
1045 return 0; 1509 return 0;
1046 1510
1047fput_fail: 1511fput_fail:
@@ -1054,16 +1518,53 @@ out_fail:
1054static int do_path_lookup(int dfd, const char *name, 1518static int do_path_lookup(int dfd, const char *name,
1055 unsigned int flags, struct nameidata *nd) 1519 unsigned int flags, struct nameidata *nd)
1056{ 1520{
1057 int retval = path_init(dfd, name, flags, nd); 1521 int retval;
1058 if (!retval) 1522
1059 retval = path_walk(name, nd); 1523 /*
1060 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1524 * Path walking is largely split up into 2 different synchronisation
1061 nd->path.dentry->d_inode)) 1525 * schemes, rcu-walk and ref-walk (explained in
1062 audit_inode(name, nd->path.dentry); 1526 * Documentation/filesystems/path-lookup.txt). These share much of the
1527 * path walk code, but some things, particularly setup, cleanup, and
1528 * following mounts, are sufficiently divergent that functions are
1529 * duplicated. Typically there is a function foo(), and its RCU
1530 * analogue, foo_rcu().
1531 *
1532 * -ECHILD is the error number of choice (just to avoid clashes) that
1533 * is returned if some aspect of an rcu-walk fails. Such an error must
1534 * be handled by restarting a traditional ref-walk (which will always
1535 * be able to complete).
1536 */
1537 retval = path_init_rcu(dfd, name, flags, nd);
1538 if (unlikely(retval))
1539 return retval;
1540 retval = path_walk_rcu(name, nd);
1541 path_finish_rcu(nd);
1063 if (nd->root.mnt) { 1542 if (nd->root.mnt) {
1064 path_put(&nd->root); 1543 path_put(&nd->root);
1065 nd->root.mnt = NULL; 1544 nd->root.mnt = NULL;
1066 } 1545 }
1546
1547 if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
1548 /* slower, locked walk */
1549 if (retval == -ESTALE)
1550 flags |= LOOKUP_REVAL;
1551 retval = path_init(dfd, name, flags, nd);
1552 if (unlikely(retval))
1553 return retval;
1554 retval = path_walk(name, nd);
1555 if (nd->root.mnt) {
1556 path_put(&nd->root);
1557 nd->root.mnt = NULL;
1558 }
1559 }
1560
1561 if (likely(!retval)) {
1562 if (unlikely(!audit_dummy_context())) {
1563 if (nd->path.dentry && nd->inode)
1564 audit_inode(name, nd->path.dentry);
1565 }
1566 }
1567
1067 return retval; 1568 return retval;
1068} 1569}
1069 1570
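The comment in do_path_lookup() above describes the overall scheme: attempt the lockless rcu-walk first, and if it bails out with -ECHILD (or the path turned out stale with -ESTALE), restart the whole lookup with the reference-counted walk, which can always complete. A stripped-down sketch of that control flow follows, with hypothetical helper names in plain C rather than the kernel functions:

    #include <errno.h>
    #include <stdio.h>

    /* Fast, lockless attempt: may fail spuriously and ask for a restart. */
    static int walk_fast(const char *name)
    {
            (void)name;
            return -ECHILD;         /* pretend something changed under us */
    }

    /* Slow, reference-counted attempt: always able to complete. */
    static int walk_slow(const char *name, int reval)
    {
            printf("slow walk of %s%s\n", name, reval ? " (reval)" : "");
            return 0;
    }

    static int lookup(const char *name)
    {
            int err = walk_fast(name);

            if (err == -ECHILD || err == -ESTALE)
                    err = walk_slow(name, err == -ESTALE);
            return err;             /* -ECHILD itself never reaches callers */
    }

    int main(void)
    {
            return lookup("/tmp/example") ? 1 : 0;
    }

Keeping -ECHILD internal is the key design choice: the fast path is allowed to give up cheaply at any point because the slow path is always there to finish the job.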
@@ -1106,10 +1607,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1106 path_get(&nd->path); 1607 path_get(&nd->path);
1107 nd->root = nd->path; 1608 nd->root = nd->path;
1108 path_get(&nd->root); 1609 path_get(&nd->root);
1610 nd->inode = nd->path.dentry->d_inode;
1109 1611
1110 retval = path_walk(name, nd); 1612 retval = path_walk(name, nd);
1111 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && 1613 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1112 nd->path.dentry->d_inode)) 1614 nd->inode))
1113 audit_inode(name, nd->path.dentry); 1615 audit_inode(name, nd->path.dentry);
1114 1616
1115 path_put(&nd->root); 1617 path_put(&nd->root);
@@ -1125,7 +1627,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1125 struct dentry *dentry; 1627 struct dentry *dentry;
1126 int err; 1628 int err;
1127 1629
1128 err = exec_permission(inode); 1630 err = exec_permission(inode, 0);
1129 if (err) 1631 if (err)
1130 return ERR_PTR(err); 1632 return ERR_PTR(err);
1131 1633
@@ -1133,8 +1635,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
1133 * See if the low-level filesystem might want 1635 * See if the low-level filesystem might want
1134 * to use its own hash.. 1636 * to use its own hash..
1135 */ 1637 */
1136 if (base->d_op && base->d_op->d_hash) { 1638 if (base->d_flags & DCACHE_OP_HASH) {
1137 err = base->d_op->d_hash(base, name); 1639 err = base->d_op->d_hash(base, inode, name);
1138 dentry = ERR_PTR(err); 1640 dentry = ERR_PTR(err);
1139 if (err < 0) 1641 if (err < 0)
1140 goto out; 1642 goto out;
@@ -1147,7 +1649,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1147 */ 1649 */
1148 dentry = d_lookup(base, name); 1650 dentry = d_lookup(base, name);
1149 1651
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1652 if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE))
1151 dentry = do_revalidate(dentry, nd); 1653 dentry = do_revalidate(dentry, nd);
1152 1654
1153 if (!dentry) 1655 if (!dentry)
@@ -1490,6 +1992,7 @@ out_unlock:
1490 mutex_unlock(&dir->d_inode->i_mutex); 1992 mutex_unlock(&dir->d_inode->i_mutex);
1491 dput(nd->path.dentry); 1993 dput(nd->path.dentry);
1492 nd->path.dentry = path->dentry; 1994 nd->path.dentry = path->dentry;
1995
1493 if (error) 1996 if (error)
1494 return error; 1997 return error;
1495 /* Don't check for write permission, don't truncate */ 1998 /* Don't check for write permission, don't truncate */
@@ -1584,6 +2087,9 @@ exit:
1584 return ERR_PTR(error); 2087 return ERR_PTR(error);
1585} 2088}
1586 2089
2090/*
2091 * Handle O_CREAT case for do_filp_open
2092 */
1587static struct file *do_last(struct nameidata *nd, struct path *path, 2093static struct file *do_last(struct nameidata *nd, struct path *path,
1588 int open_flag, int acc_mode, 2094 int open_flag, int acc_mode,
1589 int mode, const char *pathname) 2095 int mode, const char *pathname)
@@ -1597,50 +2103,25 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1597 follow_dotdot(nd); 2103 follow_dotdot(nd);
1598 dir = nd->path.dentry; 2104 dir = nd->path.dentry;
1599 case LAST_DOT: 2105 case LAST_DOT:
1600 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 2106 if (need_reval_dot(dir)) {
1601 if (!dir->d_op->d_revalidate(dir, nd)) { 2107 error = d_revalidate(nd->path.dentry, nd);
2108 if (!error)
1602 error = -ESTALE; 2109 error = -ESTALE;
2110 if (error < 0)
1603 goto exit; 2111 goto exit;
1604 }
1605 } 2112 }
1606 /* fallthrough */ 2113 /* fallthrough */
1607 case LAST_ROOT: 2114 case LAST_ROOT:
1608 if (open_flag & O_CREAT) 2115 goto exit;
1609 goto exit;
1610 /* fallthrough */
1611 case LAST_BIND: 2116 case LAST_BIND:
1612 audit_inode(pathname, dir); 2117 audit_inode(pathname, dir);
1613 goto ok; 2118 goto ok;
1614 } 2119 }
1615 2120
1616 /* trailing slashes? */ 2121 /* trailing slashes? */
1617 if (nd->last.name[nd->last.len]) { 2122 if (nd->last.name[nd->last.len])
1618 if (open_flag & O_CREAT) 2123 goto exit;
1619 goto exit;
1620 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1621 }
1622
1623 /* just plain open? */
1624 if (!(open_flag & O_CREAT)) {
1625 error = do_lookup(nd, &nd->last, path);
1626 if (error)
1627 goto exit;
1628 error = -ENOENT;
1629 if (!path->dentry->d_inode)
1630 goto exit_dput;
1631 if (path->dentry->d_inode->i_op->follow_link)
1632 return NULL;
1633 error = -ENOTDIR;
1634 if (nd->flags & LOOKUP_DIRECTORY) {
1635 if (!path->dentry->d_inode->i_op->lookup)
1636 goto exit_dput;
1637 }
1638 path_to_nameidata(path, nd);
1639 audit_inode(pathname, nd->path.dentry);
1640 goto ok;
1641 }
1642 2124
1643 /* OK, it's O_CREAT */
1644 mutex_lock(&dir->d_inode->i_mutex); 2125 mutex_lock(&dir->d_inode->i_mutex);
1645 2126
1646 path->dentry = lookup_hash(nd); 2127 path->dentry = lookup_hash(nd);
@@ -1711,8 +2192,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1711 return NULL; 2192 return NULL;
1712 2193
1713 path_to_nameidata(path, nd); 2194 path_to_nameidata(path, nd);
2195 nd->inode = path->dentry->d_inode;
1714 error = -EISDIR; 2196 error = -EISDIR;
1715 if (S_ISDIR(path->dentry->d_inode->i_mode)) 2197 if (S_ISDIR(nd->inode->i_mode))
1716 goto exit; 2198 goto exit;
1717ok: 2199ok:
1718 filp = finish_open(nd, open_flag, acc_mode); 2200 filp = finish_open(nd, open_flag, acc_mode);
@@ -1743,11 +2225,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
1743 struct path path; 2225 struct path path;
1744 int count = 0; 2226 int count = 0;
1745 int flag = open_to_namei_flags(open_flag); 2227 int flag = open_to_namei_flags(open_flag);
1746 int force_reval = 0; 2228 int flags;
1747 2229
1748 if (!(open_flag & O_CREAT)) 2230 if (!(open_flag & O_CREAT))
1749 mode = 0; 2231 mode = 0;
1750 2232
2233 /* Must never be set by userspace */
2234 open_flag &= ~FMODE_NONOTIFY;
2235
1751 /* 2236 /*
1752 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 2237 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1753 * check for O_DSYNC if the need any syncing at all we enforce it's 2238 * check for O_DSYNC if the need any syncing at all we enforce it's
@@ -1769,54 +2254,84 @@ struct file *do_filp_open(int dfd, const char *pathname,
1769 if (open_flag & O_APPEND) 2254 if (open_flag & O_APPEND)
1770 acc_mode |= MAY_APPEND; 2255 acc_mode |= MAY_APPEND;
1771 2256
1772 /* find the parent */ 2257 flags = LOOKUP_OPEN;
1773reval: 2258 if (open_flag & O_CREAT) {
1774 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 2259 flags |= LOOKUP_CREATE;
2260 if (open_flag & O_EXCL)
2261 flags |= LOOKUP_EXCL;
2262 }
2263 if (open_flag & O_DIRECTORY)
2264 flags |= LOOKUP_DIRECTORY;
2265 if (!(open_flag & O_NOFOLLOW))
2266 flags |= LOOKUP_FOLLOW;
2267
2268 filp = get_empty_filp();
2269 if (!filp)
2270 return ERR_PTR(-ENFILE);
2271
2272 filp->f_flags = open_flag;
2273 nd.intent.open.file = filp;
2274 nd.intent.open.flags = flag;
2275 nd.intent.open.create_mode = mode;
2276
2277 if (open_flag & O_CREAT)
2278 goto creat;
2279
2280 /* !O_CREAT, simple open */
2281 error = do_path_lookup(dfd, pathname, flags, &nd);
2282 if (unlikely(error))
2283 goto out_filp;
2284 error = -ELOOP;
2285 if (!(nd.flags & LOOKUP_FOLLOW)) {
2286 if (nd.inode->i_op->follow_link)
2287 goto out_path;
2288 }
2289 error = -ENOTDIR;
2290 if (nd.flags & LOOKUP_DIRECTORY) {
2291 if (!nd.inode->i_op->lookup)
2292 goto out_path;
2293 }
2294 audit_inode(pathname, nd.path.dentry);
2295 filp = finish_open(&nd, open_flag, acc_mode);
2296 return filp;
2297
2298creat:
2299 /* OK, have to create the file. Find the parent. */
2300 error = path_init_rcu(dfd, pathname,
2301 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
1775 if (error) 2302 if (error)
1776 return ERR_PTR(error); 2303 goto out_filp;
1777 if (force_reval) 2304 error = path_walk_rcu(pathname, &nd);
1778 nd.flags |= LOOKUP_REVAL; 2305 path_finish_rcu(&nd);
2306 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2307 /* slower, locked walk */
2308 if (error == -ESTALE) {
2309reval:
2310 flags |= LOOKUP_REVAL;
2311 }
2312 error = path_init(dfd, pathname,
2313 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2314 if (error)
2315 goto out_filp;
1779 2316
1780 current->total_link_count = 0; 2317 error = path_walk_simple(pathname, &nd);
1781 error = link_path_walk(pathname, &nd);
1782 if (error) {
1783 filp = ERR_PTR(error);
1784 goto out;
1785 } 2318 }
1786 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) 2319 if (unlikely(error))
2320 goto out_filp;
2321 if (unlikely(!audit_dummy_context()))
1787 audit_inode(pathname, nd.path.dentry); 2322 audit_inode(pathname, nd.path.dentry);
1788 2323
1789 /* 2324 /*
1790 * We have the parent and last component. 2325 * We have the parent and last component.
1791 */ 2326 */
1792 2327 nd.flags = flags;
1793 error = -ENFILE;
1794 filp = get_empty_filp();
1795 if (filp == NULL)
1796 goto exit_parent;
1797 nd.intent.open.file = filp;
1798 filp->f_flags = open_flag;
1799 nd.intent.open.flags = flag;
1800 nd.intent.open.create_mode = mode;
1801 nd.flags &= ~LOOKUP_PARENT;
1802 nd.flags |= LOOKUP_OPEN;
1803 if (open_flag & O_CREAT) {
1804 nd.flags |= LOOKUP_CREATE;
1805 if (open_flag & O_EXCL)
1806 nd.flags |= LOOKUP_EXCL;
1807 }
1808 if (open_flag & O_DIRECTORY)
1809 nd.flags |= LOOKUP_DIRECTORY;
1810 if (!(open_flag & O_NOFOLLOW))
1811 nd.flags |= LOOKUP_FOLLOW;
1812 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2328 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1813 while (unlikely(!filp)) { /* trailing symlink */ 2329 while (unlikely(!filp)) { /* trailing symlink */
1814 struct path holder; 2330 struct path holder;
1815 struct inode *inode = path.dentry->d_inode;
1816 void *cookie; 2331 void *cookie;
1817 error = -ELOOP; 2332 error = -ELOOP;
1818 /* S_ISDIR part is a temporary automount kludge */ 2333 /* S_ISDIR part is a temporary automount kludge */
1819 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) 2334 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
1820 goto exit_dput; 2335 goto exit_dput;
1821 if (count++ == 32) 2336 if (count++ == 32)
1822 goto exit_dput; 2337 goto exit_dput;
@@ -1837,36 +2352,33 @@ reval:
1837 goto exit_dput; 2352 goto exit_dput;
1838 error = __do_follow_link(&path, &nd, &cookie); 2353 error = __do_follow_link(&path, &nd, &cookie);
1839 if (unlikely(error)) { 2354 if (unlikely(error)) {
2355 if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
2356 nd.inode->i_op->put_link(path.dentry, &nd, cookie);
1840 /* nd.path had been dropped */ 2357 /* nd.path had been dropped */
1841 if (!IS_ERR(cookie) && inode->i_op->put_link) 2358 nd.path = path;
1842 inode->i_op->put_link(path.dentry, &nd, cookie); 2359 goto out_path;
1843 path_put(&path);
1844 release_open_intent(&nd);
1845 filp = ERR_PTR(error);
1846 goto out;
1847 } 2360 }
1848 holder = path; 2361 holder = path;
1849 nd.flags &= ~LOOKUP_PARENT; 2362 nd.flags &= ~LOOKUP_PARENT;
1850 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2363 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1851 if (inode->i_op->put_link) 2364 if (nd.inode->i_op->put_link)
1852 inode->i_op->put_link(holder.dentry, &nd, cookie); 2365 nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
1853 path_put(&holder); 2366 path_put(&holder);
1854 } 2367 }
1855out: 2368out:
1856 if (nd.root.mnt) 2369 if (nd.root.mnt)
1857 path_put(&nd.root); 2370 path_put(&nd.root);
1858 if (filp == ERR_PTR(-ESTALE) && !force_reval) { 2371 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
1859 force_reval = 1;
1860 goto reval; 2372 goto reval;
1861 }
1862 return filp; 2373 return filp;
1863 2374
1864exit_dput: 2375exit_dput:
1865 path_put_conditional(&path, &nd); 2376 path_put_conditional(&path, &nd);
2377out_path:
2378 path_put(&nd.path);
2379out_filp:
1866 if (!IS_ERR(nd.intent.open.file)) 2380 if (!IS_ERR(nd.intent.open.file))
1867 release_open_intent(&nd); 2381 release_open_intent(&nd);
1868exit_parent:
1869 path_put(&nd.path);
1870 filp = ERR_PTR(error); 2382 filp = ERR_PTR(error);
1871 goto out; 2383 goto out;
1872} 2384}
@@ -2127,12 +2639,10 @@ void dentry_unhash(struct dentry *dentry)
2127{ 2639{
2128 dget(dentry); 2640 dget(dentry);
2129 shrink_dcache_parent(dentry); 2641 shrink_dcache_parent(dentry);
2130 spin_lock(&dcache_lock);
2131 spin_lock(&dentry->d_lock); 2642 spin_lock(&dentry->d_lock);
2132 if (atomic_read(&dentry->d_count) == 2) 2643 if (dentry->d_count == 2)
2133 __d_drop(dentry); 2644 __d_drop(dentry);
2134 spin_unlock(&dentry->d_lock); 2645 spin_unlock(&dentry->d_lock);
2135 spin_unlock(&dcache_lock);
2136} 2646}
2137 2647
2138int vfs_rmdir(struct inode *dir, struct dentry *dentry) 2648int vfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/namespace.c b/fs/namespace.c
index 8a415c9c5e55..3ddfd9046c44 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,7 +13,6 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/smp_lock.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
19#include <linux/acct.h> 18#include <linux/acct.h>
@@ -139,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
139 mnt->mnt_group_id = 0; 138 mnt->mnt_group_id = 0;
140} 139}
141 140
141/*
142 * vfsmount lock must be held for read
143 */
144static inline void mnt_add_count(struct vfsmount *mnt, int n)
145{
146#ifdef CONFIG_SMP
147 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
148#else
149 preempt_disable();
150 mnt->mnt_count += n;
151 preempt_enable();
152#endif
153}
154
155static inline void mnt_set_count(struct vfsmount *mnt, int n)
156{
157#ifdef CONFIG_SMP
158 this_cpu_write(mnt->mnt_pcp->mnt_count, n);
159#else
160 mnt->mnt_count = n;
161#endif
162}
163
164/*
165 * vfsmount lock must be held for read
166 */
167static inline void mnt_inc_count(struct vfsmount *mnt)
168{
169 mnt_add_count(mnt, 1);
170}
171
172/*
173 * vfsmount lock must be held for read
174 */
175static inline void mnt_dec_count(struct vfsmount *mnt)
176{
177 mnt_add_count(mnt, -1);
178}
179
180/*
181 * vfsmount lock must be held for write
182 */
183unsigned int mnt_get_count(struct vfsmount *mnt)
184{
185#ifdef CONFIG_SMP
186 unsigned int count = atomic_read(&mnt->mnt_longrefs);
187 int cpu;
188
189 for_each_possible_cpu(cpu) {
190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
191 }
192
193 return count;
194#else
195 return mnt->mnt_count;
196#endif
197}
198
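mnt_add_count() above updates a per-CPU slot with only the vfsmount lock held for read, while mnt_get_count() needs the write side so it can sum the slots against a stable set of updaters. A rough user-space approximation uses per-thread slots and a read-write lock; this is only a model (not the kernel's brlock or percpu API), and it assumes each slot is only ever updated by its owning thread:

    #include <pthread.h>
    #include <stdio.h>

    #define NSLOTS 4        /* stands in for the number of possible CPUs */

    static pthread_rwlock_t vfsmount_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int mnt_count[NSLOTS];

    /* Fast path: bump this "CPU"'s private slot under the read lock. */
    static void mnt_add_count(int slot, int n)
    {
            pthread_rwlock_rdlock(&vfsmount_lock);
            mnt_count[slot] += n;
            pthread_rwlock_unlock(&vfsmount_lock);
    }

    /* Slow path: the write lock excludes updaters, so the sum is stable. */
    static int mnt_get_count(void)
    {
            int sum = 0;

            pthread_rwlock_wrlock(&vfsmount_lock);
            for (int i = 0; i < NSLOTS; i++)
                    sum += mnt_count[i];
            pthread_rwlock_unlock(&vfsmount_lock);
            return sum;
    }

    int main(void)
    {
            mnt_add_count(0, 1);
            mnt_add_count(3, 1);
            printf("count=%d\n", mnt_get_count());  /* prints count=2 */
            return 0;
    }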
142struct vfsmount *alloc_vfsmnt(const char *name) 199struct vfsmount *alloc_vfsmnt(const char *name)
143{ 200{
144 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -155,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
155 goto out_free_id; 212 goto out_free_id;
156 } 213 }
157 214
158 atomic_set(&mnt->mnt_count, 1); 215#ifdef CONFIG_SMP
216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
217 if (!mnt->mnt_pcp)
218 goto out_free_devname;
219
220 atomic_set(&mnt->mnt_longrefs, 1);
221#else
222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0;
224#endif
225
159 INIT_LIST_HEAD(&mnt->mnt_hash); 226 INIT_LIST_HEAD(&mnt->mnt_hash);
160 INIT_LIST_HEAD(&mnt->mnt_child); 227 INIT_LIST_HEAD(&mnt->mnt_child);
161 INIT_LIST_HEAD(&mnt->mnt_mounts); 228 INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -167,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
167#ifdef CONFIG_FSNOTIFY 234#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif 236#endif
170#ifdef CONFIG_SMP
171 mnt->mnt_writers = alloc_percpu(int);
172 if (!mnt->mnt_writers)
173 goto out_free_devname;
174#else
175 mnt->mnt_writers = 0;
176#endif
177 } 237 }
178 return mnt; 238 return mnt;
179 239
@@ -217,32 +277,32 @@ int __mnt_is_readonly(struct vfsmount *mnt)
217} 277}
218EXPORT_SYMBOL_GPL(__mnt_is_readonly); 278EXPORT_SYMBOL_GPL(__mnt_is_readonly);
219 279
220static inline void inc_mnt_writers(struct vfsmount *mnt) 280static inline void mnt_inc_writers(struct vfsmount *mnt)
221{ 281{
222#ifdef CONFIG_SMP 282#ifdef CONFIG_SMP
223 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; 283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
224#else 284#else
225 mnt->mnt_writers++; 285 mnt->mnt_writers++;
226#endif 286#endif
227} 287}
228 288
229static inline void dec_mnt_writers(struct vfsmount *mnt) 289static inline void mnt_dec_writers(struct vfsmount *mnt)
230{ 290{
231#ifdef CONFIG_SMP 291#ifdef CONFIG_SMP
232 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; 292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
233#else 293#else
234 mnt->mnt_writers--; 294 mnt->mnt_writers--;
235#endif 295#endif
236} 296}
237 297
238static unsigned int count_mnt_writers(struct vfsmount *mnt) 298static unsigned int mnt_get_writers(struct vfsmount *mnt)
239{ 299{
240#ifdef CONFIG_SMP 300#ifdef CONFIG_SMP
241 unsigned int count = 0; 301 unsigned int count = 0;
242 int cpu; 302 int cpu;
243 303
244 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
245 count += *per_cpu_ptr(mnt->mnt_writers, cpu); 305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
246 } 306 }
247 307
248 return count; 308 return count;
@@ -274,9 +334,9 @@ int mnt_want_write(struct vfsmount *mnt)
274 int ret = 0; 334 int ret = 0;
275 335
276 preempt_disable(); 336 preempt_disable();
277 inc_mnt_writers(mnt); 337 mnt_inc_writers(mnt);
278 /* 338 /*
279 * The store to inc_mnt_writers must be visible before we pass 339 * The store to mnt_inc_writers must be visible before we pass
280 * MNT_WRITE_HOLD loop below, so that the slowpath can see our 340 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
281 * incremented count after it has set MNT_WRITE_HOLD. 341 * incremented count after it has set MNT_WRITE_HOLD.
282 */ 342 */
@@ -290,7 +350,7 @@ int mnt_want_write(struct vfsmount *mnt)
290 */ 350 */
291 smp_rmb(); 351 smp_rmb();
292 if (__mnt_is_readonly(mnt)) { 352 if (__mnt_is_readonly(mnt)) {
293 dec_mnt_writers(mnt); 353 mnt_dec_writers(mnt);
294 ret = -EROFS; 354 ret = -EROFS;
295 goto out; 355 goto out;
296 } 356 }
@@ -318,7 +378,7 @@ int mnt_clone_write(struct vfsmount *mnt)
318 if (__mnt_is_readonly(mnt)) 378 if (__mnt_is_readonly(mnt))
319 return -EROFS; 379 return -EROFS;
320 preempt_disable(); 380 preempt_disable();
321 inc_mnt_writers(mnt); 381 mnt_inc_writers(mnt);
322 preempt_enable(); 382 preempt_enable();
323 return 0; 383 return 0;
324} 384}
@@ -352,7 +412,7 @@ EXPORT_SYMBOL_GPL(mnt_want_write_file);
352void mnt_drop_write(struct vfsmount *mnt) 412void mnt_drop_write(struct vfsmount *mnt)
353{ 413{
354 preempt_disable(); 414 preempt_disable();
355 dec_mnt_writers(mnt); 415 mnt_dec_writers(mnt);
356 preempt_enable(); 416 preempt_enable();
357} 417}
358EXPORT_SYMBOL_GPL(mnt_drop_write); 418EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -385,7 +445,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
385 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while 445 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
386 * we're counting up here. 446 * we're counting up here.
387 */ 447 */
388 if (count_mnt_writers(mnt) > 0) 448 if (mnt_get_writers(mnt) > 0)
389 ret = -EBUSY; 449 ret = -EBUSY;
390 else 450 else
391 mnt->mnt_flags |= MNT_READONLY; 451 mnt->mnt_flags |= MNT_READONLY;
@@ -419,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
419 kfree(mnt->mnt_devname); 479 kfree(mnt->mnt_devname);
420 mnt_free_id(mnt); 480 mnt_free_id(mnt);
421#ifdef CONFIG_SMP 481#ifdef CONFIG_SMP
422 free_percpu(mnt->mnt_writers); 482 free_percpu(mnt->mnt_pcp);
423#endif 483#endif
424 kmem_cache_free(mnt_cache, mnt); 484 kmem_cache_free(mnt_cache, mnt);
425} 485}
@@ -493,6 +553,27 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
493} 553}
494 554
495/* 555/*
556 * Clear dentry's mounted state if it has no remaining mounts.
557 * vfsmount_lock must be held for write.
558 */
559static void dentry_reset_mounted(struct vfsmount *mnt, struct dentry *dentry)
560{
561 unsigned u;
562
563 for (u = 0; u < HASH_SIZE; u++) {
564 struct vfsmount *p;
565
566 list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
567 if (p->mnt_mountpoint == dentry)
568 return;
569 }
570 }
571 spin_lock(&dentry->d_lock);
572 dentry->d_flags &= ~DCACHE_MOUNTED;
573 spin_unlock(&dentry->d_lock);
574}
575
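dentry_reset_mounted() only clears DCACHE_MOUNTED once a scan of the whole mount hash finds no other mount still sitting on that dentry. The same "clear the flag only when the last user is gone" scan, reduced to a flat table with hypothetical types (not the kernel structures), looks like this:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct dentry { bool mounted; };

    struct mount { struct dentry *mountpoint; };

    /* Clear the flag only if no remaining mount uses @d as its mountpoint. */
    static void dentry_reset_mounted(struct mount *tbl, size_t n, struct dentry *d)
    {
            for (size_t i = 0; i < n; i++)
                    if (tbl[i].mountpoint == d)
                            return;         /* still mounted on, keep the flag */
            d->mounted = false;
    }

    int main(void)
    {
            struct dentry d = { true };
            struct mount tbl[] = { { NULL } };      /* nothing references d */

            dentry_reset_mounted(tbl, 1, &d);
            printf("mounted=%d\n", d.mounted);      /* prints mounted=0 */
            return 0;
    }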
576/*
496 * vfsmount lock must be held for write 577 * vfsmount lock must be held for write
497 */ 578 */
498static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 579static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
@@ -503,7 +584,7 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
503 mnt->mnt_mountpoint = mnt->mnt_root; 584 mnt->mnt_mountpoint = mnt->mnt_root;
504 list_del_init(&mnt->mnt_child); 585 list_del_init(&mnt->mnt_child);
505 list_del_init(&mnt->mnt_hash); 586 list_del_init(&mnt->mnt_hash);
506 old_path->dentry->d_mounted--; 587 dentry_reset_mounted(old_path->mnt, old_path->dentry);
507} 588}
508 589
509/* 590/*
@@ -514,7 +595,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
514{ 595{
515 child_mnt->mnt_parent = mntget(mnt); 596 child_mnt->mnt_parent = mntget(mnt);
516 child_mnt->mnt_mountpoint = dget(dentry); 597 child_mnt->mnt_mountpoint = dget(dentry);
517 dentry->d_mounted++; 598 spin_lock(&dentry->d_lock);
599 dentry->d_flags |= DCACHE_MOUNTED;
600 spin_unlock(&dentry->d_lock);
518} 601}
519 602
520/* 603/*
@@ -630,9 +713,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
630 return NULL; 713 return NULL;
631} 714}
632 715
633static inline void __mntput(struct vfsmount *mnt) 716static inline void mntfree(struct vfsmount *mnt)
634{ 717{
635 struct super_block *sb = mnt->mnt_sb; 718 struct super_block *sb = mnt->mnt_sb;
719
636 /* 720 /*
637 * This probably indicates that somebody messed 721 * This probably indicates that somebody messed
638 * up a mnt_want/drop_write() pair. If this 722 * up a mnt_want/drop_write() pair. If this
@@ -640,38 +724,123 @@ static inline void __mntput(struct vfsmount *mnt)
640 * to make r/w->r/o transitions. 724 * to make r/w->r/o transitions.
641 */ 725 */
642 /* 726 /*
643 * atomic_dec_and_lock() used to deal with ->mnt_count decrements 727 * The locking used to deal with mnt_count decrement provides barriers,
644 * provides barriers, so count_mnt_writers() below is safe. AV 728 * so mnt_get_writers() below is safe.
645 */ 729 */
646 WARN_ON(count_mnt_writers(mnt)); 730 WARN_ON(mnt_get_writers(mnt));
647 fsnotify_vfsmount_delete(mnt); 731 fsnotify_vfsmount_delete(mnt);
648 dput(mnt->mnt_root); 732 dput(mnt->mnt_root);
649 free_vfsmnt(mnt); 733 free_vfsmnt(mnt);
650 deactivate_super(sb); 734 deactivate_super(sb);
651} 735}
652 736
653void mntput_no_expire(struct vfsmount *mnt) 737#ifdef CONFIG_SMP
654{ 738static inline void __mntput(struct vfsmount *mnt, int longrefs)
655repeat: 739{
656 if (atomic_add_unless(&mnt->mnt_count, -1, 1)) 740 if (!longrefs) {
657 return; 741put_again:
742 br_read_lock(vfsmount_lock);
743 if (likely(atomic_read(&mnt->mnt_longrefs))) {
744 mnt_dec_count(mnt);
745 br_read_unlock(vfsmount_lock);
746 return;
747 }
748 br_read_unlock(vfsmount_lock);
749 } else {
750 BUG_ON(!atomic_read(&mnt->mnt_longrefs));
751 if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
752 return;
753 }
754
658 br_write_lock(vfsmount_lock); 755 br_write_lock(vfsmount_lock);
659 if (!atomic_dec_and_test(&mnt->mnt_count)) { 756 if (!longrefs)
757 mnt_dec_count(mnt);
758 else
759 atomic_dec(&mnt->mnt_longrefs);
760 if (mnt_get_count(mnt)) {
660 br_write_unlock(vfsmount_lock); 761 br_write_unlock(vfsmount_lock);
661 return; 762 return;
662 } 763 }
663 if (likely(!mnt->mnt_pinned)) { 764 if (unlikely(mnt->mnt_pinned)) {
765 mnt_add_count(mnt, mnt->mnt_pinned + 1);
766 mnt->mnt_pinned = 0;
664 br_write_unlock(vfsmount_lock); 767 br_write_unlock(vfsmount_lock);
665 __mntput(mnt); 768 acct_auto_close_mnt(mnt);
769 goto put_again;
770 }
771 br_write_unlock(vfsmount_lock);
772 mntfree(mnt);
773}
774#else
775static inline void __mntput(struct vfsmount *mnt, int longrefs)
776{
777put_again:
778 mnt_dec_count(mnt);
779 if (likely(mnt_get_count(mnt)))
666 return; 780 return;
781 br_write_lock(vfsmount_lock);
782 if (unlikely(mnt->mnt_pinned)) {
783 mnt_add_count(mnt, mnt->mnt_pinned + 1);
784 mnt->mnt_pinned = 0;
785 br_write_unlock(vfsmount_lock);
786 acct_auto_close_mnt(mnt);
787 goto put_again;
667 } 788 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock); 789 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt); 790 mntfree(mnt);
672 goto repeat; 791}
792#endif
793
794static void mntput_no_expire(struct vfsmount *mnt)
795{
796 __mntput(mnt, 0);
797}
798
799void mntput(struct vfsmount *mnt)
800{
801 if (mnt) {
802 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
803 if (unlikely(mnt->mnt_expiry_mark))
804 mnt->mnt_expiry_mark = 0;
805 __mntput(mnt, 0);
806 }
807}
808EXPORT_SYMBOL(mntput);
809
810struct vfsmount *mntget(struct vfsmount *mnt)
811{
812 if (mnt)
813 mnt_inc_count(mnt);
814 return mnt;
673} 815}
674EXPORT_SYMBOL(mntput_no_expire); 816EXPORT_SYMBOL(mntget);
817
818void mntput_long(struct vfsmount *mnt)
819{
820#ifdef CONFIG_SMP
821 if (mnt) {
822 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
823 if (unlikely(mnt->mnt_expiry_mark))
824 mnt->mnt_expiry_mark = 0;
825 __mntput(mnt, 1);
826 }
827#else
828 mntput(mnt);
829#endif
830}
831EXPORT_SYMBOL(mntput_long);
832
833struct vfsmount *mntget_long(struct vfsmount *mnt)
834{
835#ifdef CONFIG_SMP
836 if (mnt)
837 atomic_inc(&mnt->mnt_longrefs);
838 return mnt;
839#else
840 return mntget(mnt);
841#endif
842}
843EXPORT_SYMBOL(mntget_long);
675 844
676void mnt_pin(struct vfsmount *mnt) 845void mnt_pin(struct vfsmount *mnt)
677{ 846{
@@ -679,19 +848,17 @@ void mnt_pin(struct vfsmount *mnt)
679 mnt->mnt_pinned++; 848 mnt->mnt_pinned++;
680 br_write_unlock(vfsmount_lock); 849 br_write_unlock(vfsmount_lock);
681} 850}
682
683EXPORT_SYMBOL(mnt_pin); 851EXPORT_SYMBOL(mnt_pin);
684 852
685void mnt_unpin(struct vfsmount *mnt) 853void mnt_unpin(struct vfsmount *mnt)
686{ 854{
687 br_write_lock(vfsmount_lock); 855 br_write_lock(vfsmount_lock);
688 if (mnt->mnt_pinned) { 856 if (mnt->mnt_pinned) {
689 atomic_inc(&mnt->mnt_count); 857 mnt_inc_count(mnt);
690 mnt->mnt_pinned--; 858 mnt->mnt_pinned--;
691 } 859 }
692 br_write_unlock(vfsmount_lock); 860 br_write_unlock(vfsmount_lock);
693} 861}
694
695EXPORT_SYMBOL(mnt_unpin); 862EXPORT_SYMBOL(mnt_unpin);
696 863
697static inline void mangle(struct seq_file *m, const char *s) 864static inline void mangle(struct seq_file *m, const char *s)
@@ -986,12 +1153,13 @@ int may_umount_tree(struct vfsmount *mnt)
986 int minimum_refs = 0; 1153 int minimum_refs = 0;
987 struct vfsmount *p; 1154 struct vfsmount *p;
988 1155
989 br_read_lock(vfsmount_lock); 1156 /* write lock needed for mnt_get_count */
1157 br_write_lock(vfsmount_lock);
990 for (p = mnt; p; p = next_mnt(p, mnt)) { 1158 for (p = mnt; p; p = next_mnt(p, mnt)) {
991 actual_refs += atomic_read(&p->mnt_count); 1159 actual_refs += mnt_get_count(p);
992 minimum_refs += 2; 1160 minimum_refs += 2;
993 } 1161 }
994 br_read_unlock(vfsmount_lock); 1162 br_write_unlock(vfsmount_lock);
995 1163
996 if (actual_refs > minimum_refs) 1164 if (actual_refs > minimum_refs)
997 return 0; 1165 return 0;
@@ -1018,10 +1186,10 @@ int may_umount(struct vfsmount *mnt)
1018{ 1186{
1019 int ret = 1; 1187 int ret = 1;
1020 down_read(&namespace_sem); 1188 down_read(&namespace_sem);
1021 br_read_lock(vfsmount_lock); 1189 br_write_lock(vfsmount_lock);
1022 if (propagate_mount_busy(mnt, 2)) 1190 if (propagate_mount_busy(mnt, 2))
1023 ret = 0; 1191 ret = 0;
1024 br_read_unlock(vfsmount_lock); 1192 br_write_unlock(vfsmount_lock);
1025 up_read(&namespace_sem); 1193 up_read(&namespace_sem);
1026 return ret; 1194 return ret;
1027} 1195}
@@ -1048,7 +1216,7 @@ void release_mounts(struct list_head *head)
1048 dput(dentry); 1216 dput(dentry);
1049 mntput(m); 1217 mntput(m);
1050 } 1218 }
1051 mntput(mnt); 1219 mntput_long(mnt);
1052 } 1220 }
1053} 1221}
1054 1222
@@ -1074,7 +1242,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1074 list_del_init(&p->mnt_child); 1242 list_del_init(&p->mnt_child);
1075 if (p->mnt_parent != p) { 1243 if (p->mnt_parent != p) {
1076 p->mnt_parent->mnt_ghosts++; 1244 p->mnt_parent->mnt_ghosts++;
1077 p->mnt_mountpoint->d_mounted--; 1245 dentry_reset_mounted(p->mnt_parent, p->mnt_mountpoint);
1078 } 1246 }
1079 change_mnt_propagation(p, MS_PRIVATE); 1247 change_mnt_propagation(p, MS_PRIVATE);
1080 } 1248 }
@@ -1103,8 +1271,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
1103 flags & (MNT_FORCE | MNT_DETACH)) 1271 flags & (MNT_FORCE | MNT_DETACH))
1104 return -EINVAL; 1272 return -EINVAL;
1105 1273
1106 if (atomic_read(&mnt->mnt_count) != 2) 1274 /*
1275 * probably don't strictly need the lock here if we examined
1276 * all race cases, but it's a slowpath.
1277 */
1278 br_write_lock(vfsmount_lock);
1279 if (mnt_get_count(mnt) != 2) {
 1280 br_write_unlock(vfsmount_lock);
1107 return -EBUSY; 1281 return -EBUSY;
1282 }
1283 br_write_unlock(vfsmount_lock);
1108 1284
1109 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1285 if (!xchg(&mnt->mnt_expiry_mark, 1))
1110 return -EAGAIN; 1286 return -EAGAIN;
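/*
 * Illustrative aside, not from the patch: the xchg() above is the
 * two-pass expiry handshake.  The first attempt only plants
 * mnt_expiry_mark and reports -EAGAIN; mntput() clears the mark on any
 * intervening use (see the mntput() hunk above), so a later attempt only
 * proceeds if the mount stayed idle.  Minimal sketch of the idiom on a
 * hypothetical flag:
 */
static int example_two_pass_expire(unsigned long *mark)
{
	if (!xchg(mark, 1))
		return -EAGAIN;	/* first pass: just set the mark */
	return 0;		/* mark survived untouched: go ahead */
}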
@@ -1793,7 +1969,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1793 1969
1794unlock: 1970unlock:
1795 up_write(&namespace_sem); 1971 up_write(&namespace_sem);
1796 mntput(newmnt); 1972 mntput_long(newmnt);
1797 return err; 1973 return err;
1798} 1974}
1799 1975
@@ -2126,11 +2302,11 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2126 if (fs) { 2302 if (fs) {
2127 if (p == fs->root.mnt) { 2303 if (p == fs->root.mnt) {
2128 rootmnt = p; 2304 rootmnt = p;
2129 fs->root.mnt = mntget(q); 2305 fs->root.mnt = mntget_long(q);
2130 } 2306 }
2131 if (p == fs->pwd.mnt) { 2307 if (p == fs->pwd.mnt) {
2132 pwdmnt = p; 2308 pwdmnt = p;
2133 fs->pwd.mnt = mntget(q); 2309 fs->pwd.mnt = mntget_long(q);
2134 } 2310 }
2135 } 2311 }
2136 p = next_mnt(p, mnt_ns->root); 2312 p = next_mnt(p, mnt_ns->root);
@@ -2139,9 +2315,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2139 up_write(&namespace_sem); 2315 up_write(&namespace_sem);
2140 2316
2141 if (rootmnt) 2317 if (rootmnt)
2142 mntput(rootmnt); 2318 mntput_long(rootmnt);
2143 if (pwdmnt) 2319 if (pwdmnt)
2144 mntput(pwdmnt); 2320 mntput_long(pwdmnt);
2145 2321
2146 return new_ns; 2322 return new_ns;
2147} 2323}
@@ -2328,6 +2504,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2328 touch_mnt_namespace(current->nsproxy->mnt_ns); 2504 touch_mnt_namespace(current->nsproxy->mnt_ns);
2329 br_write_unlock(vfsmount_lock); 2505 br_write_unlock(vfsmount_lock);
2330 chroot_fs_refs(&root, &new); 2506 chroot_fs_refs(&root, &new);
2507
2331 error = 0; 2508 error = 0;
2332 path_put(&root_parent); 2509 path_put(&root_parent);
2333 path_put(&parent_path); 2510 path_put(&parent_path);
@@ -2354,6 +2531,7 @@ static void __init init_mount_tree(void)
2354 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2531 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2355 if (IS_ERR(mnt)) 2532 if (IS_ERR(mnt))
2356 panic("Can't create rootfs"); 2533 panic("Can't create rootfs");
2534
2357 ns = create_mnt_ns(mnt); 2535 ns = create_mnt_ns(mnt);
2358 if (IS_ERR(ns)) 2536 if (IS_ERR(ns))
2359 panic("Can't allocate initial namespace"); 2537 panic("Can't allocate initial namespace");
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aac8832e919e..28f136d4aaec 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -17,9 +17,9 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/namei.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/byteorder.h> 22#include <asm/byteorder.h>
22#include <linux/smp_lock.h>
23 23
24#include <linux/ncp_fs.h> 24#include <linux/ncp_fs.h>
25 25
@@ -75,9 +75,12 @@ const struct inode_operations ncp_dir_inode_operations =
75 * Dentry operations routines 75 * Dentry operations routines
76 */ 76 */
77static int ncp_lookup_validate(struct dentry *, struct nameidata *); 77static int ncp_lookup_validate(struct dentry *, struct nameidata *);
78static int ncp_hash_dentry(struct dentry *, struct qstr *); 78static int ncp_hash_dentry(const struct dentry *, const struct inode *,
79static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *); 79 struct qstr *);
80static int ncp_delete_dentry(struct dentry *); 80static int ncp_compare_dentry(const struct dentry *, const struct inode *,
81 const struct dentry *, const struct inode *,
82 unsigned int, const char *, const struct qstr *);
83static int ncp_delete_dentry(const struct dentry *);
81 84
82static const struct dentry_operations ncp_dentry_operations = 85static const struct dentry_operations ncp_dentry_operations =
83{ 86{
@@ -114,10 +117,10 @@ static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
114 117
115#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS) 118#define ncp_preserve_case(i) (ncp_namespace(i) != NW_NS_DOS)
116 119
117static inline int ncp_case_sensitive(struct dentry *dentry) 120static inline int ncp_case_sensitive(const struct inode *i)
118{ 121{
119#ifdef CONFIG_NCPFS_NFS_NS 122#ifdef CONFIG_NCPFS_NFS_NS
120 return ncp_namespace(dentry->d_inode) == NW_NS_NFS; 123 return ncp_namespace(i) == NW_NS_NFS;
121#else 124#else
122 return 0; 125 return 0;
123#endif /* CONFIG_NCPFS_NFS_NS */ 126#endif /* CONFIG_NCPFS_NFS_NS */
@@ -128,14 +131,16 @@ static inline int ncp_case_sensitive(struct dentry *dentry)
128 * is case-sensitive. 131 * is case-sensitive.
129 */ 132 */
130static int 133static int
131ncp_hash_dentry(struct dentry *dentry, struct qstr *this) 134ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
135 struct qstr *this)
132{ 136{
133 if (!ncp_case_sensitive(dentry)) { 137 if (!ncp_case_sensitive(inode)) {
138 struct super_block *sb = dentry->d_sb;
134 struct nls_table *t; 139 struct nls_table *t;
135 unsigned long hash; 140 unsigned long hash;
136 int i; 141 int i;
137 142
138 t = NCP_IO_TABLE(dentry); 143 t = NCP_IO_TABLE(sb);
139 hash = init_name_hash(); 144 hash = init_name_hash();
140 for (i=0; i<this->len ; i++) 145 for (i=0; i<this->len ; i++)
141 hash = partial_name_hash(ncp_tolower(t, this->name[i]), 146 hash = partial_name_hash(ncp_tolower(t, this->name[i]),
@@ -146,15 +151,17 @@ ncp_hash_dentry(struct dentry *dentry, struct qstr *this)
146} 151}
147 152
148static int 153static int
149ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b) 154ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
155 const struct dentry *dentry, const struct inode *inode,
156 unsigned int len, const char *str, const struct qstr *name)
150{ 157{
151 if (a->len != b->len) 158 if (len != name->len)
152 return 1; 159 return 1;
153 160
154 if (ncp_case_sensitive(dentry)) 161 if (ncp_case_sensitive(pinode))
155 return strncmp(a->name, b->name, a->len); 162 return strncmp(str, name->name, len);
156 163
157 return ncp_strnicmp(NCP_IO_TABLE(dentry), a->name, b->name, a->len); 164 return ncp_strnicmp(NCP_IO_TABLE(pinode->i_sb), str, name->name, len);
158} 165}
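/*
 * Illustrative aside, not from the patch: a case-insensitive d_compare
 * only works if d_hash folds case the same way, so that names which
 * compare equal also hash equally -- which is what the two ncpfs hooks
 * above arrange via ncp_tolower().  Stand-alone sketch of that pairing
 * using the generic VFS name-hash helpers:
 */
static unsigned long example_ci_hash(const unsigned char *name, unsigned int len)
{
	unsigned long hash = init_name_hash();

	while (len--)
		hash = partial_name_hash(tolower(*name++), hash);
	return end_name_hash(hash);
}

static int example_ci_equal(const unsigned char *a, const unsigned char *b,
			    unsigned int len)
{
	while (len--)
		if (tolower(*a++) != tolower(*b++))
			return 0;
	return 1;
}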
159 166
160/* 167/*
@@ -163,7 +170,7 @@ ncp_compare_dentry(struct dentry *dentry, struct qstr *a, struct qstr *b)
163 * Closing files can be safely postponed until iput() - it's done there anyway. 170 * Closing files can be safely postponed until iput() - it's done there anyway.
164 */ 171 */
165static int 172static int
166ncp_delete_dentry(struct dentry * dentry) 173ncp_delete_dentry(const struct dentry * dentry)
167{ 174{
168 struct inode *inode = dentry->d_inode; 175 struct inode *inode = dentry->d_inode;
169 176
@@ -302,6 +309,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
302 int res, val = 0, len; 309 int res, val = 0, len;
303 __u8 __name[NCP_MAXPATHLEN + 1]; 310 __u8 __name[NCP_MAXPATHLEN + 1];
304 311
312 if (nd->flags & LOOKUP_RCU)
313 return -ECHILD;
314
305 parent = dget_parent(dentry); 315 parent = dget_parent(dentry);
306 dir = parent->d_inode; 316 dir = parent->d_inode;
307 317
@@ -385,21 +395,21 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
385 } 395 }
386 396
387 /* If a pointer is invalid, we search the dentry. */ 397 /* If a pointer is invalid, we search the dentry. */
388 spin_lock(&dcache_lock); 398 spin_lock(&parent->d_lock);
389 next = parent->d_subdirs.next; 399 next = parent->d_subdirs.next;
390 while (next != &parent->d_subdirs) { 400 while (next != &parent->d_subdirs) {
391 dent = list_entry(next, struct dentry, d_u.d_child); 401 dent = list_entry(next, struct dentry, d_u.d_child);
392 if ((unsigned long)dent->d_fsdata == fpos) { 402 if ((unsigned long)dent->d_fsdata == fpos) {
393 if (dent->d_inode) 403 if (dent->d_inode)
394 dget_locked(dent); 404 dget(dent);
395 else 405 else
396 dent = NULL; 406 dent = NULL;
397 spin_unlock(&dcache_lock); 407 spin_unlock(&parent->d_lock);
398 goto out; 408 goto out;
399 } 409 }
400 next = next->next; 410 next = next->next;
401 } 411 }
402 spin_unlock(&dcache_lock); 412 spin_unlock(&parent->d_lock);
403 return NULL; 413 return NULL;
404 414
405out: 415out:
@@ -593,7 +603,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
593 qname.hash = full_name_hash(qname.name, qname.len); 603 qname.hash = full_name_hash(qname.name, qname.len);
594 604
595 if (dentry->d_op && dentry->d_op->d_hash) 605 if (dentry->d_op && dentry->d_op->d_hash)
596 if (dentry->d_op->d_hash(dentry, &qname) != 0) 606 if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
597 goto end_advance; 607 goto end_advance;
598 608
599 newdent = d_lookup(dentry, &qname); 609 newdent = d_lookup(dentry, &qname);
@@ -612,35 +622,12 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
612 shrink_dcache_parent(newdent); 622 shrink_dcache_parent(newdent);
613 623
614 /* 624 /*
615 * It is not as dangerous as it looks. NetWare's OS2 namespace is 625 * NetWare's OS2 namespace is case preserving yet case
616 * case preserving yet case insensitive. So we update dentry's name 626 * insensitive. So we update dentry's name as received from
617 * as received from server. We found dentry via d_lookup with our 627 * server. Parent dir's i_mutex is locked because we're in
618 * hash, so we know that hash does not change, and so replacing name 628 * readdir.
619 * should be reasonably safe.
620 */ 629 */
621 if (qname.len == newdent->d_name.len && 630 dentry_update_name_case(newdent, &qname);
622 memcmp(newdent->d_name.name, qname.name, newdent->d_name.len)) {
623 struct inode *inode = newdent->d_inode;
624
625 /*
626 * Inside ncpfs all uses of d_name are either for debugging,
627 * or on functions which acquire inode mutex (mknod, creat,
628 * lookup). So grab i_mutex here, to be sure. d_path
629 * uses dcache_lock when generating path, so we should too.
630 * And finally d_compare is protected by dentry's d_lock, so
631 * here we go.
632 */
633 if (inode)
634 mutex_lock(&inode->i_mutex);
635 spin_lock(&dcache_lock);
636 spin_lock(&newdent->d_lock);
637 memcpy((char *) newdent->d_name.name, qname.name,
638 newdent->d_name.len);
639 spin_unlock(&newdent->d_lock);
640 spin_unlock(&dcache_lock);
641 if (inode)
642 mutex_unlock(&inode->i_mutex);
643 }
644 } 631 }
645 632
646 if (!newdent->d_inode) { 633 if (!newdent->d_inode) {
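/*
 * Illustrative aside, not from the patch: dentry_update_name_case(),
 * called a few lines above, re-spells an already-hashed dentry in place.
 * Conceptual sketch with a hypothetical helper; the real one lives in
 * fs/dcache.c and, as the comment above notes, additionally relies on the
 * parent directory's i_mutex being held by the readdir caller.
 */
static void example_respell_dentry(struct dentry *dentry, const struct qstr *name)
{
	spin_lock(&dentry->d_lock);
	if (name->len == dentry->d_name.len)
		/* only the case changes, so the case-folding d_hash is unaffected */
		memcpy((unsigned char *)dentry->d_name.name, name->name, name->len);
	spin_unlock(&dentry->d_lock);
}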
@@ -650,7 +637,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
650 entry->ino = iunique(dir->i_sb, 2); 637 entry->ino = iunique(dir->i_sb, 2);
651 inode = ncp_iget(dir->i_sb, entry); 638 inode = ncp_iget(dir->i_sb, entry);
652 if (inode) { 639 if (inode) {
653 newdent->d_op = &ncp_dentry_operations; 640 d_set_d_op(newdent, &ncp_dentry_operations);
654 d_instantiate(newdent, inode); 641 d_instantiate(newdent, inode);
655 if (!hashed) 642 if (!hashed)
656 d_rehash(newdent); 643 d_rehash(newdent);
@@ -658,7 +645,7 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
658 } else { 645 } else {
659 struct inode *inode = newdent->d_inode; 646 struct inode *inode = newdent->d_inode;
660 647
661 mutex_lock(&inode->i_mutex); 648 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
662 ncp_update_inode2(inode, entry); 649 ncp_update_inode2(inode, entry);
663 mutex_unlock(&inode->i_mutex); 650 mutex_unlock(&inode->i_mutex);
664 } 651 }
@@ -906,7 +893,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
906 if (inode) { 893 if (inode) {
907 ncp_new_dentry(dentry); 894 ncp_new_dentry(dentry);
908add_entry: 895add_entry:
909 dentry->d_op = &ncp_dentry_operations; 896 d_set_d_op(dentry, &ncp_dentry_operations);
910 d_add(dentry, inode); 897 d_add(dentry, inode);
911 error = 0; 898 error = 0;
912 } 899 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6c754f70c529..cb50aaf981df 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -17,7 +17,6 @@
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/smp_lock.h>
21 20
22#include <linux/ncp_fs.h> 21#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 22#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d290545aa0c4..9b39a5dd4131 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -26,10 +26,10 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/smp_lock.h>
30#include <linux/vfs.h> 29#include <linux/vfs.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/seq_file.h> 31#include <linux/seq_file.h>
32#include <linux/namei.h>
33 33
34#include <linux/ncp_fs.h> 34#include <linux/ncp_fs.h>
35 35
@@ -59,11 +59,18 @@ static struct inode *ncp_alloc_inode(struct super_block *sb)
59 return &ei->vfs_inode; 59 return &ei->vfs_inode;
60} 60}
61 61
62static void ncp_destroy_inode(struct inode *inode) 62static void ncp_i_callback(struct rcu_head *head)
63{ 63{
64 struct inode *inode = container_of(head, struct inode, i_rcu);
65 INIT_LIST_HEAD(&inode->i_dentry);
64 kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); 66 kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
65} 67}
66 68
69static void ncp_destroy_inode(struct inode *inode)
70{
71 call_rcu(&inode->i_rcu, ncp_i_callback);
72}
73
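/*
 * Illustrative aside, not from the patch: the ncp_i_callback() pattern
 * above defers the slab free by one RCU grace period, which is what lets
 * lock-free path walking peek at an inode it reached under
 * rcu_read_lock().  Generic shape of the pattern, with a hypothetical
 * container type and cache:
 */
struct example_inode_info {
	/* filesystem-private fields would precede this */
	struct inode vfs_inode;
};

static struct kmem_cache *example_inode_cachep;

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(example_inode_cachep,
			container_of(inode, struct example_inode_info, vfs_inode));
}

static void example_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, example_i_callback);
}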
67static void init_once(void *foo) 74static void init_once(void *foo)
68{ 75{
69 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; 76 struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
@@ -310,7 +317,12 @@ static void ncp_stop_tasks(struct ncp_server *server) {
310 sk->sk_write_space = server->write_space; 317 sk->sk_write_space = server->write_space;
311 release_sock(sk); 318 release_sock(sk);
312 del_timer_sync(&server->timeout_tm); 319 del_timer_sync(&server->timeout_tm);
313 flush_scheduled_work(); 320
321 flush_work_sync(&server->rcv.tq);
322 if (sk->sk_socket->type == SOCK_STREAM)
323 flush_work_sync(&server->tx.tq);
324 else
325 flush_work_sync(&server->timeout_tq);
314} 326}
315 327
316static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) 328static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -711,7 +723,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
711 sb->s_root = d_alloc_root(root_inode); 723 sb->s_root = d_alloc_root(root_inode);
712 if (!sb->s_root) 724 if (!sb->s_root)
713 goto out_no_root; 725 goto out_no_root;
714 sb->s_root->d_op = &ncp_root_dentry_operations; 726 d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
715 return 0; 727 return 0;
716 728
717out_no_root: 729out_no_root:
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c2a1f9a155c3..d40a547e3377 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -17,7 +17,6 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/highuid.h> 19#include <linux/highuid.h>
20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23 22
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 3c57eca634ce..1220df75ff22 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -135,7 +135,7 @@ int ncp__vol2io(struct ncp_server *, unsigned char *, unsigned int *,
135 const unsigned char *, unsigned int, int); 135 const unsigned char *, unsigned int, int);
136 136
137#define NCP_ESC ':' 137#define NCP_ESC ':'
138#define NCP_IO_TABLE(dentry) (NCP_SERVER((dentry)->d_inode)->nls_io) 138#define NCP_IO_TABLE(sb) (NCP_SBP(sb)->nls_io)
139#define ncp_tolower(t, c) nls_tolower(t, c) 139#define ncp_tolower(t, c) nls_tolower(t, c)
140#define ncp_toupper(t, c) nls_toupper(t, c) 140#define ncp_toupper(t, c) nls_toupper(t, c)
141#define ncp_strnicmp(t, s1, s2, len) \ 141#define ncp_strnicmp(t, s1, s2, len) \
@@ -150,15 +150,15 @@ int ncp__io2vol(unsigned char *, unsigned int *,
150int ncp__vol2io(unsigned char *, unsigned int *, 150int ncp__vol2io(unsigned char *, unsigned int *,
151 const unsigned char *, unsigned int, int); 151 const unsigned char *, unsigned int, int);
152 152
153#define NCP_IO_TABLE(dentry) NULL 153#define NCP_IO_TABLE(sb) NULL
154#define ncp_tolower(t, c) tolower(c) 154#define ncp_tolower(t, c) tolower(c)
155#define ncp_toupper(t, c) toupper(c) 155#define ncp_toupper(t, c) toupper(c)
156#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U) 156#define ncp_io2vol(S,m,i,n,k,U) ncp__io2vol(m,i,n,k,U)
157#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U) 157#define ncp_vol2io(S,m,i,n,k,U) ncp__vol2io(m,i,n,k,U)
158 158
159 159
160static inline int ncp_strnicmp(struct nls_table *t, const unsigned char *s1, 160static inline int ncp_strnicmp(const struct nls_table *t,
161 const unsigned char *s2, int len) 161 const unsigned char *s1, const unsigned char *s2, int len)
162{ 162{
163 while (len--) { 163 while (len--) {
164 if (tolower(*s1++) != tolower(*s2++)) 164 if (tolower(*s1++) != tolower(*s2++))
@@ -193,7 +193,7 @@ ncp_renew_dentries(struct dentry *parent)
193 struct list_head *next; 193 struct list_head *next;
194 struct dentry *dentry; 194 struct dentry *dentry;
195 195
196 spin_lock(&dcache_lock); 196 spin_lock(&parent->d_lock);
197 next = parent->d_subdirs.next; 197 next = parent->d_subdirs.next;
198 while (next != &parent->d_subdirs) { 198 while (next != &parent->d_subdirs) {
199 dentry = list_entry(next, struct dentry, d_u.d_child); 199 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -205,7 +205,7 @@ ncp_renew_dentries(struct dentry *parent)
205 205
206 next = next->next; 206 next = next->next;
207 } 207 }
208 spin_unlock(&dcache_lock); 208 spin_unlock(&parent->d_lock);
209} 209}
210 210
211static inline void 211static inline void
@@ -215,7 +215,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
215 struct list_head *next; 215 struct list_head *next;
216 struct dentry *dentry; 216 struct dentry *dentry;
217 217
218 spin_lock(&dcache_lock); 218 spin_lock(&parent->d_lock);
219 next = parent->d_subdirs.next; 219 next = parent->d_subdirs.next;
220 while (next != &parent->d_subdirs) { 220 while (next != &parent->d_subdirs) {
221 dentry = list_entry(next, struct dentry, d_u.d_child); 221 dentry = list_entry(next, struct dentry, d_u.d_child);
@@ -223,7 +223,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
223 ncp_age_dentry(server, dentry); 223 ncp_age_dentry(server, dentry);
224 next = next->next; 224 next = next->next;
225 } 225 }
226 spin_unlock(&dcache_lock); 226 spin_unlock(&parent->d_lock);
227} 227}
228 228
229struct ncp_cache_head { 229struct ncp_cache_head {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index aeec017fe814..93a8b3bd69e3 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -9,7 +9,6 @@
9#include <linux/completion.h> 9#include <linux/completion.h>
10#include <linux/ip.h> 10#include <linux/ip.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/smp_lock.h>
13#include <linux/sunrpc/svc.h> 12#include <linux/sunrpc/svc.h>
14#include <linux/sunrpc/svcsock.h> 13#include <linux/sunrpc/svcsock.h>
15#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 232a7eead33a..1fd62fc49be3 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/smp_lock.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16 15
17#include <linux/nfs4.h> 16#include <linux/nfs4.h>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07ac3847e562..d33da530097a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -34,6 +34,7 @@
34#include <linux/mount.h> 34#include <linux/mount.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
37#include <linux/kmemleak.h>
37 38
38#include "delegation.h" 39#include "delegation.h"
39#include "iostat.h" 40#include "iostat.h"
@@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *,
56 struct inode *, struct dentry *); 57 struct inode *, struct dentry *);
57static int nfs_fsync_dir(struct file *, int); 58static int nfs_fsync_dir(struct file *, int);
58static loff_t nfs_llseek_dir(struct file *, loff_t, int); 59static loff_t nfs_llseek_dir(struct file *, loff_t, int);
59static int nfs_readdir_clear_array(struct page*, gfp_t); 60static void nfs_readdir_clear_array(struct page*);
60 61
61const struct file_operations nfs_dir_operations = { 62const struct file_operations nfs_dir_operations = {
62 .llseek = nfs_llseek_dir, 63 .llseek = nfs_llseek_dir,
@@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = {
82 .setattr = nfs_setattr, 83 .setattr = nfs_setattr,
83}; 84};
84 85
85const struct address_space_operations nfs_dir_addr_space_ops = { 86const struct address_space_operations nfs_dir_aops = {
86 .releasepage = nfs_readdir_clear_array, 87 .freepage = nfs_readdir_clear_array,
87}; 88};
88 89
89#ifdef CONFIG_NFS_V3 90#ifdef CONFIG_NFS_V3
@@ -161,6 +162,7 @@ struct nfs_cache_array_entry {
161 u64 cookie; 162 u64 cookie;
162 u64 ino; 163 u64 ino;
163 struct qstr string; 164 struct qstr string;
165 unsigned char d_type;
164}; 166};
165 167
166struct nfs_cache_array { 168struct nfs_cache_array {
@@ -170,14 +172,13 @@ struct nfs_cache_array {
170 struct nfs_cache_array_entry array[0]; 172 struct nfs_cache_array_entry array[0];
171}; 173};
172 174
173#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry))
174
175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); 175typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);
176typedef struct { 176typedef struct {
177 struct file *file; 177 struct file *file;
178 struct page *page; 178 struct page *page;
179 unsigned long page_index; 179 unsigned long page_index;
180 u64 *dir_cookie; 180 u64 *dir_cookie;
181 u64 last_cookie;
181 loff_t current_index; 182 loff_t current_index;
182 decode_dirent_t decode; 183 decode_dirent_t decode;
183 184
@@ -194,9 +195,13 @@ typedef struct {
194static 195static
195struct nfs_cache_array *nfs_readdir_get_array(struct page *page) 196struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
196{ 197{
198 void *ptr;
197 if (page == NULL) 199 if (page == NULL)
198 return ERR_PTR(-EIO); 200 return ERR_PTR(-EIO);
199 return (struct nfs_cache_array *)kmap(page); 201 ptr = kmap(page);
202 if (ptr == NULL)
203 return ERR_PTR(-ENOMEM);
204 return ptr;
200} 205}
201 206
202static 207static
@@ -209,14 +214,15 @@ void nfs_readdir_release_array(struct page *page)
209 * we are freeing strings created by nfs_add_to_readdir_array() 214 * we are freeing strings created by nfs_add_to_readdir_array()
210 */ 215 */
211static 216static
212int nfs_readdir_clear_array(struct page *page, gfp_t mask) 217void nfs_readdir_clear_array(struct page *page)
213{ 218{
214 struct nfs_cache_array *array = nfs_readdir_get_array(page); 219 struct nfs_cache_array *array;
215 int i; 220 int i;
221
222 array = kmap_atomic(page, KM_USER0);
216 for (i = 0; i < array->size; i++) 223 for (i = 0; i < array->size; i++)
217 kfree(array->array[i].string.name); 224 kfree(array->array[i].string.name);
218 nfs_readdir_release_array(page); 225 kunmap_atomic(array, KM_USER0);
219 return 0;
220} 226}
221 227
222/* 228/*
@@ -231,6 +237,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
231 string->name = kmemdup(name, len, GFP_KERNEL); 237 string->name = kmemdup(name, len, GFP_KERNEL);
232 if (string->name == NULL) 238 if (string->name == NULL)
233 return -ENOMEM; 239 return -ENOMEM;
240 /*
241 * Avoid a kmemleak false positive. The pointer to the name is stored
242 * in a page cache page which kmemleak does not scan.
243 */
244 kmemleak_not_leak(string->name);
234 string->hash = full_name_hash(name, len); 245 string->hash = full_name_hash(name, len);
235 return 0; 246 return 0;
236} 247}
@@ -244,20 +255,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
244 255
245 if (IS_ERR(array)) 256 if (IS_ERR(array))
246 return PTR_ERR(array); 257 return PTR_ERR(array);
247 ret = -EIO;
248 if (array->size >= MAX_READDIR_ARRAY)
249 goto out;
250 258
251 cache_entry = &array->array[array->size]; 259 cache_entry = &array->array[array->size];
260
261 /* Check that this entry lies within the page bounds */
262 ret = -ENOSPC;
263 if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
264 goto out;
265
252 cache_entry->cookie = entry->prev_cookie; 266 cache_entry->cookie = entry->prev_cookie;
253 cache_entry->ino = entry->ino; 267 cache_entry->ino = entry->ino;
268 cache_entry->d_type = entry->d_type;
254 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); 269 ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
255 if (ret) 270 if (ret)
256 goto out; 271 goto out;
257 array->last_cookie = entry->cookie; 272 array->last_cookie = entry->cookie;
258 if (entry->eof == 1)
259 array->eof_index = array->size;
260 array->size++; 273 array->size++;
274 if (entry->eof != 0)
275 array->eof_index = array->size;
261out: 276out:
262 nfs_readdir_release_array(page); 277 nfs_readdir_release_array(page);
263 return ret; 278 return ret;
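/*
 * Illustrative aside, not from the patch: the -ENOSPC test above asks
 * "does the end of the would-be entry still lie inside the page the
 * array is kmapped from?".  Same check written as a predicate, plus a
 * worked instance with made-up sizes (4096-byte page, 40-byte array
 * header, 56-byte entries): entry n fits iff 40 + (n + 1) * 56 <= 4096,
 * i.e. n <= 71, after which the caller simply starts a new page.
 */
static int example_entry_fits(struct page *page,
			      struct nfs_cache_array_entry *cache_entry)
{
	return (char *)&cache_entry[1] - (char *)page_address(page) <= PAGE_SIZE;
}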
@@ -272,7 +287,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
272 if (diff < 0) 287 if (diff < 0)
273 goto out_eof; 288 goto out_eof;
274 if (diff >= array->size) { 289 if (diff >= array->size) {
275 if (array->eof_index > 0) 290 if (array->eof_index >= 0)
276 goto out_eof; 291 goto out_eof;
277 desc->current_index += array->size; 292 desc->current_index += array->size;
278 return -EAGAIN; 293 return -EAGAIN;
@@ -281,8 +296,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
281 index = (unsigned int)diff; 296 index = (unsigned int)diff;
282 *desc->dir_cookie = array->array[index].cookie; 297 *desc->dir_cookie = array->array[index].cookie;
283 desc->cache_entry_index = index; 298 desc->cache_entry_index = index;
284 if (index == array->eof_index)
285 desc->eof = 1;
286 return 0; 299 return 0;
287out_eof: 300out_eof:
288 desc->eof = 1; 301 desc->eof = 1;
@@ -296,17 +309,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
296 int status = -EAGAIN; 309 int status = -EAGAIN;
297 310
298 for (i = 0; i < array->size; i++) { 311 for (i = 0; i < array->size; i++) {
299 if (i == array->eof_index) {
300 desc->eof = 1;
301 status = -EBADCOOKIE;
302 }
303 if (array->array[i].cookie == *desc->dir_cookie) { 312 if (array->array[i].cookie == *desc->dir_cookie) {
304 desc->cache_entry_index = i; 313 desc->cache_entry_index = i;
305 status = 0; 314 return 0;
306 break;
307 } 315 }
308 } 316 }
309 317 if (array->eof_index >= 0) {
318 status = -EBADCOOKIE;
319 if (*desc->dir_cookie == array->last_cookie)
320 desc->eof = 1;
321 }
310 return status; 322 return status;
311} 323}
312 324
@@ -314,10 +326,7 @@ static
314int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) 326int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
315{ 327{
316 struct nfs_cache_array *array; 328 struct nfs_cache_array *array;
317 int status = -EBADCOOKIE; 329 int status;
318
319 if (desc->dir_cookie == NULL)
320 goto out;
321 330
322 array = nfs_readdir_get_array(desc->page); 331 array = nfs_readdir_get_array(desc->page);
323 if (IS_ERR(array)) { 332 if (IS_ERR(array)) {
@@ -330,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
330 else 339 else
331 status = nfs_readdir_search_for_cookie(array, desc); 340 status = nfs_readdir_search_for_cookie(array, desc);
332 341
342 if (status == -EAGAIN) {
343 desc->last_cookie = array->last_cookie;
344 desc->page_index++;
345 }
333 nfs_readdir_release_array(desc->page); 346 nfs_readdir_release_array(desc->page);
334out: 347out:
335 return status; 348 return status;
@@ -381,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x
381static 394static
382int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) 395int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
383{ 396{
384 struct nfs_inode *node;
385 if (dentry->d_inode == NULL) 397 if (dentry->d_inode == NULL)
386 goto different; 398 goto different;
387 node = NFS_I(dentry->d_inode); 399 if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
388 if (node->fh.size != entry->fh->size)
389 goto different;
390 if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0)
391 goto different; 400 goto different;
392 return 1; 401 return 1;
393different: 402different:
@@ -429,7 +438,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
429 if (dentry == NULL) 438 if (dentry == NULL)
430 return; 439 return;
431 440
432 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 441 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
433 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 442 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
434 if (IS_ERR(inode)) 443 if (IS_ERR(inode))
435 goto out; 444 goto out;
@@ -449,14 +458,15 @@ out:
449 458
450/* Perform conversion from xdr to cache array */ 459/* Perform conversion from xdr to cache array */
451static 460static
452void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, 461int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
453 void *xdr_page, struct page *page, unsigned int buflen) 462 void *xdr_page, struct page *page, unsigned int buflen)
454{ 463{
455 struct xdr_stream stream; 464 struct xdr_stream stream;
456 struct xdr_buf buf; 465 struct xdr_buf buf;
457 __be32 *ptr = xdr_page; 466 __be32 *ptr = xdr_page;
458 int status;
459 struct nfs_cache_array *array; 467 struct nfs_cache_array *array;
468 unsigned int count = 0;
469 int status;
460 470
461 buf.head->iov_base = xdr_page; 471 buf.head->iov_base = xdr_page;
462 buf.head->iov_len = buflen; 472 buf.head->iov_len = buflen;
@@ -471,21 +481,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e
471 481
472 do { 482 do {
473 status = xdr_decode(desc, entry, &stream); 483 status = xdr_decode(desc, entry, &stream);
474 if (status != 0) 484 if (status != 0) {
485 if (status == -EAGAIN)
486 status = 0;
475 break; 487 break;
488 }
476 489
477 if (nfs_readdir_add_to_array(entry, page) == -1) 490 count++;
478 break; 491
479 if (desc->plus == 1) 492 if (desc->plus != 0)
480 nfs_prime_dcache(desc->file->f_path.dentry, entry); 493 nfs_prime_dcache(desc->file->f_path.dentry, entry);
494
495 status = nfs_readdir_add_to_array(entry, page);
496 if (status != 0)
497 break;
481 } while (!entry->eof); 498 } while (!entry->eof);
482 499
483 if (status == -EBADCOOKIE && entry->eof) { 500 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
484 array = nfs_readdir_get_array(page); 501 array = nfs_readdir_get_array(page);
485 array->eof_index = array->size - 1; 502 if (!IS_ERR(array)) {
486 status = 0; 503 array->eof_index = array->size;
487 nfs_readdir_release_array(page); 504 status = 0;
505 nfs_readdir_release_array(page);
506 } else
507 status = PTR_ERR(array);
488 } 508 }
509 return status;
489} 510}
490 511
491static 512static
@@ -537,11 +558,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
537 struct nfs_entry entry; 558 struct nfs_entry entry;
538 struct file *file = desc->file; 559 struct file *file = desc->file;
539 struct nfs_cache_array *array; 560 struct nfs_cache_array *array;
540 int status = 0; 561 int status = -ENOMEM;
541 unsigned int array_size = ARRAY_SIZE(pages); 562 unsigned int array_size = ARRAY_SIZE(pages);
542 563
543 entry.prev_cookie = 0; 564 entry.prev_cookie = 0;
544 entry.cookie = *desc->dir_cookie; 565 entry.cookie = desc->last_cookie;
545 entry.eof = 0; 566 entry.eof = 0;
546 entry.fh = nfs_alloc_fhandle(); 567 entry.fh = nfs_alloc_fhandle();
547 entry.fattr = nfs_alloc_fattr(); 568 entry.fattr = nfs_alloc_fattr();
@@ -549,6 +570,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
549 goto out; 570 goto out;
550 571
551 array = nfs_readdir_get_array(page); 572 array = nfs_readdir_get_array(page);
573 if (IS_ERR(array)) {
574 status = PTR_ERR(array);
575 goto out;
576 }
552 memset(array, 0, sizeof(struct nfs_cache_array)); 577 memset(array, 0, sizeof(struct nfs_cache_array));
553 array->eof_index = -1; 578 array->eof_index = -1;
554 579
@@ -556,12 +581,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
556 if (!pages_ptr) 581 if (!pages_ptr)
557 goto out_release_array; 582 goto out_release_array;
558 do { 583 do {
584 unsigned int pglen;
559 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); 585 status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
560 586
561 if (status < 0) 587 if (status < 0)
562 break; 588 break;
563 nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); 589 pglen = status;
564 } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); 590 status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
591 if (status < 0) {
592 if (status == -ENOSPC)
593 status = 0;
594 break;
595 }
596 } while (array->eof_index < 0);
565 597
566 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 598 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
567out_release_array: 599out_release_array:
@@ -582,8 +614,10 @@ static
582int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) 614int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
583{ 615{
584 struct inode *inode = desc->file->f_path.dentry->d_inode; 616 struct inode *inode = desc->file->f_path.dentry->d_inode;
617 int ret;
585 618
586 if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) 619 ret = nfs_readdir_xdr_to_array(desc, page, inode);
620 if (ret < 0)
587 goto error; 621 goto error;
588 SetPageUptodate(page); 622 SetPageUptodate(page);
589 623
@@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
595 return 0; 629 return 0;
596 error: 630 error:
597 unlock_page(page); 631 unlock_page(page);
598 return -EIO; 632 return ret;
599} 633}
600 634
601static 635static
602void cache_page_release(nfs_readdir_descriptor_t *desc) 636void cache_page_release(nfs_readdir_descriptor_t *desc)
603{ 637{
638 if (!desc->page->mapping)
639 nfs_readdir_clear_array(desc->page);
604 page_cache_release(desc->page); 640 page_cache_release(desc->page);
605 desc->page = NULL; 641 desc->page = NULL;
606} 642}
@@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
608static 644static
609struct page *get_cache_page(nfs_readdir_descriptor_t *desc) 645struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
610{ 646{
611 struct page *page; 647 return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
612 page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
613 desc->page_index, (filler_t *)nfs_readdir_filler, desc); 648 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
614 if (IS_ERR(page))
615 desc->eof = 1;
616 return page;
617} 649}
618 650
619/* 651/*
@@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
629 return PTR_ERR(desc->page); 661 return PTR_ERR(desc->page);
630 662
631 res = nfs_readdir_search_array(desc); 663 res = nfs_readdir_search_array(desc);
632 if (res == 0) 664 if (res != 0)
633 return 0; 665 cache_page_release(desc);
634 cache_page_release(desc);
635 return res; 666 return res;
636} 667}
637 668
@@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)
639static inline 670static inline
640int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) 671int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
641{ 672{
642 int res = -EAGAIN; 673 int res;
643 674
644 while (1) { 675 if (desc->page_index == 0) {
645 res = find_cache_page(desc); 676 desc->current_index = 0;
646 if (res != -EAGAIN) 677 desc->last_cookie = 0;
647 break;
648 desc->page_index++;
649 } 678 }
679 do {
680 res = find_cache_page(desc);
681 } while (res == -EAGAIN);
650 return res; 682 return res;
651} 683}
652 684
653static inline unsigned int dt_type(struct inode *inode)
654{
655 return (inode->i_mode >> 12) & 15;
656}
657
658/* 685/*
659 * Once we've found the start of the dirent within a page: fill 'er up... 686 * Once we've found the start of the dirent within a page: fill 'er up...
660 */ 687 */
@@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
666 int i = 0; 693 int i = 0;
667 int res = 0; 694 int res = 0;
668 struct nfs_cache_array *array = NULL; 695 struct nfs_cache_array *array = NULL;
669 unsigned int d_type = DT_UNKNOWN;
670 struct dentry *dentry = NULL;
671 696
672 array = nfs_readdir_get_array(desc->page); 697 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) {
699 res = PTR_ERR(array);
700 goto out;
701 }
673 702
674 for (i = desc->cache_entry_index; i < array->size; i++) { 703 for (i = desc->cache_entry_index; i < array->size; i++) {
675 d_type = DT_UNKNOWN; 704 struct nfs_cache_array_entry *ent;
676 705
677 res = filldir(dirent, array->array[i].string.name, 706 ent = &array->array[i];
678 array->array[i].string.len, file->f_pos, 707 if (filldir(dirent, ent->string.name, ent->string.len,
679 nfs_compat_user_ino64(array->array[i].ino), d_type); 708 file->f_pos, nfs_compat_user_ino64(ent->ino),
680 if (res < 0) 709 ent->d_type) < 0) {
710 desc->eof = 1;
681 break; 711 break;
712 }
682 file->f_pos++; 713 file->f_pos++;
683 desc->cache_entry_index = i;
684 if (i < (array->size-1)) 714 if (i < (array->size-1))
685 *desc->dir_cookie = array->array[i+1].cookie; 715 *desc->dir_cookie = array->array[i+1].cookie;
686 else 716 else
687 *desc->dir_cookie = array->last_cookie; 717 *desc->dir_cookie = array->last_cookie;
688 if (i == array->eof_index) {
689 desc->eof = 1;
690 break;
691 }
692 } 718 }
719 if (array->eof_index >= 0)
720 desc->eof = 1;
693 721
694 nfs_readdir_release_array(desc->page); 722 nfs_readdir_release_array(desc->page);
723out:
695 cache_page_release(desc); 724 cache_page_release(desc);
696 if (dentry != NULL)
697 dput(dentry);
698 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 725 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
699 (unsigned long long)*desc->dir_cookie, res); 726 (unsigned long long)*desc->dir_cookie, res);
700 return res; 727 return res;
@@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
729 goto out; 756 goto out;
730 } 757 }
731 758
732 if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) {
733 status = -EIO;
734 goto out_release;
735 }
736
737 desc->page_index = 0; 759 desc->page_index = 0;
760 desc->last_cookie = *desc->dir_cookie;
738 desc->page = page; 761 desc->page = page;
762
763 status = nfs_readdir_xdr_to_array(desc, page, inode);
764 if (status < 0)
765 goto out_release;
766
739 status = nfs_do_filldir(desc, dirent, filldir); 767 status = nfs_do_filldir(desc, dirent, filldir);
740 768
741 out: 769 out:
@@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
757 struct inode *inode = dentry->d_inode; 785 struct inode *inode = dentry->d_inode;
758 nfs_readdir_descriptor_t my_desc, 786 nfs_readdir_descriptor_t my_desc,
759 *desc = &my_desc; 787 *desc = &my_desc;
760 int res = -ENOMEM; 788 int res;
761 789
762 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
763 dentry->d_parent->d_name.name, dentry->d_name.name, 791 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
782 if (res < 0) 810 if (res < 0)
783 goto out; 811 goto out;
784 812
785 while (desc->eof != 1) { 813 do {
786 res = readdir_search_pagecache(desc); 814 res = readdir_search_pagecache(desc);
787 815
788 if (res == -EBADCOOKIE) { 816 if (res == -EBADCOOKIE) {
817 res = 0;
789 /* This means either end of directory */ 818 /* This means either end of directory */
790 if (*desc->dir_cookie && desc->eof == 0) { 819 if (*desc->dir_cookie && desc->eof == 0) {
791 /* Or that the server has 'lost' a cookie */ 820 /* Or that the server has 'lost' a cookie */
792 res = uncached_readdir(desc, dirent, filldir); 821 res = uncached_readdir(desc, dirent, filldir);
793 if (res >= 0) 822 if (res == 0)
794 continue; 823 continue;
795 } 824 }
796 res = 0;
797 break; 825 break;
798 } 826 }
799 if (res == -ETOOSMALL && desc->plus) { 827 if (res == -ETOOSMALL && desc->plus) {
@@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
808 break; 836 break;
809 837
810 res = nfs_do_filldir(desc, dirent, filldir); 838 res = nfs_do_filldir(desc, dirent, filldir);
811 if (res < 0) { 839 if (res < 0)
812 res = 0;
813 break; 840 break;
814 } 841 } while (!desc->eof);
815 }
816out: 842out:
817 nfs_unblock_sillyrename(dentry); 843 nfs_unblock_sillyrename(dentry);
818 if (res > 0) 844 if (res > 0)
@@ -912,7 +938,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
912 * component of the path. 938 * component of the path.
913 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. 939 * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT.
914 */ 940 */
915static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) 941static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd,
942 unsigned int mask)
916{ 943{
917 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) 944 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT))
918 return 0; 945 return 0;
@@ -992,7 +1019,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
992 * If the parent directory is seen to have changed, we throw out the 1019 * If the parent directory is seen to have changed, we throw out the
993 * cached dentry and do a new lookup. 1020 * cached dentry and do a new lookup.
994 */ 1021 */
995static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) 1022static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
996{ 1023{
997 struct inode *dir; 1024 struct inode *dir;
998 struct inode *inode; 1025 struct inode *inode;
@@ -1001,6 +1028,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
1001 struct nfs_fattr *fattr = NULL; 1028 struct nfs_fattr *fattr = NULL;
1002 int error; 1029 int error;
1003 1030
1031 if (nd->flags & LOOKUP_RCU)
1032 return -ECHILD;
1033
1004 parent = dget_parent(dentry); 1034 parent = dget_parent(dentry);
1005 dir = parent->d_inode; 1035 dir = parent->d_inode;
1006 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1036 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
@@ -1091,7 +1121,7 @@ out_error:
1091/* 1121/*
1092 * This is called from dput() when d_count is going to 0. 1122 * This is called from dput() when d_count is going to 0.
1093 */ 1123 */
1094static int nfs_dentry_delete(struct dentry *dentry) 1124static int nfs_dentry_delete(const struct dentry *dentry)
1095{ 1125{
1096 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", 1126 dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n",
1097 dentry->d_parent->d_name.name, dentry->d_name.name, 1127 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -1162,7 +1192,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1162 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 1192 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
1163 goto out; 1193 goto out;
1164 1194
1165 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 1195 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1166 1196
1167 /* 1197 /*
1168 * If we're doing an exclusive create, optimize away the lookup 1198 * If we're doing an exclusive create, optimize away the lookup
@@ -1307,7 +1337,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1307 res = ERR_PTR(-ENAMETOOLONG); 1337 res = ERR_PTR(-ENAMETOOLONG);
1308 goto out; 1338 goto out;
1309 } 1339 }
1310 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 1340 d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
1311 1341
1312 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1342 /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
1313 * the dentry. */ 1343 * the dentry. */
@@ -1345,12 +1375,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1345 res = NULL; 1375 res = NULL;
1346 goto out; 1376 goto out;
1347 /* This turned out not to be a regular file */ 1377 /* This turned out not to be a regular file */
1348 case -EISDIR:
1349 case -ENOTDIR: 1378 case -ENOTDIR:
1350 goto no_open; 1379 goto no_open;
1351 case -ELOOP: 1380 case -ELOOP:
1352 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1381 if (!(nd->intent.open.flags & O_NOFOLLOW))
1353 goto no_open; 1382 goto no_open;
1383 /* case -EISDIR: */
1354 /* case -EINVAL: */ 1384 /* case -EINVAL: */
1355 default: 1385 default:
1356 res = ERR_CAST(inode); 1386 res = ERR_CAST(inode);
@@ -1692,11 +1722,9 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1692 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, 1722 dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
1693 dir->i_ino, dentry->d_name.name); 1723 dir->i_ino, dentry->d_name.name);
1694 1724
1695 spin_lock(&dcache_lock);
1696 spin_lock(&dentry->d_lock); 1725 spin_lock(&dentry->d_lock);
1697 if (atomic_read(&dentry->d_count) > 1) { 1726 if (dentry->d_count > 1) {
1698 spin_unlock(&dentry->d_lock); 1727 spin_unlock(&dentry->d_lock);
1699 spin_unlock(&dcache_lock);
1700 /* Start asynchronous writeout of the inode */ 1728 /* Start asynchronous writeout of the inode */
1701 write_inode_now(dentry->d_inode, 0); 1729 write_inode_now(dentry->d_inode, 0);
1702 error = nfs_sillyrename(dir, dentry); 1730 error = nfs_sillyrename(dir, dentry);
@@ -1707,7 +1735,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1707 need_rehash = 1; 1735 need_rehash = 1;
1708 } 1736 }
1709 spin_unlock(&dentry->d_lock); 1737 spin_unlock(&dentry->d_lock);
1710 spin_unlock(&dcache_lock);
1711 error = nfs_safe_remove(dentry); 1738 error = nfs_safe_remove(dentry);
1712 if (!error || error == -ENOENT) { 1739 if (!error || error == -ENOENT) {
1713 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1740 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -1842,7 +1869,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1842 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1869 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1843 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1870 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1844 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1871 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1845 atomic_read(&new_dentry->d_count)); 1872 new_dentry->d_count);
1846 1873
1847 /* 1874 /*
1848 * For non-directories, check whether the target is busy and if so, 1875 * For non-directories, check whether the target is busy and if so,
@@ -1860,7 +1887,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1860 rehash = new_dentry; 1887 rehash = new_dentry;
1861 } 1888 }
1862 1889
1863 if (atomic_read(&new_dentry->d_count) > 2) { 1890 if (new_dentry->d_count > 2) {
1864 int err; 1891 int err;
1865 1892
1866 /* copy the target dentry's name */ 1893 /* copy the target dentry's name */
@@ -2162,11 +2189,14 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
2162 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 2189 return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
2163} 2190}
2164 2191
2165int nfs_permission(struct inode *inode, int mask) 2192int nfs_permission(struct inode *inode, int mask, unsigned int flags)
2166{ 2193{
2167 struct rpc_cred *cred; 2194 struct rpc_cred *cred;
2168 int res = 0; 2195 int res = 0;
2169 2196
2197 if (flags & IPERM_FLAG_RCU)
2198 return -ECHILD;
2199
2170 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2200 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2171 2201
2172 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2202 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2214,7 +2244,7 @@ out:
2214out_notsup: 2244out_notsup:
2215 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2245 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2216 if (res == 0) 2246 if (res == 0)
2217 res = generic_permission(inode, mask, NULL); 2247 res = generic_permission(inode, mask, flags, NULL);
2218 goto out; 2248 goto out;
2219} 2249}
2220 2250
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 84d3c8b90206..e6ace0d93c71 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
867 goto out; 867 goto out;
868 nfs_alloc_commit_data(dreq); 868 nfs_alloc_commit_data(dreq);
869 869
870 if (dreq->commit_data == NULL || count < wsize) 870 if (dreq->commit_data == NULL || count <= wsize)
871 sync = NFS_FILE_SYNC; 871 sync = NFS_FILE_SYNC;
872 872
873 dreq->inode = inode; 873 dreq->inode = inode;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 60677f9f1311..7bf029ef4084 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
693{ 693{
694 struct inode *inode = filp->f_mapping->host; 694 struct inode *inode = filp->f_mapping->host;
695 int status = 0; 695 int status = 0;
696 unsigned int saved_type = fl->fl_type;
696 697
697 /* Try local locking first */ 698 /* Try local locking first */
698 posix_test_lock(filp, fl); 699 posix_test_lock(filp, fl);
@@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
700 /* found a conflict */ 701 /* found a conflict */
701 goto out; 702 goto out;
702 } 703 }
704 fl->fl_type = saved_type;
703 705
704 if (nfs_have_delegation(inode, FMODE_READ)) 706 if (nfs_have_delegation(inode, FMODE_READ))
705 goto out_noconflict; 707 goto out_noconflict;
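/*
 * Illustrative aside, not from the patch: why fl_type is saved and
 * restored above.  posix_test_lock() signals "no local conflict" by
 * rewriting fl->fl_type to F_UNLCK, so the original request type has to
 * be put back before the very same file_lock is forwarded to the server
 * for the remote F_GETLK.  Shape of the fix in isolation:
 */
static void example_getlk(struct file *filp, struct file_lock *fl)
{
	unsigned int saved_type = fl->fl_type;

	posix_test_lock(filp, fl);		/* may overwrite fl->fl_type */
	if (fl->fl_type != F_UNLCK)
		return;				/* a local lock already conflicts */
	fl->fl_type = saved_type;		/* restore before asking the server */
	/* ... the NFS-specific GETLK call would follow here ... */
}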
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index ac7b814ce162..5596c6a2881e 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -63,9 +63,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
63 * This again causes shrink_dcache_for_umount_subtree() to 63 * This again causes shrink_dcache_for_umount_subtree() to
64 * Oops, since the test for IS_ROOT() will fail. 64 * Oops, since the test for IS_ROOT() will fail.
65 */ 65 */
66 spin_lock(&dcache_lock); 66 spin_lock(&sb->s_root->d_inode->i_lock);
67 spin_lock(&sb->s_root->d_lock);
67 list_del_init(&sb->s_root->d_alias); 68 list_del_init(&sb->s_root->d_alias);
68 spin_unlock(&dcache_lock); 69 spin_unlock(&sb->s_root->d_lock);
70 spin_unlock(&sb->s_root->d_inode->i_lock);
69 } 71 }
70 return 0; 72 return 0;
71} 73}
@@ -119,7 +121,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 security_d_instantiate(ret, inode); 121 security_d_instantiate(ret, inode);
120 122
121 if (ret->d_op == NULL) 123 if (ret->d_op == NULL)
122 ret->d_op = server->nfs_client->rpc_ops->dentry_ops; 124 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
123out: 125out:
124 nfs_free_fattr(fsinfo.fattr); 126 nfs_free_fattr(fsinfo.fattr);
125 return ret; 127 return ret;
@@ -226,7 +228,7 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
226 security_d_instantiate(ret, inode); 228 security_d_instantiate(ret, inode);
227 229
228 if (ret->d_op == NULL) 230 if (ret->d_op == NULL)
229 ret->d_op = server->nfs_client->rpc_ops->dentry_ops; 231 d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
230 232
231out: 233out:
232 nfs_free_fattr(fattr); 234 nfs_free_fattr(fattr);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 314f57164602..017daa3bed38 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
289 } else if (S_ISDIR(inode->i_mode)) { 289 } else if (S_ISDIR(inode->i_mode)) {
290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; 290 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
291 inode->i_fop = &nfs_dir_operations; 291 inode->i_fop = &nfs_dir_operations;
292 inode->i_data.a_ops = &nfs_dir_aops;
292 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
293 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
294 /* Deal with crossing mountpoints */ 295 /* Deal with crossing mountpoints */
@@ -1437,11 +1438,18 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1437 return &nfsi->vfs_inode; 1438 return &nfsi->vfs_inode;
1438} 1439}
1439 1440
1440void nfs_destroy_inode(struct inode *inode) 1441static void nfs_i_callback(struct rcu_head *head)
1441{ 1442{
1443 struct inode *inode = container_of(head, struct inode, i_rcu);
1444 INIT_LIST_HEAD(&inode->i_dentry);
1442 kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); 1445 kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
1443} 1446}
1444 1447
1448void nfs_destroy_inode(struct inode *inode)
1449{
1450 call_rcu(&inode->i_rcu, nfs_i_callback);
1451}
1452
1445static inline void nfs4_init_once(struct nfs_inode *nfsi) 1453static inline void nfs4_init_once(struct nfs_inode *nfsi)
1446{ 1454{
1447#ifdef CONFIG_NFS_V4 1455#ifdef CONFIG_NFS_V4
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index db08ff3ff454..e6356b750b77 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page)
362} 362}
363 363
364/* 364/*
365 * Convert a umode to a dirent->d_type
366 */
367static inline
368unsigned char nfs_umode_to_dtype(umode_t mode)
369{
370 return (mode >> 12) & 15;
371}
372
373/*
365 * Determine the number of pages in an array of length 'len' and 374 * Determine the number of pages in an array of length 'len' and
366 * with a base offset of 'base' 375 * with a base offset of 'base'
367 */ 376 */
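/*
 * Illustrative aside, not from the patch: nfs_umode_to_dtype() above
 * works because the S_IFMT nibble of the mode uses the same encoding as
 * the DT_* dirent types.  Two worked values:
 *
 *	S_IFREG = 0100000  ->  0100000 >> 12 = 010 (octal) = 8 = DT_REG
 *	S_IFDIR = 0040000  ->  0040000 >> 12 = 004 (octal) = 4 = DT_DIR
 */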
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index eceafe74f473..4f981f1f6689 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
505 505
506static struct rpc_version mnt_version1 = { 506static struct rpc_version mnt_version1 = {
507 .number = 1, 507 .number = 1,
508 .nrprocs = 2, 508 .nrprocs = ARRAY_SIZE(mnt_procedures),
509 .procs = mnt_procedures, 509 .procs = mnt_procedures,
510}; 510};
511 511
512static struct rpc_version mnt_version3 = { 512static struct rpc_version mnt_version3 = {
513 .number = 3, 513 .number = 3,
514 .nrprocs = 2, 514 .nrprocs = ARRAY_SIZE(mnt3_procedures),
515 .procs = mnt3_procedures, 515 .procs = mnt3_procedures,
516}; 516};
517 517
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa3673cf3..74aaf3963c10 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,12 +49,17 @@ char *nfs_path(const char *base,
49 const struct dentry *dentry, 49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen) 50 char *buffer, ssize_t buflen)
51{ 51{
52 char *end = buffer+buflen; 52 char *end;
53 int namelen; 53 int namelen;
54 unsigned seq;
54 55
56rename_retry:
57 end = buffer+buflen;
55 *--end = '\0'; 58 *--end = '\0';
56 buflen--; 59 buflen--;
57 spin_lock(&dcache_lock); 60
61 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock();
58 while (!IS_ROOT(dentry) && dentry != droot) { 63 while (!IS_ROOT(dentry) && dentry != droot) {
59 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
60 buflen -= namelen + 1; 65 buflen -= namelen + 1;
@@ -65,7 +70,9 @@ char *nfs_path(const char *base,
65 *--end = '/'; 70 *--end = '/';
66 dentry = dentry->d_parent; 71 dentry = dentry->d_parent;
67 } 72 }
68 spin_unlock(&dcache_lock); 73 rcu_read_unlock();
74 if (read_seqretry(&rename_lock, seq))
75 goto rename_retry;
69 if (*end != '/') { 76 if (*end != '/') {
70 if (--buflen < 0) 77 if (--buflen < 0)
71 goto Elong; 78 goto Elong;
@@ -82,7 +89,9 @@ char *nfs_path(const char *base,
82 memcpy(end, base, namelen); 89 memcpy(end, base, namelen);
83 return end; 90 return end;
84Elong_unlock: 91Elong_unlock:
85 spin_unlock(&dcache_lock); 92 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry;
86Elong: 95Elong:
87 return ERR_PTR(-ENAMETOOLONG); 96 return ERR_PTR(-ENAMETOOLONG);
88} 97}
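The nfs_path() rework above drops dcache_lock in favour of an RCU walk validated against rename_lock, restarting from the top whenever a rename raced with the walk. A minimal userspace analogue of that seqcount read/retry shape, using C11 atomics (single writer assumed; read_begin/read_retry/write_update are invented names for illustration, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;          /* even = stable, odd = write in progress */
static int value_a, value_b;     /* data protected by the sequence counter */

static unsigned read_begin(void)
{
        unsigned s;

        while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1)
                ;                /* writer active: wait for an even count */
        return s;
}

static int read_retry(unsigned start)
{
        atomic_thread_fence(memory_order_acquire);
        return atomic_load_explicit(&seq, memory_order_relaxed) != start;
}

static void write_update(int a, int b)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* odd */
        value_a = a;
        value_b = b;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release); /* even */
}

int main(void)
{
        unsigned s;
        int a, b;

        write_update(1, 2);
        do {                      /* same shape as the rename_retry loop */
                s = read_begin();
                a = value_a;
                b = value_b;
        } while (read_retry(s));
        printf("read %d %d\n", a, b);
        return 0;
}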
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index e6bf45710cc7..5914a1911c95 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
423 struct page **page; 423 struct page **page;
424 size_t hdrlen; 424 size_t hdrlen;
425 unsigned int pglen, recvd; 425 unsigned int pglen, recvd;
426 int status, nr = 0; 426 int status;
427 427
428 if ((status = ntohl(*p++))) 428 if ((status = ntohl(*p++)))
429 return nfs_stat_to_errno(status); 429 return nfs_stat_to_errno(status);
@@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
443 if (pglen > recvd) 443 if (pglen > recvd)
444 pglen = recvd; 444 pglen = recvd;
445 page = rcvbuf->pages; 445 page = rcvbuf->pages;
446 return nr; 446 return pglen;
447} 447}
448 448
449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 449static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
485 entry->prev_cookie = entry->cookie; 485 entry->prev_cookie = entry->cookie;
486 entry->cookie = ntohl(*p++); 486 entry->cookie = ntohl(*p++);
487 487
488 entry->d_type = DT_UNKNOWN;
489
488 p = xdr_inline_peek(xdr, 8); 490 p = xdr_inline_peek(xdr, 8);
489 if (p != NULL) 491 if (p != NULL)
490 entry->eof = !p[0] && p[1]; 492 entry->eof = !p[0] && p[1];
@@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se
495 497
496out_overflow: 498out_overflow:
497 print_overflow_msg(__func__, xdr); 499 print_overflow_msg(__func__, xdr);
498 return ERR_PTR(-EIO); 500 return ERR_PTR(-EAGAIN);
499} 501}
500 502
501/* 503/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d9a5e832c257..f6cc60f06dac 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
555 struct page **page; 555 struct page **page;
556 size_t hdrlen; 556 size_t hdrlen;
557 u32 recvd, pglen; 557 u32 recvd, pglen;
558 int status, nr = 0; 558 int status;
559 559
560 status = ntohl(*p++); 560 status = ntohl(*p++);
561 /* Decode post_op_attrs */ 561 /* Decode post_op_attrs */
@@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res
586 pglen = recvd; 586 pglen = recvd;
587 page = rcvbuf->pages; 587 page = rcvbuf->pages;
588 588
589 return nr; 589 return pglen;
590} 590}
591 591
592__be32 * 592__be32 *
@@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
622 entry->prev_cookie = entry->cookie; 622 entry->prev_cookie = entry->cookie;
623 p = xdr_decode_hyper(p, &entry->cookie); 623 p = xdr_decode_hyper(p, &entry->cookie);
624 624
625 entry->d_type = DT_UNKNOWN;
625 if (plus) { 626 if (plus) {
626 entry->fattr->valid = 0; 627 entry->fattr->valid = 0;
627 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); 628 p = xdr_decode_post_op_attr_stream(xdr, entry->fattr);
628 if (IS_ERR(p)) 629 if (IS_ERR(p))
629 goto out_overflow_exit; 630 goto out_overflow_exit;
631 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
630 /* In fact, a post_op_fh3: */ 632 /* In fact, a post_op_fh3: */
631 p = xdr_inline_decode(xdr, 4); 633 p = xdr_inline_decode(xdr, 4);
632 if (unlikely(!p)) 634 if (unlikely(!p))
@@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s
656out_overflow: 658out_overflow:
657 print_overflow_msg(__func__, xdr); 659 print_overflow_msg(__func__, xdr);
658out_overflow_exit: 660out_overflow_exit:
659 return ERR_PTR(-EIO); 661 return ERR_PTR(-EAGAIN);
660} 662}
661 663
662/* 664/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0f24cdf2cb13..4435e5e1f904 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2852 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2852 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2853 res.pgbase = args.pgbase; 2853 res.pgbase = args.pgbase;
2854 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2854 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0);
2855 if (status == 0) 2855 if (status >= 0) {
2856 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2856 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2857 status += args.pgbase;
2858 }
2857 2859
2858 nfs_invalidate_atime(dir); 2860 nfs_invalidate_atime(dir);
2859 2861
@@ -3359,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3359 ret = nfs_revalidate_inode(server, inode); 3361 ret = nfs_revalidate_inode(server, inode);
3360 if (ret < 0) 3362 if (ret < 0)
3361 return ret; 3363 return ret;
3364 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
3365 nfs_zap_acl_cache(inode);
3362 ret = nfs4_read_cached_acl(inode, buf, buflen); 3366 ret = nfs4_read_cached_acl(inode, buf, buflen);
3363 if (ret != -ENOENT) 3367 if (ret != -ENOENT)
3364 return ret; 3368 return ret;
@@ -3387,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3387 nfs_inode_return_delegation(inode); 3391 nfs_inode_return_delegation(inode);
3388 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3392 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3389 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3393 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3394 /*
3395 * Acl update can result in inode attribute update.
3396 * so mark the attribute cache invalid.
3397 */
3398 spin_lock(&inode->i_lock);
3399 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
3400 spin_unlock(&inode->i_lock);
3390 nfs_access_zap_cache(inode); 3401 nfs_access_zap_cache(inode);
3391 nfs_zap_acl_cache(inode); 3402 nfs_zap_acl_cache(inode);
3392 return ret; 3403 return ret;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f313c4cce7e4..9f1826b012e6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4518 xdr_read_pages(xdr, pglen); 4518 xdr_read_pages(xdr, pglen);
4519 4519
4520 4520
4521 return 0; 4521 return pglen;
4522} 4522}
4523 4523
4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) 4524static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
@@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) 6208 if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)
6209 entry->ino = entry->fattr->fileid; 6209 entry->ino = entry->fattr->fileid;
6210 6210
6211 entry->d_type = DT_UNKNOWN;
6212 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6213 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6214
6211 if (verify_attr_len(xdr, p, len) < 0) 6215 if (verify_attr_len(xdr, p, len) < 0)
6212 goto out_overflow; 6216 goto out_overflow;
6213 6217
@@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6221 6225
6222out_overflow: 6226out_overflow:
6223 print_overflow_msg(__func__, xdr); 6227 print_overflow_msg(__func__, xdr);
6224 return ERR_PTR(-EIO); 6228 return ERR_PTR(-EAGAIN);
6225} 6229}
6226 6230
6227/* 6231/*
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 137b549e63db..b68536cc9046 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
115{ 115{
116 if (!nfs_lock_request_dontget(req)) 116 if (!nfs_lock_request_dontget(req))
117 return 0; 117 return 0;
118 if (req->wb_page != NULL) 118 if (test_bit(PG_MAPPED, &req->wb_flags))
119 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 119 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
120 return 1; 120 return 1;
121} 121}
@@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
125 */ 125 */
126void nfs_clear_page_tag_locked(struct nfs_page *req) 126void nfs_clear_page_tag_locked(struct nfs_page *req)
127{ 127{
128 if (req->wb_page != NULL) { 128 if (test_bit(PG_MAPPED, &req->wb_flags)) {
129 struct inode *inode = req->wb_context->path.dentry->d_inode; 129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode); 130 struct nfs_inode *nfsi = NFS_I(inode);
131 131
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4b62c6f5a6e..aedcaa7f291f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req)
152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), 152 (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
153 req->wb_bytes, 153 req->wb_bytes,
154 (long long)req_offset(req)); 154 (long long)req_offset(req));
155 nfs_clear_request(req);
156 nfs_release_request(req); 155 nfs_release_request(req);
157} 156}
158 157
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0a42e8f4adcb..4100630c9a5b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -39,7 +39,6 @@
39#include <linux/nfs_mount.h> 39#include <linux/nfs_mount.h>
40#include <linux/nfs4_mount.h> 40#include <linux/nfs4_mount.h>
41#include <linux/lockd/bind.h> 41#include <linux/lockd/bind.h>
42#include <linux/smp_lock.h>
43#include <linux/seq_file.h> 42#include <linux/seq_file.h>
44#include <linux/mount.h> 43#include <linux/mount.h>
45#include <linux/mnt_namespace.h> 44#include <linux/mnt_namespace.h>
@@ -67,6 +66,12 @@
67 66
68#define NFSDBG_FACILITY NFSDBG_VFS 67#define NFSDBG_FACILITY NFSDBG_VFS
69 68
69#ifdef CONFIG_NFS_V3
70#define NFS_DEFAULT_VERSION 3
71#else
72#define NFS_DEFAULT_VERSION 2
73#endif
74
70enum { 75enum {
71 /* Mount options that take no arguments */ 76 /* Mount options that take no arguments */
72 Opt_soft, Opt_hard, 77 Opt_soft, Opt_hard,
@@ -1064,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw,
1064 mnt->flags |= NFS_MOUNT_VER3; 1069 mnt->flags |= NFS_MOUNT_VER3;
1065 mnt->version = 3; 1070 mnt->version = 3;
1066 break; 1071 break;
1067#ifdef CONFIG_NFS_V4
1068 case Opt_v4: 1072 case Opt_v4:
1069 mnt->flags &= ~NFS_MOUNT_VER3; 1073 mnt->flags &= ~NFS_MOUNT_VER3;
1070 mnt->version = 4; 1074 mnt->version = 4;
1071 break; 1075 break;
1072#endif
1073 case Opt_udp: 1076 case Opt_udp:
1074 mnt->flags &= ~NFS_MOUNT_TCP; 1077 mnt->flags &= ~NFS_MOUNT_TCP;
1075 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1078 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1281,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw,
1281 mnt->flags |= NFS_MOUNT_VER3; 1284 mnt->flags |= NFS_MOUNT_VER3;
1282 mnt->version = 3; 1285 mnt->version = 3;
1283 break; 1286 break;
1284#ifdef CONFIG_NFS_V4
1285 case NFS4_VERSION: 1287 case NFS4_VERSION:
1286 mnt->flags &= ~NFS_MOUNT_VER3; 1288 mnt->flags &= ~NFS_MOUNT_VER3;
1287 mnt->version = 4; 1289 mnt->version = 4;
1288 break; 1290 break;
1289#endif
1290 default: 1291 default:
1291 goto out_invalid_value; 1292 goto out_invalid_value;
1292 } 1293 }
@@ -2277,7 +2278,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2277 }; 2278 };
2278 int error = -ENOMEM; 2279 int error = -ENOMEM;
2279 2280
2280 data = nfs_alloc_parsed_mount_data(3); 2281 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2281 mntfh = nfs_alloc_fhandle(); 2282 mntfh = nfs_alloc_fhandle();
2282 if (data == NULL || mntfh == NULL) 2283 if (data == NULL || mntfh == NULL)
2283 goto out_free_fh; 2284 goto out_free_fh;
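The super.c changes above make the mount default follow the build configuration: version 3 when NFS v3 support is compiled in, otherwise version 2, with the v4 mount option no longer hidden behind CONFIG_NFS_V4. The same compile-time default selection, as a trivial standalone sketch (HAVE_V3 is a made-up macro standing in for the config option):

#include <stdio.h>

#ifdef HAVE_V3                  /* stand-in for CONFIG_NFS_V3 */
#define DEFAULT_VERSION 3
#else
#define DEFAULT_VERSION 2
#endif

int main(void)
{
        /* Build with -DHAVE_V3 to get 3, without it to get 2. */
        printf("default protocol version: %d\n", DEFAULT_VERSION);
        return 0;
}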
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 7bdec8531400..8fe9eb47a97f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -496,7 +496,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
496 496
497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 497 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
498 dentry->d_parent->d_name.name, dentry->d_name.name, 498 dentry->d_parent->d_name.name, dentry->d_name.name,
499 atomic_read(&dentry->d_count)); 499 dentry->d_count);
500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 500 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
501 501
502 /* 502 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4c14c17a5276..10d648ea128b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
390 if (nfs_have_delegation(inode, FMODE_WRITE)) 390 if (nfs_have_delegation(inode, FMODE_WRITE))
391 nfsi->change_attr++; 391 nfsi->change_attr++;
392 } 392 }
393 set_bit(PG_MAPPED, &req->wb_flags);
393 SetPagePrivate(req->wb_page); 394 SetPagePrivate(req->wb_page);
394 set_page_private(req->wb_page, (unsigned long)req); 395 set_page_private(req->wb_page, (unsigned long)req);
395 nfsi->npages++; 396 nfsi->npages++;
@@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
415 spin_lock(&inode->i_lock); 416 spin_lock(&inode->i_lock);
416 set_page_private(req->wb_page, 0); 417 set_page_private(req->wb_page, 0);
417 ClearPagePrivate(req->wb_page); 418 ClearPagePrivate(req->wb_page);
419 clear_bit(PG_MAPPED, &req->wb_flags);
418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
419 nfsi->npages--; 421 nfsi->npages--;
420 if (!nfsi->npages) { 422 if (!nfsi->npages) {
@@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
422 iput(inode); 424 iput(inode);
423 } else 425 } else
424 spin_unlock(&inode->i_lock); 426 spin_unlock(&inode->i_lock);
425 nfs_clear_request(req);
426 nfs_release_request(req); 427 nfs_release_request(req);
427} 428}
428 429
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0af2a9..7e84a852cdae 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, 260 err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
261 &fhp->fh_post_attr); 261 &fhp->fh_post_attr);
262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; 262 fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
263 if (err) 263 if (err) {
264 fhp->fh_post_saved = 0; 264 fhp->fh_post_saved = 0;
265 else 265 /* Grab the ctime anyway - set_change_info might use it */
266 fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
267 } else
266 fhp->fh_post_saved = 1; 268 fhp->fh_post_saved = 1;
267} 269}
268 270
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f1e5ec6b5105..fbd18c3074bb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses)
673 spin_unlock(&clp->cl_lock); 673 spin_unlock(&clp->cl_lock);
674} 674}
675 675
676static void nfsd4_register_conn(struct nfsd4_conn *conn) 676static int nfsd4_register_conn(struct nfsd4_conn *conn)
677{ 677{
678 conn->cn_xpt_user.callback = nfsd4_conn_lost; 678 conn->cn_xpt_user.callback = nfsd4_conn_lost;
679 register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); 679 return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
680} 680}
681 681
682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) 682static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
683{ 683{
684 struct nfsd4_conn *conn; 684 struct nfsd4_conn *conn;
685 u32 flags = NFS4_CDFC4_FORE; 685 u32 flags = NFS4_CDFC4_FORE;
686 int ret;
686 687
687 if (ses->se_flags & SESSION4_BACK_CHAN) 688 if (ses->se_flags & SESSION4_BACK_CHAN)
688 flags |= NFS4_CDFC4_BACK; 689 flags |= NFS4_CDFC4_BACK;
@@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
690 if (!conn) 691 if (!conn)
691 return nfserr_jukebox; 692 return nfserr_jukebox;
692 nfsd4_hash_conn(conn, ses); 693 nfsd4_hash_conn(conn, ses);
693 nfsd4_register_conn(conn); 694 ret = nfsd4_register_conn(conn);
695 if (ret)
696 /* oops; xprt is already down: */
697 nfsd4_conn_lost(&conn->cn_xpt_user);
694 return nfs_ok; 698 return nfs_ok;
695} 699}
696 700
@@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1644{ 1648{
1645 struct nfs4_client *clp = ses->se_client; 1649 struct nfs4_client *clp = ses->se_client;
1646 struct nfsd4_conn *c; 1650 struct nfsd4_conn *c;
1651 int ret;
1647 1652
1648 spin_lock(&clp->cl_lock); 1653 spin_lock(&clp->cl_lock);
1649 c = __nfsd4_find_conn(new->cn_xprt, ses); 1654 c = __nfsd4_find_conn(new->cn_xprt, ses);
@@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1654 } 1659 }
1655 __nfsd4_hash_conn(new, ses); 1660 __nfsd4_hash_conn(new, ses);
1656 spin_unlock(&clp->cl_lock); 1661 spin_unlock(&clp->cl_lock);
1657 nfsd4_register_conn(new); 1662 ret = nfsd4_register_conn(new);
1663 if (ret)
1664 /* oops; xprt is already down: */
1665 nfsd4_conn_lost(&new->cn_xpt_user);
1658 return; 1666 return;
1659} 1667}
1660 1668
@@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2254 * Spawn a thread to perform a recall on the delegation represented 2262 * Spawn a thread to perform a recall on the delegation represented
2255 * by the lease (file_lock) 2263 * by the lease (file_lock)
2256 * 2264 *
2257 * Called from break_lease() with lock_kernel() held. 2265 * Called from break_lease() with lock_flocks() held.
2258 * Note: we assume break_lease will only call this *once* for any given 2266 * Note: we assume break_lease will only call this *once* for any given
2259 * lease. 2267 * lease.
2260 */ 2268 */
@@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2278 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2286 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2279 spin_unlock(&recall_lock); 2287 spin_unlock(&recall_lock);
2280 2288
2281 /* only place dl_time is set. protected by lock_kernel*/ 2289 /* only place dl_time is set. protected by lock_flocks*/
2282 dp->dl_time = get_seconds(); 2290 dp->dl_time = get_seconds();
2283 2291
2284 /* 2292 /*
@@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2295/* 2303/*
2296 * The file_lock is being reaped. 2304
2297 * 2305 *
2298 * Called by locks_free_lock() with lock_kernel() held. 2306 * Called by locks_free_lock() with lock_flocks() held.
2299 */ 2307 */
2300static 2308static
2301void nfsd_release_deleg_cb(struct file_lock *fl) 2309void nfsd_release_deleg_cb(struct file_lock *fl)
@@ -2310,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl)
2310} 2318}
2311 2319
2312/* 2320/*
2313 * Called from setlease() with lock_kernel() held 2321 * Called from setlease() with lock_flocks() held
2314 */ 2322 */
2315static 2323static
2316int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) 2324int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
@@ -4328,7 +4336,7 @@ __nfs4_state_shutdown(void)
4328void 4336void
4329nfs4_state_shutdown(void) 4337nfs4_state_shutdown(void)
4330{ 4338{
4331 cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); 4339 cancel_delayed_work_sync(&laundromat_work);
4332 destroy_workqueue(laundry_wq); 4340 destroy_workqueue(laundry_wq);
4333 locks_end_grace(&nfsd4_manager); 4341 locks_end_grace(&nfsd4_manager);
4334 nfs4_lock_state(); 4342 nfs4_lock_state();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 184938fcff04..3a359023c9f7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1756,8 +1756,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1756 goto out_dput_new; 1756 goto out_dput_new;
1757 1757
1758 if (svc_msnfs(ffhp) && 1758 if (svc_msnfs(ffhp) &&
1759 ((atomic_read(&odentry->d_count) > 1) 1759 ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
1760 || (atomic_read(&ndentry->d_count) > 1))) {
1761 host_err = -EPERM; 1760 host_err = -EPERM;
1762 goto out_dput_new; 1761 goto out_dput_new;
1763 } 1762 }
@@ -1843,7 +1842,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1843 if (type != S_IFDIR) { /* It's UNLINK */ 1842 if (type != S_IFDIR) { /* It's UNLINK */
1844#ifdef MSNFS 1843#ifdef MSNFS
1845 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1844 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1846 (atomic_read(&rdentry->d_count) > 1)) { 1845 (rdentry->d_count > 1)) {
1847 host_err = -EPERM; 1846 host_err = -EPERM;
1848 } else 1847 } else
1849#endif 1848#endif
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4d476ff08ae6..60fce3dc5cb5 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
484static inline void 484static inline void
485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) 485set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
486{ 486{
487 BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); 487 BUG_ON(!fhp->fh_pre_saved);
488 cinfo->atomic = 1; 488 cinfo->atomic = fhp->fh_post_saved;
489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); 489 cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
490 if (cinfo->change_supported) { 490
491 cinfo->before_change = fhp->fh_pre_change; 491 cinfo->before_change = fhp->fh_pre_change;
492 cinfo->after_change = fhp->fh_post_change; 492 cinfo->after_change = fhp->fh_post_change;
493 } else { 493 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
494 cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; 494 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
495 cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; 495 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
496 cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; 496 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
497 cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; 497
498 }
499} 498}
500 499
501int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); 500int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 49c844dab33a..59e5fe742f7b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
335 * the device at this point. 335 * the device at this point.
336 * 336 *
337 * To prevent nilfs_dat_translate() from returning the 337 * To prevent nilfs_dat_translate() from returning the
338 * uncommited block number, this makes a copy of the entry 338 * uncommitted block number, this makes a copy of the entry
339 * buffer and redirects nilfs_dat_translate() to the copy. 339 * buffer and redirects nilfs_dat_translate() to the copy.
340 */ 340 */
341 if (!buffer_nilfs_redirected(entry_bh)) { 341 if (!buffer_nilfs_redirected(entry_bh)) {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 33ad25ddd5c4..caf9a6a3fb54 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
176int nilfs_init_gcinode(struct inode *inode) 176int nilfs_init_gcinode(struct inode *inode)
177{ 177{
178 struct nilfs_inode_info *ii = NILFS_I(inode); 178 struct nilfs_inode_info *ii = NILFS_I(inode);
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 179
181 inode->i_mode = S_IFREG; 180 inode->i_mode = S_IFREG;
182 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 181 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
@@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode)
186 ii->i_flags = 0; 185 ii->i_flags = 0;
187 nilfs_bmap_init_gc(ii->i_bmap); 186 nilfs_bmap_init_gc(ii->i_bmap);
188 187
189 /*
190 * Add the inode to GC inode list. Garbage Collection
191 * is serialized and no two processes manipulate the
192 * list simultaneously.
193 */
194 igrab(inode);
195 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
196
197 return 0; 188 return 0;
198} 189}
199 190
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 71d4bc8464e0..77b48c8fab17 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -785,15 +785,19 @@ out_err:
785 return err; 785 return err;
786} 786}
787 787
788int nilfs_permission(struct inode *inode, int mask) 788int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
789{ 789{
790 struct nilfs_root *root = NILFS_I(inode)->i_root; 790 struct nilfs_root *root;
791
792 if (flags & IPERM_FLAG_RCU)
793 return -ECHILD;
791 794
795 root = NILFS_I(inode)->i_root;
792 if ((mask & MAY_WRITE) && root && 796 if ((mask & MAY_WRITE) && root &&
793 root->cno != NILFS_CPTREE_CURRENT_CNO) 797 root->cno != NILFS_CPTREE_CURRENT_CNO)
794 return -EROFS; /* snapshot is not writable */ 798 return -EROFS; /* snapshot is not writable */
795 799
796 return generic_permission(inode, mask, NULL); 800 return generic_permission(inode, mask, flags, NULL);
797} 801}
798 802
799int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, 803int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3e90f86d5bfe..b185e937a335 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
337 struct nilfs_argv *argv, void *buf) 337 struct nilfs_argv *argv, void *buf)
338{ 338{
339 size_t nmembs = argv->v_nmembs; 339 size_t nmembs = argv->v_nmembs;
340 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
340 struct inode *inode; 341 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 342 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 343 struct buffer_head *bh, *n;
@@ -349,10 +350,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
349 ino = vdesc->vd_ino; 350 ino = vdesc->vd_ino;
350 cno = vdesc->vd_cno; 351 cno = vdesc->vd_cno;
351 inode = nilfs_iget_for_gc(sb, ino, cno); 352 inode = nilfs_iget_for_gc(sb, ino, cno);
352 if (unlikely(inode == NULL)) { 353 if (IS_ERR(inode)) {
353 ret = -ENOMEM; 354 ret = PTR_ERR(inode);
354 goto failed; 355 goto failed;
355 } 356 }
357 if (list_empty(&NILFS_I(inode)->i_dirty)) {
358 /*
359 * Add the inode to GC inode list. Garbage Collection
360 * is serialized and no two processes manipulate the
361 * list simultaneously.
362 */
363 igrab(inode);
364 list_add(&NILFS_I(inode)->i_dirty,
365 &nilfs->ns_gc_inodes);
366 }
367
356 do { 368 do {
357 ret = nilfs_ioctl_move_inode_block(inode, vdesc, 369 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
358 &buffers); 370 &buffers);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f7560da5a567..0ca98823db59 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
256extern void nilfs_truncate(struct inode *); 256extern void nilfs_truncate(struct inode *);
257extern void nilfs_evict_inode(struct inode *); 257extern void nilfs_evict_inode(struct inode *);
258extern int nilfs_setattr(struct dentry *, struct iattr *); 258extern int nilfs_setattr(struct dentry *, struct iattr *);
259int nilfs_permission(struct inode *inode, int mask); 259int nilfs_permission(struct inode *inode, int mask, unsigned int flags);
260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, 260extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
261 struct buffer_head **); 261 struct buffer_head **);
262extern int nilfs_inode_dirty(struct inode *); 262extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d3..e2dcc9c733f7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -162,10 +162,13 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
162 return &ii->vfs_inode; 162 return &ii->vfs_inode;
163} 163}
164 164
165void nilfs_destroy_inode(struct inode *inode) 165static void nilfs_i_callback(struct rcu_head *head)
166{ 166{
167 struct inode *inode = container_of(head, struct inode, i_rcu);
167 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 168 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
168 169
170 INIT_LIST_HEAD(&inode->i_dentry);
171
169 if (mdi) { 172 if (mdi) {
170 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ 173 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
171 kfree(mdi); 174 kfree(mdi);
@@ -173,6 +176,11 @@ void nilfs_destroy_inode(struct inode *inode)
173 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 176 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
174} 177}
175 178
179void nilfs_destroy_inode(struct inode *inode)
180{
181 call_rcu(&inode->i_rcu, nilfs_i_callback);
182}
183
176static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 184static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
177{ 185{
178 struct the_nilfs *nilfs = sbi->s_nilfs; 186 struct the_nilfs *nilfs = sbi->s_nilfs;
@@ -838,7 +846,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
838 846
839static int nilfs_tree_was_touched(struct dentry *root_dentry) 847static int nilfs_tree_was_touched(struct dentry *root_dentry)
840{ 848{
841 return atomic_read(&root_dentry->d_count) > 1; 849 return root_dentry->d_count > 1;
842} 850}
843 851
844/** 852/**
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index b04f88eed09e..f35794b97e8e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
92 92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94 94
95 wait_event(group->fanotify_data.access_waitq, event->response); 95 wait_event(group->fanotify_data.access_waitq, event->response ||
96 atomic_read(&group->fanotify_data.bypass_perm));
97
98 if (!event->response) /* bypass_perm set */
99 return 0;
96 100
97 /* userspace responded, convert to something usable */ 101 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock); 102 spin_lock(&event->lock);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 063224812b7e..8b61220cffc5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
106 return client_fd; 106 return client_fd;
107} 107}
108 108
109static ssize_t fill_event_metadata(struct fsnotify_group *group, 109static int fill_event_metadata(struct fsnotify_group *group,
110 struct fanotify_event_metadata *metadata, 110 struct fanotify_event_metadata *metadata,
111 struct fsnotify_event *event) 111 struct fsnotify_event *event)
112{ 112{
113 int ret = 0;
114
113 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
114 group, metadata, event); 116 group, metadata, event);
115 117
116 metadata->event_len = FAN_EVENT_METADATA_LEN; 118 metadata->event_len = FAN_EVENT_METADATA_LEN;
119 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
117 metadata->vers = FANOTIFY_METADATA_VERSION; 120 metadata->vers = FANOTIFY_METADATA_VERSION;
118 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 121 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
119 metadata->pid = pid_vnr(event->tgid); 122 metadata->pid = pid_vnr(event->tgid);
120 metadata->fd = create_fd(group, event); 123 if (unlikely(event->mask & FAN_Q_OVERFLOW))
124 metadata->fd = FAN_NOFD;
125 else {
126 metadata->fd = create_fd(group, event);
127 if (metadata->fd < 0)
128 ret = metadata->fd;
129 }
121 130
122 return metadata->fd; 131 return ret;
123} 132}
124 133
125#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 134#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
@@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
200 209
201 mutex_lock(&group->fanotify_data.access_mutex); 210 mutex_lock(&group->fanotify_data.access_mutex);
202 211
203 if (group->fanotify_data.bypass_perm) { 212 if (atomic_read(&group->fanotify_data.bypass_perm)) {
204 mutex_unlock(&group->fanotify_data.access_mutex); 213 mutex_unlock(&group->fanotify_data.access_mutex);
205 kmem_cache_free(fanotify_response_event_cache, re); 214 kmem_cache_free(fanotify_response_event_cache, re);
206 event->response = FAN_ALLOW; 215 event->response = FAN_ALLOW;
@@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
257 266
258 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 267 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
259 268
260 fd = fill_event_metadata(group, &fanotify_event_metadata, event); 269 ret = fill_event_metadata(group, &fanotify_event_metadata, event);
261 if (fd < 0) 270 if (ret < 0)
262 return fd; 271 goto out;
263 272
273 fd = fanotify_event_metadata.fd;
264 ret = prepare_for_access_response(group, event, fd); 274 ret = prepare_for_access_response(group, event, fd);
265 if (ret) 275 if (ret)
266 goto out_close_fd; 276 goto out_close_fd;
267 277
268 ret = -EFAULT; 278 ret = -EFAULT;
269 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) 279 if (copy_to_user(buf, &fanotify_event_metadata,
280 fanotify_event_metadata.event_len))
270 goto out_kill_access_response; 281 goto out_kill_access_response;
271 282
272 return FAN_EVENT_METADATA_LEN; 283 return fanotify_event_metadata.event_len;
273 284
274out_kill_access_response: 285out_kill_access_response:
275 remove_access_response(group, event, fd); 286 remove_access_response(group, event, fd);
276out_close_fd: 287out_close_fd:
277 sys_close(fd); 288 if (fd != FAN_NOFD)
289 sys_close(fd);
290out:
291#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
292 if (event->mask & FAN_ALL_PERM_EVENTS) {
293 event->response = FAN_DENY;
294 wake_up(&group->fanotify_data.access_waitq);
295 }
296#endif
278 return ret; 297 return ret;
279} 298}
280 299
@@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
382 401
383 mutex_lock(&group->fanotify_data.access_mutex); 402 mutex_lock(&group->fanotify_data.access_mutex);
384 403
385 group->fanotify_data.bypass_perm = true; 404 atomic_inc(&group->fanotify_data.bypass_perm);
386 405
387 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 406 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
388 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 407 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
@@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
586{ 605{
587 struct fsnotify_mark *fsn_mark; 606 struct fsnotify_mark *fsn_mark;
588 __u32 added; 607 __u32 added;
608 int ret = 0;
589 609
590 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 610 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
591 if (!fsn_mark) { 611 if (!fsn_mark) {
592 int ret;
593
594 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 612 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
595 return -ENOSPC; 613 return -ENOSPC;
596 614
@@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
600 618
601 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 619 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
602 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); 620 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
603 if (ret) { 621 if (ret)
604 fanotify_free_mark(fsn_mark); 622 goto err;
605 return ret;
606 }
607 } 623 }
608 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 624 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
609 fsnotify_put_mark(fsn_mark); 625
610 if (added & ~mnt->mnt_fsnotify_mask) 626 if (added & ~mnt->mnt_fsnotify_mask)
611 fsnotify_recalc_vfsmount_mask(mnt); 627 fsnotify_recalc_vfsmount_mask(mnt);
612 628err:
613 return 0; 629 fsnotify_put_mark(fsn_mark);
630 return ret;
614} 631}
615 632
616static int fanotify_add_inode_mark(struct fsnotify_group *group, 633static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
619{ 636{
620 struct fsnotify_mark *fsn_mark; 637 struct fsnotify_mark *fsn_mark;
621 __u32 added; 638 __u32 added;
639 int ret = 0;
622 640
623 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 641 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
624 642
@@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
634 652
635 fsn_mark = fsnotify_find_inode_mark(group, inode); 653 fsn_mark = fsnotify_find_inode_mark(group, inode);
636 if (!fsn_mark) { 654 if (!fsn_mark) {
637 int ret;
638
639 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 655 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
640 return -ENOSPC; 656 return -ENOSPC;
641 657
@@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
645 661
646 fsnotify_init_mark(fsn_mark, fanotify_free_mark); 662 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
647 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); 663 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
648 if (ret) { 664 if (ret)
649 fanotify_free_mark(fsn_mark); 665 goto err;
650 return ret;
651 }
652 } 666 }
653 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 667 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
654 fsnotify_put_mark(fsn_mark); 668
655 if (added & ~inode->i_fsnotify_mask) 669 if (added & ~inode->i_fsnotify_mask)
656 fsnotify_recalc_inode_mask(inode); 670 fsnotify_recalc_inode_mask(inode);
657 return 0; 671err:
672 fsnotify_put_mark(fsn_mark);
673 return ret;
658} 674}
659 675
660/* fanotify syscalls */ 676/* fanotify syscalls */
@@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
687 703
688 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 704 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
689 group = fsnotify_alloc_group(&fanotify_fsnotify_ops); 705 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
690 if (IS_ERR(group)) 706 if (IS_ERR(group)) {
707 free_uid(user);
691 return PTR_ERR(group); 708 return PTR_ERR(group);
709 }
692 710
693 group->fanotify_data.user = user; 711 group->fanotify_data.user = user;
694 atomic_inc(&user->fanotify_listeners); 712 atomic_inc(&user->fanotify_listeners);
@@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
698 mutex_init(&group->fanotify_data.access_mutex); 716 mutex_init(&group->fanotify_data.access_mutex);
699 init_waitqueue_head(&group->fanotify_data.access_waitq); 717 init_waitqueue_head(&group->fanotify_data.access_waitq);
700 INIT_LIST_HEAD(&group->fanotify_data.access_list); 718 INIT_LIST_HEAD(&group->fanotify_data.access_list);
719 atomic_set(&group->fanotify_data.bypass_perm, 0);
701#endif 720#endif
702 switch (flags & FAN_ALL_CLASS_BITS) { 721 switch (flags & FAN_ALL_CLASS_BITS) {
703 case FAN_CLASS_NOTIF: 722 case FAN_CLASS_NOTIF:
@@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
764 if (flags & ~FAN_ALL_MARK_FLAGS) 783 if (flags & ~FAN_ALL_MARK_FLAGS)
765 return -EINVAL; 784 return -EINVAL;
766 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { 785 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
767 case FAN_MARK_ADD: 786 case FAN_MARK_ADD: /* fallthrough */
768 case FAN_MARK_REMOVE: 787 case FAN_MARK_REMOVE:
788 if (!mask)
789 return -EINVAL;
769 case FAN_MARK_FLUSH: 790 case FAN_MARK_FLUSH:
770 break; 791 break;
771 default: 792 default:
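The fanotify_mark() hunk above rejects an empty mask for ADD and REMOVE while still accepting FLUSH, and it does so with deliberate case fallthrough: ADD falls into REMOVE's check, and a valid ADD or REMOVE then falls into FLUSH's break. A compact standalone illustration of that control flow (argument validation only; the constants are invented and this is not the fanotify ABI):

#include <stdio.h>

enum { MARK_ADD = 1, MARK_REMOVE = 2, MARK_FLUSH = 4 };

static int validate(unsigned int op, unsigned long long mask)
{
        switch (op) {
        case MARK_ADD:          /* fallthrough */
        case MARK_REMOVE:
                if (!mask)
                        return -1;      /* ADD/REMOVE need a mask */
                /* fallthrough: a valid ADD/REMOVE continues like FLUSH */
        case MARK_FLUSH:
                break;                  /* FLUSH ignores the mask */
        default:
                return -1;              /* unknown operation */
        }
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               validate(MARK_ADD, 0),      /* -1: no mask given */
               validate(MARK_FLUSH, 0),    /*  0: mask not required */
               validate(MARK_REMOVE, 1));  /*  0: valid request */
        return 0;
}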
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 20dc218707ca..79b47cbb5cd8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -59,7 +59,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
59 /* determine if the children should tell inode about their events */ 59 /* determine if the children should tell inode about their events */
60 watched = fsnotify_inode_watches_children(inode); 60 watched = fsnotify_inode_watches_children(inode);
61 61
62 spin_lock(&dcache_lock); 62 spin_lock(&inode->i_lock);
63 /* run all of the dentries associated with this inode. Since this is a 63 /* run all of the dentries associated with this inode. Since this is a
64 * directory, there damn well better only be one item on this list */ 64 * directory, there damn well better only be one item on this list */
65 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 65 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
@@ -68,19 +68,21 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
68 /* run all of the children of the original inode and fix their 68 /* run all of the children of the original inode and fix their
69 * d_flags to indicate parental interest (their parent is the 69 * d_flags to indicate parental interest (their parent is the
70 * original inode) */ 70 * original inode) */
71 spin_lock(&alias->d_lock);
71 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { 72 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
72 if (!child->d_inode) 73 if (!child->d_inode)
73 continue; 74 continue;
74 75
75 spin_lock(&child->d_lock); 76 spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
76 if (watched) 77 if (watched)
77 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; 78 child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
78 else 79 else
79 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; 80 child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
80 spin_unlock(&child->d_lock); 81 spin_unlock(&child->d_lock);
81 } 82 }
83 spin_unlock(&alias->d_lock);
82 } 84 }
83 spin_unlock(&dcache_lock); 85 spin_unlock(&inode->i_lock);
84} 86}
85 87
86/* Notify this dentry's parent about a child's events. */ 88/* Notify this dentry's parent about a child's events. */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 444c305a468c..4cd5d5d78f9f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
752 if (ret >= 0) 752 if (ret >= 0)
753 return ret; 753 return ret;
754 754
755 fsnotify_put_group(group);
755 atomic_dec(&user->inotify_devs); 756 atomic_dec(&user->inotify_devs);
756out_free_uid: 757out_free_uid:
757 free_uid(user); 758 free_uid(user);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 93622b175fc7..a627ed82c0a3 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -332,6 +332,13 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
332 return NULL; 332 return NULL;
333} 333}
334 334
335static void ntfs_i_callback(struct rcu_head *head)
336{
337 struct inode *inode = container_of(head, struct inode, i_rcu);
338 INIT_LIST_HEAD(&inode->i_dentry);
339 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
340}
341
335void ntfs_destroy_big_inode(struct inode *inode) 342void ntfs_destroy_big_inode(struct inode *inode)
336{ 343{
337 ntfs_inode *ni = NTFS_I(inode); 344 ntfs_inode *ni = NTFS_I(inode);
@@ -340,7 +347,7 @@ void ntfs_destroy_big_inode(struct inode *inode)
340 BUG_ON(ni->page); 347 BUG_ON(ni->page);
341 if (!atomic_dec_and_test(&ni->count)) 348 if (!atomic_dec_and_test(&ni->count))
342 BUG(); 349 BUG();
343 kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); 350 call_rcu(&inode->i_rcu, ntfs_i_callback);
344} 351}
345 352
346static inline ntfs_inode *ntfs_alloc_extent_inode(void) 353static inline ntfs_inode *ntfs_alloc_extent_inode(void)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 391915093fe1..704f6b1742f3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -291,13 +291,17 @@ static int ocfs2_set_acl(handle_t *handle,
291 return ret; 291 return ret;
292} 292}
293 293
294int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
295{ 295{
296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 296 struct ocfs2_super *osb;
297 struct buffer_head *di_bh = NULL; 297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl; 298 struct posix_acl *acl;
299 int ret = -EAGAIN; 299 int ret = -EAGAIN;
300 300
301 if (flags & IPERM_FLAG_RCU)
302 return -ECHILD;
303
304 osb = OCFS2_SB(inode->i_sb);
301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 305 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret; 306 return ret;
303 307
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f05853..4fe7c9cf4bfb 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29extern int ocfs2_check_acl(struct inode *, int); 29extern int ocfs2_check_acl(struct inode *, int, unsigned int);
30extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
32 struct buffer_head *, struct buffer_head *, 32 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b73..0d7c5540ad66 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
573 /* this io's submitter should not have unlocked this before we could */ 573 /* this io's submitter should not have unlocked this before we could */
574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 574 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
575 575
576 if (ocfs2_iocb_is_sem_locked(iocb)) {
577 up_read(&inode->i_alloc_sem);
578 ocfs2_iocb_clear_sem_locked(iocb);
579 }
580
576 ocfs2_iocb_clear_rw_locked(iocb); 581 ocfs2_iocb_clear_rw_locked(iocb);
577 582
578 level = ocfs2_iocb_rw_locked_level(iocb); 583 level = ocfs2_iocb_rw_locked_level(iocb);
579 if (!level)
580 up_read(&inode->i_alloc_sem);
581 ocfs2_rw_unlock(inode, level); 584 ocfs2_rw_unlock(inode, level);
582 585
583 if (is_async) 586 if (is_async)
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 76bfdfda691a..eceb456037c1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
68 else 68 else
69 clear_bit(1, (unsigned long *)&iocb->private); 69 clear_bit(1, (unsigned long *)&iocb->private);
70} 70}
71
72/*
73 * Using a named enum representing lock types in terms of #N bit stored in
74 * iocb->private, which is going to be used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */
77enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_NUM_LOCKS
82};
83
71#define ocfs2_iocb_clear_rw_locked(iocb) \ 84#define ocfs2_iocb_clear_rw_locked(iocb) \
72 clear_bit(0, (unsigned long *)&iocb->private) 85 clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
73#define ocfs2_iocb_rw_locked_level(iocb) \ 86#define ocfs2_iocb_rw_locked_level(iocb) \
74 test_bit(1, (unsigned long *)&iocb->private) 87 test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
88#define ocfs2_iocb_set_sem_locked(iocb) \
89 set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
90#define ocfs2_iocb_clear_sem_locked(iocb) \
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
75#endif /* OCFS2_FILE_H */ 94#endif /* OCFS2_FILE_H */
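The aops.h hunk above names the bits that ocfs2 packs into iocb->private with an enum instead of bare 0/1 constants, and adds a third bit tracking i_alloc_sem state. The underlying idea is independent flag bits stored in one unsigned long; a small non-atomic userspace sketch of the same bookkeeping follows (the kernel uses the atomic set_bit/test_bit/clear_bit helpers instead of these plain functions):

#include <stdio.h>

enum iocb_lock_bits {           /* mirrors the enum added in the hunk */
        IOCB_RW_LOCK = 0,
        IOCB_RW_LOCK_LEVEL,
        IOCB_SEM,
};

static void flag_set(unsigned long *word, int bit)   { *word |=  (1UL << bit); }
static void flag_clear(unsigned long *word, int bit) { *word &= ~(1UL << bit); }
static int  flag_test(unsigned long *word, int bit)  { return !!(*word & (1UL << bit)); }

int main(void)
{
        unsigned long private = 0;   /* plays the role of iocb->private */

        flag_set(&private, IOCB_RW_LOCK);
        flag_set(&private, IOCB_SEM);
        flag_clear(&private, IOCB_RW_LOCK);

        printf("rw=%d level=%d sem=%d\n",
               flag_test(&private, IOCB_RW_LOCK),
               flag_test(&private, IOCB_RW_LOCK_LEVEL),
               flag_test(&private, IOCB_SEM));
        return 0;
}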
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e25..9e3d45bcb5fd 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -307,8 +307,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
307 307
308static void o2hb_disarm_write_timeout(struct o2hb_region *reg) 308static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
309{ 309{
310 cancel_delayed_work(&reg->hr_write_timeout_work); 310 cancel_delayed_work_sync(&reg->hr_write_timeout_work);
311 flush_scheduled_work();
312} 311}
313 312
314static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) 313static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1964,8 +1963,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1964 if (reg == NULL) 1963 if (reg == NULL)
1965 return ERR_PTR(-ENOMEM); 1964 return ERR_PTR(-ENOMEM);
1966 1965
1967 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) 1966 if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
1968 return ERR_PTR(-ENAMETOOLONG); 1967 ret = -ENAMETOOLONG;
1968 goto free;
1969 }
1969 1970
1970 spin_lock(&o2hb_live_lock); 1971 spin_lock(&o2hb_live_lock);
1971 reg->hr_region_num = 0; 1972 reg->hr_region_num = 0;
@@ -1974,7 +1975,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1974 O2NM_MAX_REGIONS); 1975 O2NM_MAX_REGIONS);
1975 if (reg->hr_region_num >= O2NM_MAX_REGIONS) { 1976 if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
1976 spin_unlock(&o2hb_live_lock); 1977 spin_unlock(&o2hb_live_lock);
1977 return ERR_PTR(-EFBIG); 1978 ret = -EFBIG;
1979 goto free;
1978 } 1980 }
1979 set_bit(reg->hr_region_num, o2hb_region_bitmap); 1981 set_bit(reg->hr_region_num, o2hb_region_bitmap);
1980 } 1982 }
@@ -1986,10 +1988,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
1986 ret = o2hb_debug_region_init(reg, o2hb_debug_dir); 1988 ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
1987 if (ret) { 1989 if (ret) {
1988 config_item_put(&reg->hr_item); 1990 config_item_put(&reg->hr_item);
1989 return ERR_PTR(ret); 1991 goto free;
1990 } 1992 }
1991 1993
1992 return &reg->hr_item; 1994 return &reg->hr_item;
1995free:
1996 kfree(reg);
1997 return ERR_PTR(ret);
1993} 1998}
1994 1999
1995static void o2hb_heartbeat_group_drop_item(struct config_group *group, 2000static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c7fba396392d..6c61771469af 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS), 115 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER),
116 define_mask(ERROR), 118 define_mask(ERROR),
117 define_mask(NOTICE), 119 define_mask(NOTICE),
118 define_mask(KTHREAD), 120 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
120}; 121};
121 122
122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 123static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ea2ed9f56c94..34d6544357d9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,7 +81,7 @@
81#include <linux/sched.h> 81#include <linux/sched.h>
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update mlog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_EXIT 0x0000000000000002ULL /* func call exit */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
@@ -114,13 +114,14 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ 117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120
118/* bits that are infrequently given and frequently matched in the high word */ 121/* bits that are infrequently given and frequently matched in the high word */
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 123#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
123#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
124 125
125#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
126#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cf3e16696216..a87366750f23 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -325,5 +325,7 @@ void o2quo_init(void)
325 325
326void o2quo_exit(void) 326void o2quo_exit(void)
327{ 327{
328 flush_scheduled_work(); 328 struct o2quo_state *qs = &o2quo_state;
329
330 flush_work_sync(&qs->qs_work);
329} 331}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index edaded48e7e9..6d80ecc7834f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -52,9 +52,15 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
52static int ocfs2_dentry_revalidate(struct dentry *dentry, 52static int ocfs2_dentry_revalidate(struct dentry *dentry,
53 struct nameidata *nd) 53 struct nameidata *nd)
54{ 54{
55 struct inode *inode = dentry->d_inode; 55 struct inode *inode;
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 57 struct ocfs2_super *osb;
58
59 if (nd->flags & LOOKUP_RCU)
60 return -ECHILD;
61
62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb);
58 64
59 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 mlog_entry("(0x%p, '%.*s')\n", dentry,
60 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.len, dentry->d_name.name);
@@ -169,23 +175,25 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
169 struct list_head *p; 175 struct list_head *p;
170 struct dentry *dentry = NULL; 176 struct dentry *dentry = NULL;
171 177
172 spin_lock(&dcache_lock); 178 spin_lock(&inode->i_lock);
173
174 list_for_each(p, &inode->i_dentry) { 179 list_for_each(p, &inode->i_dentry) {
175 dentry = list_entry(p, struct dentry, d_alias); 180 dentry = list_entry(p, struct dentry, d_alias);
176 181
182 spin_lock(&dentry->d_lock);
177 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
178 mlog(0, "dentry found: %.*s\n", 184 mlog(0, "dentry found: %.*s\n",
179 dentry->d_name.len, dentry->d_name.name); 185 dentry->d_name.len, dentry->d_name.name);
180 186
181 dget_locked(dentry); 187 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock);
182 break; 189 break;
183 } 190 }
191 spin_unlock(&dentry->d_lock);
184 192
185 dentry = NULL; 193 dentry = NULL;
186 } 194 }
187 195
188 spin_unlock(&dcache_lock); 196 spin_unlock(&inode->i_lock);
189 197
190 return dentry; 198 return dentry;
191} 199}
@@ -476,7 +484,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
476 484
477out: 485out:
478 iput(inode); 486 iput(inode);
479 ocfs2_dentry_attach_gen(dentry);
480} 487}
481 488
482/* 489/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c49f6de0e7ab..d417b3f9b0c7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2461 2461
2462 di->i_dx_root = cpu_to_le64(dr_blkno); 2462 di->i_dx_root = cpu_to_le64(dr_blkno);
2463 2463
2464 spin_lock(&OCFS2_I(dir)->ip_lock);
2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2465 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2466 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2467 spin_unlock(&OCFS2_I(dir)->ip_lock);
2466 2468
2467 ocfs2_journal_dirty(handle, di_bh); 2469 ocfs2_journal_dirty(handle, di_bh);
2468 2470
@@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4466 goto out_commit; 4468 goto out_commit;
4467 } 4469 }
4468 4470
4471 spin_lock(&OCFS2_I(dir)->ip_lock);
4469 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; 4472 OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4470 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 4473 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4474 spin_unlock(&OCFS2_I(dir)->ip_lock);
4471 di->i_dx_root = cpu_to_le64(0ULL); 4475 di->i_dx_root = cpu_to_le64(0ULL);
4472 4476
4473 ocfs2_journal_dirty(handle, di_bh); 4477 ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 58a93b953735..cc2aaa96cfe5 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
959 r += O2HB_MAX_REGION_NAME_LEN; 959 r += O2HB_MAX_REGION_NAME_LEN;
960 } 960 }
961 961
962 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); 962 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
963 if (!local) { 963 if (!local) {
964 status = -ENOMEM; 964 status = -ENOMEM;
965 goto bail; 965 goto bail;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f564b0e5f80d..59f0f6bdfc62 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2346 */ 2346 */
2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2347static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2348 struct dlm_lock_resource *res, 2348 struct dlm_lock_resource *res,
2349 int *numlocks) 2349 int *numlocks,
2350 int *hasrefs)
2350{ 2351{
2351 int ret; 2352 int ret;
2352 int i; 2353 int i;
@@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2356 2357
2357 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2358 2359
2360 *numlocks = 0;
2361 *hasrefs = 0;
2362
2359 ret = -EINVAL; 2363 ret = -EINVAL;
2360 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 2364 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2361 mlog(0, "cannot migrate lockres with unknown owner!\n"); 2365 mlog(0, "cannot migrate lockres with unknown owner!\n");
@@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2386 } 2390 }
2387 2391
2388 *numlocks = count; 2392 *numlocks = count;
2389 mlog(0, "migrateable lockres having %d locks\n", *numlocks); 2393
2394 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2395 if (count < O2NM_MAX_NODES)
2396 *hasrefs = 1;
2397
2398 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
2399 res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
2390 2400
2391leave: 2401leave:
2392 return ret; 2402 return ret;
@@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2408 const char *name; 2418 const char *name;
2409 unsigned int namelen; 2419 unsigned int namelen;
2410 int mle_added = 0; 2420 int mle_added = 0;
2411 int numlocks; 2421 int numlocks, hasrefs;
2412 int wake = 0; 2422 int wake = 0;
2413 2423
2414 if (!dlm_grab(dlm)) 2424 if (!dlm_grab(dlm))
@@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2417 name = res->lockname.name; 2427 name = res->lockname.name;
2418 namelen = res->lockname.len; 2428 namelen = res->lockname.len;
2419 2429
2420 mlog(0, "migrating %.*s to %u\n", namelen, name, target); 2430 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
2421 2431
2422 /* 2432 /*
2423 * ensure this lockres is a proper candidate for migration 2433 * ensure this lockres is a proper candidate for migration
2424 */ 2434 */
2425 spin_lock(&res->spinlock); 2435 spin_lock(&res->spinlock);
2426 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2436 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2427 if (ret < 0) { 2437 if (ret < 0) {
2428 spin_unlock(&res->spinlock); 2438 spin_unlock(&res->spinlock);
2429 goto leave; 2439 goto leave;
@@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2431 spin_unlock(&res->spinlock); 2441 spin_unlock(&res->spinlock);
2432 2442
2433 /* no work to do */ 2443 /* no work to do */
2434 if (numlocks == 0) { 2444 if (numlocks == 0 && !hasrefs)
2435 mlog(0, "no locks were found on this lockres! done!\n");
2436 goto leave; 2445 goto leave;
2437 }
2438 2446
2439 /* 2447 /*
2440 * preallocate up front 2448 * preallocate up front
@@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2459 * find a node to migrate the lockres to 2467 * find a node to migrate the lockres to
2460 */ 2468 */
2461 2469
2462 mlog(0, "picking a migration node\n");
2463 spin_lock(&dlm->spinlock); 2470 spin_lock(&dlm->spinlock);
2464 /* pick a new node */ 2471 /* pick a new node */
2465 if (!test_bit(target, dlm->domain_map) || 2472 if (!test_bit(target, dlm->domain_map) ||
2466 target >= O2NM_MAX_NODES) { 2473 target >= O2NM_MAX_NODES) {
2467 target = dlm_pick_migration_target(dlm, res); 2474 target = dlm_pick_migration_target(dlm, res);
2468 } 2475 }
2469 mlog(0, "node %u chosen for migration\n", target); 2476 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2477 namelen, name, target);
2470 2478
2471 if (target >= O2NM_MAX_NODES || 2479 if (target >= O2NM_MAX_NODES ||
2472 !test_bit(target, dlm->domain_map)) { 2480 !test_bit(target, dlm->domain_map)) {
@@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2667{ 2675{
2668 int ret; 2676 int ret;
2669 int lock_dropped = 0; 2677 int lock_dropped = 0;
2670 int numlocks; 2678 int numlocks, hasrefs;
2671 2679
2672 spin_lock(&res->spinlock); 2680 spin_lock(&res->spinlock);
2673 if (res->owner != dlm->node_num) { 2681 if (res->owner != dlm->node_num) {
@@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2681 } 2689 }
2682 2690
2683 /* No need to migrate a lockres having no locks */ 2691 /* No need to migrate a lockres having no locks */
2684 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); 2692 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2685 if (ret >= 0 && numlocks == 0) { 2693 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2686 spin_unlock(&res->spinlock); 2694 spin_unlock(&res->spinlock);
2687 goto leave; 2695 goto leave;
2688 } 2696 }
@@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2915 } 2923 }
2916 queue++; 2924 queue++;
2917 } 2925 }
2926
2927 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2928 if (nodenum < O2NM_MAX_NODES) {
2929 spin_unlock(&res->spinlock);
2930 return nodenum;
2931 }
2918 spin_unlock(&res->spinlock); 2932 spin_unlock(&res->spinlock);
2919 mlog(0, "have not found a suitable target yet! checking domain map\n"); 2933 mlog(0, "have not found a suitable target yet! checking domain map\n");
2920 2934
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2df490a19ed..8c5c0eddc365 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -351,11 +351,18 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
351 return &ip->ip_vfs_inode; 351 return &ip->ip_vfs_inode;
352} 352}
353 353
354static void dlmfs_destroy_inode(struct inode *inode) 354static void dlmfs_i_callback(struct rcu_head *head)
355{ 355{
356 struct inode *inode = container_of(head, struct inode, i_rcu);
357 INIT_LIST_HEAD(&inode->i_dentry);
356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 358 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
357} 359}
358 360
361static void dlmfs_destroy_inode(struct inode *inode)
362{
363 call_rcu(&inode->i_rcu, dlmfs_i_callback);
364}
365
359static void dlmfs_evict_inode(struct inode *inode) 366static void dlmfs_evict_inode(struct inode *inode)
360{ 367{
361 int status; 368 int status;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af3..6adafa576065 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -138,7 +138,7 @@ check_gen:
138 138
139 result = d_obtain_alias(inode); 139 result = d_obtain_alias(inode);
140 if (!IS_ERR(result)) 140 if (!IS_ERR(result))
141 result->d_op = &ocfs2_dentry_ops; 141 d_set_d_op(result, &ocfs2_dentry_ops);
142 else 142 else
143 mlog_errno(PTR_ERR(result)); 143 mlog_errno(PTR_ERR(result));
144 144
@@ -176,7 +176,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
176 176
177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); 177 parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
178 if (!IS_ERR(parent)) 178 if (!IS_ERR(parent))
179 parent->d_op = &ocfs2_dentry_ops; 179 d_set_d_op(parent, &ocfs2_dentry_ops);
180 180
181bail_unlock: 181bail_unlock:
182 ocfs2_inode_unlock(dir, 0); 182 ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 77b4c04a2809..bdadbae09094 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1307,10 +1307,13 @@ bail:
1307 return err; 1307 return err;
1308} 1308}
1309 1309
1310int ocfs2_permission(struct inode *inode, int mask) 1310int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1311{ 1311{
1312 int ret; 1312 int ret;
1313 1313
1314 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD;
1316
1314 mlog_entry_void(); 1317 mlog_entry_void();
1315 1318
1316 ret = ocfs2_inode_lock(inode, NULL, 0); 1319 ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1320,7 +1323,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1320 goto out; 1323 goto out;
1321 } 1324 }
1322 1325
1323 ret = generic_permission(inode, mask, ocfs2_check_acl); 1326 ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
1324 1327
1325 ocfs2_inode_unlock(inode, 0); 1328 ocfs2_inode_unlock(inode, 0);
1326out: 1329out:
@@ -2241,11 +2244,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2241 2244
2242 mutex_lock(&inode->i_mutex); 2245 mutex_lock(&inode->i_mutex);
2243 2246
2247 ocfs2_iocb_clear_sem_locked(iocb);
2248
2244relock: 2249relock:
2245 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2250 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2246 if (direct_io) { 2251 if (direct_io) {
2247 down_read(&inode->i_alloc_sem); 2252 down_read(&inode->i_alloc_sem);
2248 have_alloc_sem = 1; 2253 have_alloc_sem = 1;
2254 /* communicate with ocfs2_dio_end_io */
2255 ocfs2_iocb_set_sem_locked(iocb);
2249 } 2256 }
2250 2257
2251 /* 2258 /*
@@ -2382,8 +2389,10 @@ out:
2382 ocfs2_rw_unlock(inode, rw_level); 2389 ocfs2_rw_unlock(inode, rw_level);
2383 2390
2384out_sems: 2391out_sems:
2385 if (have_alloc_sem) 2392 if (have_alloc_sem) {
2386 up_read(&inode->i_alloc_sem); 2393 up_read(&inode->i_alloc_sem);
2394 ocfs2_iocb_clear_sem_locked(iocb);
2395 }
2387 2396
2388 mutex_unlock(&inode->i_mutex); 2397 mutex_unlock(&inode->i_mutex);
2389 2398
@@ -2527,6 +2536,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2527 goto bail; 2536 goto bail;
2528 } 2537 }
2529 2538
2539 ocfs2_iocb_clear_sem_locked(iocb);
2540
2530 /* 2541 /*
2531 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2542 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2532 * need locks to protect pending reads from racing with truncate. 2543 * need locks to protect pending reads from racing with truncate.
@@ -2534,6 +2545,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2534 if (filp->f_flags & O_DIRECT) { 2545 if (filp->f_flags & O_DIRECT) {
2535 down_read(&inode->i_alloc_sem); 2546 down_read(&inode->i_alloc_sem);
2536 have_alloc_sem = 1; 2547 have_alloc_sem = 1;
2548 ocfs2_iocb_set_sem_locked(iocb);
2537 2549
2538 ret = ocfs2_rw_lock(inode, 0); 2550 ret = ocfs2_rw_lock(inode, 0);
2539 if (ret < 0) { 2551 if (ret < 0) {
@@ -2575,8 +2587,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2575 } 2587 }
2576 2588
2577bail: 2589bail:
2578 if (have_alloc_sem) 2590 if (have_alloc_sem) {
2579 up_read(&inode->i_alloc_sem); 2591 up_read(&inode->i_alloc_sem);
2592 ocfs2_iocb_clear_sem_locked(iocb);
2593 }
2580 if (rw_level != -1) 2594 if (rw_level != -1)
2581 ocfs2_rw_unlock(inode, rw_level); 2595 ocfs2_rw_unlock(inode, rw_level);
2582 mlog_exit(ret); 2596 mlog_exit(ret);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..f5afbbef6703 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 63 struct kstat *stat);
64int ocfs2_permission(struct inode *inode, int mask); 64int ocfs2_permission(struct inode *inode, int mask, unsigned int flags);
65 65
66int ocfs2_should_update_atime(struct inode *inode, 66int ocfs2_should_update_atime(struct inode *inode,
67 struct vfsmount *vfsmnt); 67 struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ff5744e1e36f..d14cad6e2e41 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
147 spin_unlock(&oi->ip_lock); 147 spin_unlock(&oi->ip_lock);
148 148
149bail_add: 149bail_add:
150 dentry->d_op = &ocfs2_dentry_ops; 150 d_set_d_op(dentry, &ocfs2_dentry_ops);
151 ret = d_splice_alias(inode, dentry); 151 ret = d_splice_alias(inode, dentry);
152 152
153 if (inode) { 153 if (inode) {
@@ -415,7 +415,7 @@ static int ocfs2_mknod(struct inode *dir,
415 mlog_errno(status); 415 mlog_errno(status);
416 goto leave; 416 goto leave;
417 } 417 }
418 dentry->d_op = &ocfs2_dentry_ops; 418 d_set_d_op(dentry, &ocfs2_dentry_ops);
419 419
420 status = ocfs2_add_entry(handle, dentry, inode, 420 status = ocfs2_add_entry(handle, dentry, inode,
421 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 421 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +743,7 @@ static int ocfs2_link(struct dentry *old_dentry,
743 } 743 }
744 744
745 ihold(inode); 745 ihold(inode);
746 dentry->d_op = &ocfs2_dentry_ops; 746 d_set_d_op(dentry, &ocfs2_dentry_ops);
747 d_instantiate(dentry, inode); 747 d_instantiate(dentry, inode);
748 748
749out_commit: 749out_commit:
@@ -1794,7 +1794,7 @@ static int ocfs2_symlink(struct inode *dir,
1794 mlog_errno(status); 1794 mlog_errno(status);
1795 goto bail; 1795 goto bail;
1796 } 1796 }
1797 dentry->d_op = &ocfs2_dentry_ops; 1797 d_set_d_op(dentry, &ocfs2_dentry_ops);
1798 1798
1799 status = ocfs2_add_entry(handle, dentry, inode, 1799 status = ocfs2_add_entry(handle, dentry, inode,
1800 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1800 le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2459,7 +2459,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2459 goto out_commit; 2459 goto out_commit;
2460 } 2460 }
2461 2461
2462 dentry->d_op = &ocfs2_dentry_ops; 2462 d_set_d_op(dentry, &ocfs2_dentry_ops);
2463 d_instantiate(dentry, inode); 2463 d_instantiate(dentry, inode);
2464 status = 0; 2464 status = 0;
2465out_commit: 2465out_commit:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d8408217e3bd..70dd3b1798f1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,9 @@ struct ocfs2_lock_res {
159 char l_name[OCFS2_LOCK_ID_MAX_LEN]; 159 char l_name[OCFS2_LOCK_ID_MAX_LEN];
160 unsigned int l_ro_holders; 160 unsigned int l_ro_holders;
161 unsigned int l_ex_holders; 161 unsigned int l_ex_holders;
162 unsigned char l_level; 162 signed char l_level;
163 signed char l_requested;
164 signed char l_blocking;
163 165
164 /* Data packed - type enum ocfs2_lock_type */ 166 /* Data packed - type enum ocfs2_lock_type */
165 unsigned char l_type; 167 unsigned char l_type;
@@ -169,8 +171,6 @@ struct ocfs2_lock_res {
169 unsigned char l_action; 171 unsigned char l_action;
170 /* Data packed - enum type ocfs2_unlock_action */ 172 /* Data packed - enum type ocfs2_unlock_action */
171 unsigned char l_unlock_action; 173 unsigned char l_unlock_action;
172 unsigned char l_requested;
173 unsigned char l_blocking;
174 unsigned int l_pending_gen; 174 unsigned int l_pending_gen;
175 175
176 spinlock_t l_lock; 176 spinlock_t l_lock;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c2e4f8222e2f..bf2e7764920e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -350,7 +350,7 @@ enum {
350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE 350#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
351 NUM_SYSTEM_INODES 351 NUM_SYSTEM_INODES
352}; 352};
353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE 353#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
354#define NUM_LOCAL_SYSTEM_INODES \ 354#define NUM_LOCAL_SYSTEM_INODES \
355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) 355 (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
356 356
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 252e7c82f929..a5ebe421195f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
190 return c; 190 return c;
191 } 191 }
192 192
193 return c; 193 return NULL;
194} 194}
195 195
196/* 196/*
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f02c0ef31578..17ff46fa8a10 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,7 +41,6 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/smp_lock.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -570,11 +569,18 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
570 return &oi->vfs_inode; 569 return &oi->vfs_inode;
571} 570}
572 571
573static void ocfs2_destroy_inode(struct inode *inode) 572static void ocfs2_i_callback(struct rcu_head *head)
574{ 573{
574 struct inode *inode = container_of(head, struct inode, i_rcu);
575 INIT_LIST_HEAD(&inode->i_dentry);
575 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); 576 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
576} 577}
577 578
579static void ocfs2_destroy_inode(struct inode *inode)
580{
581 call_rcu(&inode->i_rcu, ocfs2_i_callback);
582}
583
578static unsigned long long ocfs2_max_file_offset(unsigned int bbits, 584static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
579 unsigned int cbits) 585 unsigned int cbits)
580{ 586{
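Editor's note: the destroy_inode conversions in this series (dlmfs and ocfs2 above; openpromfs, proc, qnx4, reiserfs and romfs below) all follow one pattern: instead of freeing the per-filesystem inode immediately, ->destroy_inode() queues an RCU callback, so a lockless (RCU-walk) path lookup that is still examining the inode cannot touch freed memory. The callback re-initialises i_dentry because i_rcu shares its storage with i_dentry in this kernel's struct inode and the slab constructor only runs when an object is first allocated. A generic kernel-style sketch of the shape; the "foo" names are placeholders, and this mirrors the hunks rather than adding anything new:

/* Kernel-style sketch of the RCU-deferred inode free used throughout
 * this series.  foo_inode_cachep and FOO_I() are placeholders. */
static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* i_rcu overlays i_dentry; restore the list head before the
	 * object is returned to the slab cache. */
	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* Defer the actual free until after an RCU grace period. */
	call_rcu(&inode->i_rcu, foo_i_callback);
}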
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index ddb1f41376e5..a2a5bff774e3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -343,11 +343,18 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
343 return &oi->vfs_inode; 343 return &oi->vfs_inode;
344} 344}
345 345
346static void openprom_destroy_inode(struct inode *inode) 346static void openprom_i_callback(struct rcu_head *head)
347{ 347{
348 struct inode *inode = container_of(head, struct inode, i_rcu);
349 INIT_LIST_HEAD(&inode->i_dentry);
348 kmem_cache_free(op_inode_cachep, OP_I(inode)); 350 kmem_cache_free(op_inode_cachep, OP_I(inode));
349} 351}
350 352
353static void openprom_destroy_inode(struct inode *inode)
354{
355 call_rcu(&inode->i_rcu, openprom_i_callback);
356}
357
351static struct inode *openprom_iget(struct super_block *sb, ino_t ino) 358static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
352{ 359{
353 struct inode *inode; 360 struct inode *inode;
@@ -418,7 +425,7 @@ out_no_root:
418static struct dentry *openprom_mount(struct file_system_type *fs_type, 425static struct dentry *openprom_mount(struct file_system_type *fs_type,
419 int flags, const char *dev_name, void *data) 426 int flags, const char *dev_name, void *data)
420{ 427{
421 return mount_single(fs_type, flags, data, openprom_fill_super) 428 return mount_single(fs_type, flags, data, openprom_fill_super);
422} 429}
423 430
424static struct file_system_type openprom_fs_type = { 431static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index a8012a955720..68f1f8e4e23b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -999,12 +999,12 @@ struct file *create_write_pipe(int flags)
999 goto err; 999 goto err;
1000 1000
1001 err = -ENOMEM; 1001 err = -ENOMEM;
1002 path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 1002 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
1003 if (!path.dentry) 1003 if (!path.dentry)
1004 goto err_inode; 1004 goto err_inode;
1005 path.mnt = mntget(pipe_mnt); 1005 path.mnt = mntget(pipe_mnt);
1006 1006
1007 path.dentry->d_op = &pipefs_dentry_operations; 1007 d_set_d_op(path.dentry, &pipefs_dentry_operations);
1008 d_instantiate(path.dentry, inode); 1008 d_instantiate(path.dentry, inode);
1009 1009
1010 err = -ENFILE; 1010 err = -ENFILE;
@@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1199 return ret; 1199 return ret;
1200} 1200}
1201 1201
1202/*
1203 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1204 * location, so checking ->i_pipe is not enough to verify that this is a
1205 * pipe.
1206 */
1207struct pipe_inode_info *get_pipe_info(struct file *file)
1208{
1209 struct inode *i = file->f_path.dentry->d_inode;
1210
1211 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
1212}
1213
1202long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1214long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1203{ 1215{
1204 struct pipe_inode_info *pipe; 1216 struct pipe_inode_info *pipe;
1205 long ret; 1217 long ret;
1206 1218
1207 pipe = file->f_path.dentry->d_inode->i_pipe; 1219 pipe = get_pipe_info(file);
1208 if (!pipe) 1220 if (!pipe)
1209 return -EBADF; 1221 return -EBADF;
1210 1222
@@ -1241,6 +1253,10 @@ out:
1241 return ret; 1253 return ret;
1242} 1254}
1243 1255
1256static const struct super_operations pipefs_ops = {
1257 .destroy_inode = free_inode_nonrcu,
1258};
1259
1244/* 1260/*
1245 * pipefs should _never_ be mounted by userland - too much of security hassle, 1261 * pipefs should _never_ be mounted by userland - too much of security hassle,
1246 * no real gain from having the whole whorehouse mounted. So we don't need 1262 * no real gain from having the whole whorehouse mounted. So we don't need
@@ -1250,7 +1266,7 @@ out:
1250static struct dentry *pipefs_mount(struct file_system_type *fs_type, 1266static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1251 int flags, const char *dev_name, void *data) 1267 int flags, const char *dev_name, void *data)
1252{ 1268{
1253 return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); 1269 return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC);
1254} 1270}
1255 1271
1256static struct file_system_type pipe_fs_type = { 1272static struct file_system_type pipe_fs_type = {
@@ -1276,7 +1292,7 @@ static int __init init_pipe_fs(void)
1276static void __exit exit_pipe_fs(void) 1292static void __exit exit_pipe_fs(void)
1277{ 1293{
1278 unregister_filesystem(&pipe_fs_type); 1294 unregister_filesystem(&pipe_fs_type);
1279 mntput(pipe_mnt); 1295 mntput_long(pipe_mnt);
1280} 1296}
1281 1297
1282fs_initcall(init_pipe_fs); 1298fs_initcall(init_pipe_fs);
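Editor's note: get_pipe_info() above replaces the open-coded ->i_pipe dereferences. Because i_pipe, i_bdev and i_cdev share the same location in the inode, the helper first confirms the inode is a FIFO with S_ISFIFO() and only then trusts i_pipe. The same "check the mode before trusting the rest" idea can be shown from userspace with fstat(); the program below is purely illustrative and uses no kernel API:

/* Userspace illustration of the S_ISFIFO() guard that get_pipe_info()
 * applies in the kernel: classify an fd by its mode bits. */
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int fd_is_pipe(int fd)
{
	struct stat st;

	if (fstat(fd, &st) != 0)
		return 0;
	return S_ISFIFO(st.st_mode);
}

int main(void)
{
	int fds[2];

	if (pipe(fds) != 0) {
		perror("pipe");
		return 1;
	}
	printf("fds[0] is a pipe: %d\n", fd_is_pipe(fds[0]));
	printf("stdin  is a pipe: %d\n", fd_is_pipe(STDIN_FILENO));
	close(fds[0]);
	close(fds[1]);
	return 0;
}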
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748f..d42514e32380 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
288 */ 288 */
289static inline int do_refcount_check(struct vfsmount *mnt, int count) 289static inline int do_refcount_check(struct vfsmount *mnt, int count)
290{ 290{
291 int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; 291 int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
292 return (mycount > count); 292 return (mycount > count);
293} 293}
294 294
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
300 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
301 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 * 302 *
303 * vfsmount lock must be held for read or write 303 * vfsmount lock must be held for write
304 */ 304 */
305int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
306{ 306{
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 2758e2afc518..288a49e098bf 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -15,6 +15,7 @@ proc-y += devices.o
15proc-y += interrupts.o 15proc-y += interrupts.o
16proc-y += loadavg.o 16proc-y += loadavg.o
17proc-y += meminfo.o 17proc-y += meminfo.o
18proc-y += proc_console.o
18proc-y += stat.o 19proc-y += stat.o
19proc-y += uptime.o 20proc-y += uptime.o
20proc-y += version.o 21proc-y += version.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3d02ca461ec..b20962c71a52 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifdef CONFIG_SCHED_AUTOGROUP
1411/*
1412 * Print out autogroup related information:
1413 */
1414static int sched_autogroup_show(struct seq_file *m, void *v)
1415{
1416 struct inode *inode = m->private;
1417 struct task_struct *p;
1418
1419 p = get_proc_task(inode);
1420 if (!p)
1421 return -ESRCH;
1422 proc_sched_autogroup_show_task(p, m);
1423
1424 put_task_struct(p);
1425
1426 return 0;
1427}
1428
1429static ssize_t
1430sched_autogroup_write(struct file *file, const char __user *buf,
1431 size_t count, loff_t *offset)
1432{
1433 struct inode *inode = file->f_path.dentry->d_inode;
1434 struct task_struct *p;
1435 char buffer[PROC_NUMBUF];
1436 long nice;
1437 int err;
1438
1439 memset(buffer, 0, sizeof(buffer));
1440 if (count > sizeof(buffer) - 1)
1441 count = sizeof(buffer) - 1;
1442 if (copy_from_user(buffer, buf, count))
1443 return -EFAULT;
1444
1445 err = strict_strtol(strstrip(buffer), 0, &nice);
1446 if (err)
1447 return -EINVAL;
1448
1449 p = get_proc_task(inode);
1450 if (!p)
1451 return -ESRCH;
1452
1453 err = nice;
1454 err = proc_sched_autogroup_set_nice(p, &err);
1455 if (err)
1456 count = err;
1457
1458 put_task_struct(p);
1459
1460 return count;
1461}
1462
1463static int sched_autogroup_open(struct inode *inode, struct file *filp)
1464{
1465 int ret;
1466
1467 ret = single_open(filp, sched_autogroup_show, NULL);
1468 if (!ret) {
1469 struct seq_file *m = filp->private_data;
1470
1471 m->private = inode;
1472 }
1473 return ret;
1474}
1475
1476static const struct file_operations proc_pid_sched_autogroup_operations = {
1477 .open = sched_autogroup_open,
1478 .read = seq_read,
1479 .write = sched_autogroup_write,
1480 .llseek = seq_lseek,
1481 .release = single_release,
1482};
1483
1484#endif /* CONFIG_SCHED_AUTOGROUP */
1485
1410static ssize_t comm_write(struct file *file, const char __user *buf, 1486static ssize_t comm_write(struct file *file, const char __user *buf,
1411 size_t count, loff_t *offset) 1487 size_t count, loff_t *offset)
1412{ 1488{
@@ -1574,7 +1650,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1574 if (!tmp) 1650 if (!tmp)
1575 return -ENOMEM; 1651 return -ENOMEM;
1576 1652
1577 pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); 1653 pathname = d_path(path, tmp, PAGE_SIZE);
1578 len = PTR_ERR(pathname); 1654 len = PTR_ERR(pathname);
1579 if (IS_ERR(pathname)) 1655 if (IS_ERR(pathname))
1580 goto out; 1656 goto out;
@@ -1719,10 +1795,16 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1719 */ 1795 */
1720static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1796static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1721{ 1797{
1722 struct inode *inode = dentry->d_inode; 1798 struct inode *inode;
1723 struct task_struct *task = get_proc_task(inode); 1799 struct task_struct *task;
1724 const struct cred *cred; 1800 const struct cred *cred;
1725 1801
1802 if (nd && nd->flags & LOOKUP_RCU)
1803 return -ECHILD;
1804
1805 inode = dentry->d_inode;
1806 task = get_proc_task(inode);
1807
1726 if (task) { 1808 if (task) {
1727 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1809 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1728 task_dumpable(task)) { 1810 task_dumpable(task)) {
@@ -1744,7 +1826,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1744 return 0; 1826 return 0;
1745} 1827}
1746 1828
1747static int pid_delete_dentry(struct dentry * dentry) 1829static int pid_delete_dentry(const struct dentry * dentry)
1748{ 1830{
1749 /* Is the task we represent dead? 1831 /* Is the task we represent dead?
1750 * If so, then don't put the dentry on the lru list, 1832 * If so, then don't put the dentry on the lru list,
@@ -1888,12 +1970,19 @@ static int proc_fd_link(struct inode *inode, struct path *path)
1888 1970
1889static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) 1971static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1890{ 1972{
1891 struct inode *inode = dentry->d_inode; 1973 struct inode *inode;
1892 struct task_struct *task = get_proc_task(inode); 1974 struct task_struct *task;
1893 int fd = proc_fd(inode); 1975 int fd;
1894 struct files_struct *files; 1976 struct files_struct *files;
1895 const struct cred *cred; 1977 const struct cred *cred;
1896 1978
1979 if (nd && nd->flags & LOOKUP_RCU)
1980 return -ECHILD;
1981
1982 inode = dentry->d_inode;
1983 task = get_proc_task(inode);
1984 fd = proc_fd(inode);
1985
1897 if (task) { 1986 if (task) {
1898 files = get_files_struct(task); 1987 files = get_files_struct(task);
1899 if (files) { 1988 if (files) {
@@ -1969,7 +2058,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
1969 inode->i_op = &proc_pid_link_inode_operations; 2058 inode->i_op = &proc_pid_link_inode_operations;
1970 inode->i_size = 64; 2059 inode->i_size = 64;
1971 ei->op.proc_get_link = proc_fd_link; 2060 ei->op.proc_get_link = proc_fd_link;
1972 dentry->d_op = &tid_fd_dentry_operations; 2061 d_set_d_op(dentry, &tid_fd_dentry_operations);
1973 d_add(dentry, inode); 2062 d_add(dentry, inode);
1974 /* Close the race of the process dying before we return the dentry */ 2063 /* Close the race of the process dying before we return the dentry */
1975 if (tid_fd_revalidate(dentry, NULL)) 2064 if (tid_fd_revalidate(dentry, NULL))
@@ -2101,11 +2190,13 @@ static const struct file_operations proc_fd_operations = {
2101 * /proc/pid/fd needs a special permission handler so that a process can still 2190 * /proc/pid/fd needs a special permission handler so that a process can still
2102 * access /proc/self/fd after it has executed a setuid(). 2191 * access /proc/self/fd after it has executed a setuid().
2103 */ 2192 */
2104static int proc_fd_permission(struct inode *inode, int mask) 2193static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
2105{ 2194{
2106 int rv; 2195 int rv;
2107 2196
2108 rv = generic_permission(inode, mask, NULL); 2197 if (flags & IPERM_FLAG_RCU)
2198 return -ECHILD;
2199 rv = generic_permission(inode, mask, flags, NULL);
2109 if (rv == 0) 2200 if (rv == 0)
2110 return 0; 2201 return 0;
2111 if (task_pid(current) == proc_pid(inode)) 2202 if (task_pid(current) == proc_pid(inode))
@@ -2137,7 +2228,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2137 ei->fd = fd; 2228 ei->fd = fd;
2138 inode->i_mode = S_IFREG | S_IRUSR; 2229 inode->i_mode = S_IFREG | S_IRUSR;
2139 inode->i_fop = &proc_fdinfo_file_operations; 2230 inode->i_fop = &proc_fdinfo_file_operations;
2140 dentry->d_op = &tid_fd_dentry_operations; 2231 d_set_d_op(dentry, &tid_fd_dentry_operations);
2141 d_add(dentry, inode); 2232 d_add(dentry, inode);
2142 /* Close the race of the process dying before we return the dentry */ 2233 /* Close the race of the process dying before we return the dentry */
2143 if (tid_fd_revalidate(dentry, NULL)) 2234 if (tid_fd_revalidate(dentry, NULL))
@@ -2196,7 +2287,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2196 if (p->fop) 2287 if (p->fop)
2197 inode->i_fop = p->fop; 2288 inode->i_fop = p->fop;
2198 ei->op = p->op; 2289 ei->op = p->op;
2199 dentry->d_op = &pid_dentry_operations; 2290 d_set_d_op(dentry, &pid_dentry_operations);
2200 d_add(dentry, inode); 2291 d_add(dentry, inode);
2201 /* Close the race of the process dying before we return the dentry */ 2292 /* Close the race of the process dying before we return the dentry */
2202 if (pid_revalidate(dentry, NULL)) 2293 if (pid_revalidate(dentry, NULL))
@@ -2563,8 +2654,14 @@ static const struct pid_entry proc_base_stuff[] = {
2563 */ 2654 */
2564static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) 2655static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2565{ 2656{
2566 struct inode *inode = dentry->d_inode; 2657 struct inode *inode;
2567 struct task_struct *task = get_proc_task(inode); 2658 struct task_struct *task;
2659
2660 if (nd->flags & LOOKUP_RCU)
2661 return -ECHILD;
2662
2663 inode = dentry->d_inode;
2664 task = get_proc_task(inode);
2568 if (task) { 2665 if (task) {
2569 put_task_struct(task); 2666 put_task_struct(task);
2570 return 1; 2667 return 1;
@@ -2615,7 +2712,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2615 if (p->fop) 2712 if (p->fop)
2616 inode->i_fop = p->fop; 2713 inode->i_fop = p->fop;
2617 ei->op = p->op; 2714 ei->op = p->op;
2618 dentry->d_op = &proc_base_dentry_operations; 2715 d_set_d_op(dentry, &proc_base_dentry_operations);
2619 d_add(dentry, inode); 2716 d_add(dentry, inode);
2620 error = NULL; 2717 error = NULL;
2621out: 2718out:
@@ -2733,6 +2830,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2733#ifdef CONFIG_SCHED_DEBUG 2830#ifdef CONFIG_SCHED_DEBUG
2734 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2831 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2735#endif 2832#endif
2833#ifdef CONFIG_SCHED_AUTOGROUP
2834 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
2835#endif
2736 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2836 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2737#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2837#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2738 INF("syscall", S_IRUSR, proc_pid_syscall), 2838 INF("syscall", S_IRUSR, proc_pid_syscall),
@@ -2926,7 +3026,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2926 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff, 3026 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2927 ARRAY_SIZE(tgid_base_stuff)); 3027 ARRAY_SIZE(tgid_base_stuff));
2928 3028
2929 dentry->d_op = &pid_dentry_operations; 3029 d_set_d_op(dentry, &pid_dentry_operations);
2930 3030
2931 d_add(dentry, inode); 3031 d_add(dentry, inode);
2932 /* Close the race of the process dying before we return the dentry */ 3032 /* Close the race of the process dying before we return the dentry */
@@ -3169,7 +3269,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3169 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff, 3269 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
3170 ARRAY_SIZE(tid_base_stuff)); 3270 ARRAY_SIZE(tid_base_stuff));
3171 3271
3172 dentry->d_op = &pid_dentry_operations; 3272 d_set_d_op(dentry, &pid_dentry_operations);
3173 3273
3174 d_add(dentry, inode); 3274 d_add(dentry, inode);
3175 /* Close the race of the process dying before we return the dentry */ 3275 /* Close the race of the process dying before we return the dentry */
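Editor's note: among the fs/proc/base.c changes above is a new per-task file, /proc/<pid>/autogroup. Reading it shows the task's autogroup via proc_sched_autogroup_show_task(); writing an integer adjusts the group's nice value through proc_sched_autogroup_set_nice(). A small userspace sketch of driving that interface, assuming a kernel built with CONFIG_SCHED_AUTOGROUP and with most error handling trimmed:

/* Userspace sketch: read a task's autogroup and renice it via the new
 * /proc/<pid>/autogroup file.  Assumes CONFIG_SCHED_AUTOGROUP. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64], line[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/autogroup", getpid());

	f = fopen(path, "r");
	if (!f) {
		perror(path);	/* e.g. kernel without autogroup support */
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("before: %s", line);
	fclose(f);

	/* The write handler parses a nice value for the task's autogroup. */
	f = fopen(path, "w");
	if (f) {
		fprintf(f, "5\n");
		fclose(f);
	}

	f = fopen(path, "r");
	if (f) {
		if (fgets(line, sizeof(line), f))
			printf("after:  %s", line);
		fclose(f);
	}
	return 0;
}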
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dd29f0337661..f766be29d2c7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -400,7 +400,7 @@ static const struct inode_operations proc_link_inode_operations = {
400 * smarter: we could keep a "volatile" flag in the 400 * smarter: we could keep a "volatile" flag in the
401 * inode to indicate which ones to keep. 401 * inode to indicate which ones to keep.
402 */ 402 */
403static int proc_delete_dentry(struct dentry * dentry) 403static int proc_delete_dentry(const struct dentry * dentry)
404{ 404{
405 return 1; 405 return 1;
406} 406}
@@ -439,7 +439,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
439out_unlock: 439out_unlock:
440 440
441 if (inode) { 441 if (inode) {
442 dentry->d_op = &proc_dentry_operations; 442 d_set_d_op(dentry, &proc_dentry_operations);
443 d_add(dentry, inode); 443 d_add(dentry, inode);
444 return NULL; 444 return NULL;
445 } 445 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9c2b5f484879..6bcb926b101b 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -16,7 +16,6 @@
16#include <linux/limits.h> 16#include <linux/limits.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22 21
@@ -66,11 +65,18 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
66 return inode; 65 return inode;
67} 66}
68 67
69static void proc_destroy_inode(struct inode *inode) 68static void proc_i_callback(struct rcu_head *head)
70{ 69{
70 struct inode *inode = container_of(head, struct inode, i_rcu);
71 INIT_LIST_HEAD(&inode->i_dentry);
71 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 72 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
72} 73}
73 74
75static void proc_destroy_inode(struct inode *inode)
76{
77 call_rcu(&inode->i_rcu, proc_i_callback);
78}
79
74static void init_once(void *foo) 80static void init_once(void *foo)
75{ 81{
76 struct proc_inode *ei = (struct proc_inode *) foo; 82 struct proc_inode *ei = (struct proc_inode *) foo;
diff --git a/fs/proc/proc_console.c b/fs/proc/proc_console.c
new file mode 100644
index 000000000000..8a707609f528
--- /dev/null
+++ b/fs/proc/proc_console.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2010 Werner Fink, Jiri Slaby
3 *
4 * Licensed under GPLv2
5 */
6
7#include <linux/console.h>
8#include <linux/kernel.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/tty_driver.h>
12
13/*
14 * This is handler for /proc/consoles
15 */
16static int show_console_dev(struct seq_file *m, void *v)
17{
18 static const struct {
19 short flag;
20 char name;
21 } con_flags[] = {
22 { CON_ENABLED, 'E' },
23 { CON_CONSDEV, 'C' },
24 { CON_BOOT, 'B' },
25 { CON_PRINTBUFFER, 'p' },
26 { CON_BRL, 'b' },
27 { CON_ANYTIME, 'a' },
28 };
29 char flags[ARRAY_SIZE(con_flags) + 1];
30 struct console *con = v;
31 unsigned int a;
32 int len;
33 dev_t dev = 0;
34
35 if (con->device) {
36 const struct tty_driver *driver;
37 int index;
38 driver = con->device(con, &index);
39 if (driver) {
40 dev = MKDEV(driver->major, driver->minor_start);
41 dev += index;
42 }
43 }
44
45 for (a = 0; a < ARRAY_SIZE(con_flags); a++)
46 flags[a] = (con->flags & con_flags[a].flag) ?
47 con_flags[a].name : ' ';
48 flags[a] = 0;
49
50 seq_printf(m, "%s%d%n", con->name, con->index, &len);
51 len = 21 - len;
52 if (len < 1)
53 len = 1;
54 seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
55 con->write ? 'W' : '-', con->unblank ? 'U' : '-',
56 flags);
57 if (dev)
58 seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
59
60 seq_printf(m, "\n");
61
62 return 0;
63}
64
65static void *c_start(struct seq_file *m, loff_t *pos)
66{
67 struct console *con;
68 loff_t off = 0;
69
70 acquire_console_sem();
71 for_each_console(con)
72 if (off++ == *pos)
73 break;
74
75 return con;
76}
77
78static void *c_next(struct seq_file *m, void *v, loff_t *pos)
79{
80 struct console *con = v;
81 ++*pos;
82 return con->next;
83}
84
85static void c_stop(struct seq_file *m, void *v)
86{
87 release_console_sem();
88}
89
90static const struct seq_operations consoles_op = {
91 .start = c_start,
92 .next = c_next,
93 .stop = c_stop,
94 .show = show_console_dev
95};
96
97static int consoles_open(struct inode *inode, struct file *file)
98{
99 return seq_open(file, &consoles_op);
100}
101
102static const struct file_operations proc_consoles_operations = {
103 .open = consoles_open,
104 .read = seq_read,
105 .llseek = seq_lseek,
106 .release = seq_release,
107};
108
109static int register_proc_consoles(void)
110{
111 proc_create("consoles", 0, NULL, &proc_consoles_operations);
112 return 0;
113}
114module_init(register_proc_consoles);
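Editor's note: the new fs/proc/proc_console.c above walks the registered console list under the console semaphore and prints one line per console: driver name and index, R/W/U capability characters, a flag string built from con_flags[] (for example 'E' for CON_ENABLED), and the device's major:minor when one exists. A trivial userspace consumer, for illustration only; the exact layout is whatever show_console_dev() emits:

/* Minimal reader for /proc/consoles: echo each line and note whether
 * the 'E' (CON_ENABLED) flag appears in the parenthesised flag field. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/consoles", "r");

	if (!f) {
		perror("/proc/consoles");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char *flags = strchr(line, '(');
		int enabled = flags && strchr(flags, 'E');

		printf("%s%s", enabled ? "[enabled]  " : "[disabled] ", line);
	}
	fclose(f);
	return 0;
}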
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b652cb00906b..09a1f92a34ef 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
5#include <linux/sysctl.h> 5#include <linux/sysctl.h>
6#include <linux/proc_fs.h> 6#include <linux/proc_fs.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/namei.h>
8#include "internal.h" 9#include "internal.h"
9 10
10static const struct dentry_operations proc_sys_dentry_operations; 11static const struct dentry_operations proc_sys_dentry_operations;
@@ -120,7 +121,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
120 goto out; 121 goto out;
121 122
122 err = NULL; 123 err = NULL;
123 dentry->d_op = &proc_sys_dentry_operations; 124 d_set_d_op(dentry, &proc_sys_dentry_operations);
124 d_add(dentry, inode); 125 d_add(dentry, inode);
125 126
126out: 127out:
@@ -201,7 +202,7 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
201 dput(child); 202 dput(child);
202 return -ENOMEM; 203 return -ENOMEM;
203 } else { 204 } else {
204 child->d_op = &proc_sys_dentry_operations; 205 d_set_d_op(child, &proc_sys_dentry_operations);
205 d_add(child, inode); 206 d_add(child, inode);
206 } 207 }
207 } else { 208 } else {
@@ -294,7 +295,7 @@ out:
294 return ret; 295 return ret;
295} 296}
296 297
297static int proc_sys_permission(struct inode *inode, int mask) 298static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags)
298{ 299{
299 /* 300 /*
300 * sysctl entries that are not writeable, 301 * sysctl entries that are not writeable,
@@ -304,6 +305,9 @@ static int proc_sys_permission(struct inode *inode, int mask)
304 struct ctl_table *table; 305 struct ctl_table *table;
305 int error; 306 int error;
306 307
308 if (flags & IPERM_FLAG_RCU)
309 return -ECHILD;
310
307 /* Executable files are not allowed under /proc/sys/ */ 311 /* Executable files are not allowed under /proc/sys/ */
308 if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) 312 if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
309 return -EACCES; 313 return -EACCES;
@@ -389,23 +393,30 @@ static const struct inode_operations proc_sys_dir_operations = {
389 393
390static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) 394static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
391{ 395{
396 if (nd->flags & LOOKUP_RCU)
397 return -ECHILD;
392 return !PROC_I(dentry->d_inode)->sysctl->unregistering; 398 return !PROC_I(dentry->d_inode)->sysctl->unregistering;
393} 399}
394 400
395static int proc_sys_delete(struct dentry *dentry) 401static int proc_sys_delete(const struct dentry *dentry)
396{ 402{
397 return !!PROC_I(dentry->d_inode)->sysctl->unregistering; 403 return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
398} 404}
399 405
400static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, 406static int proc_sys_compare(const struct dentry *parent,
401 struct qstr *name) 407 const struct inode *pinode,
408 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name)
402{ 410{
403 struct dentry *dentry = container_of(qstr, struct dentry, d_name); 411 /* Although proc doesn't have negative dentries, rcu-walk means
404 if (qstr->len != name->len) 412 * that inode here can be NULL */
413 if (!inode)
414 return 0;
415 if (name->len != len)
405 return 1; 416 return 1;
406 if (memcmp(qstr->name, name->name, name->len)) 417 if (memcmp(name->name, str, len))
407 return 1; 418 return 1;
408 return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); 419 return !sysctl_is_seen(PROC_I(inode)->sysctl);
409} 420}
410 421
411static const struct dentry_operations proc_sys_dentry_operations = { 422static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index da6b01d70f01..c126c83b9a45 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
706 * skip over unmapped regions. 706 * skip over unmapped regions.
707 */ 707 */
708#define PAGEMAP_WALK_SIZE (PMD_SIZE) 708#define PAGEMAP_WALK_SIZE (PMD_SIZE)
709#define PAGEMAP_WALK_MASK (PMD_MASK)
709static ssize_t pagemap_read(struct file *file, char __user *buf, 710static ssize_t pagemap_read(struct file *file, char __user *buf,
710 size_t count, loff_t *ppos) 711 size_t count, loff_t *ppos)
711{ 712{
@@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
776 unsigned long end; 777 unsigned long end;
777 778
778 pm.pos = 0; 779 pm.pos = 0;
779 end = start_vaddr + PAGEMAP_WALK_SIZE; 780 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
780 /* overflow ? */ 781 /* overflow ? */
781 if (end < start_vaddr || end > end_vaddr) 782 if (end < start_vaddr || end > end_vaddr)
782 end = end_vaddr; 783 end = end_vaddr;
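Editor's note: the pagemap fix above clamps each walk chunk to a PMD boundary. Instead of ending the chunk at start_vaddr + PAGEMAP_WALK_SIZE, the end is rounded down with PAGEMAP_WALK_MASK, so an unaligned start can no longer produce a chunk that straddles two PMDs. A tiny arithmetic illustration; 2 MiB is only an example PMD size, the real PMD_SIZE/PMD_MASK are architecture-dependent:

/* Worked example of the PAGEMAP_WALK_MASK rounding added above. */
#include <stdio.h>

#define PMD_SIZE (2UL * 1024 * 1024)	/* illustrative only */
#define PMD_MASK (~(PMD_SIZE - 1))

int main(void)
{
	unsigned long start = 0x00301000;	/* deliberately unaligned */
	unsigned long old_end = start + PMD_SIZE;
	unsigned long new_end = (start + PMD_SIZE) & PMD_MASK;

	printf("start   = %#lx\n", start);
	printf("old end = %#lx (crosses the 0x400000 PMD boundary)\n", old_end);
	printf("new end = %#lx (clamped to the PMD boundary)\n", new_end);
	return 0;
}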
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 2367fb3f70bc..74802bc5ded9 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void)
499 /* Do some basic Verification. */ 499 /* Do some basic Verification. */
500 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || 500 if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
501 (ehdr.e_type != ET_CORE) || 501 (ehdr.e_type != ET_CORE) ||
502 !vmcore_elf_check_arch(&ehdr) || 502 !vmcore_elf64_check_arch(&ehdr) ||
503 ehdr.e_ident[EI_CLASS] != ELFCLASS64 || 503 ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
504 ehdr.e_ident[EI_VERSION] != EV_CURRENT || 504 ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
505 ehdr.e_version != EV_CURRENT || 505 ehdr.e_version != EV_CURRENT ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fcada42f1aa3..e63b4171d583 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -425,11 +425,18 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb)
425 return &ei->vfs_inode; 425 return &ei->vfs_inode;
426} 426}
427 427
428static void qnx4_destroy_inode(struct inode *inode) 428static void qnx4_i_callback(struct rcu_head *head)
429{ 429{
430 struct inode *inode = container_of(head, struct inode, i_rcu);
431 INIT_LIST_HEAD(&inode->i_dentry);
430 kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); 432 kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
431} 433}
432 434
435static void qnx4_destroy_inode(struct inode *inode)
436{
437 call_rcu(&inode->i_rcu, qnx4_i_callback);
438}
439
433static void init_once(void *foo) 440static void init_once(void *foo)
434{ 441{
435 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; 442 struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/read_write.c b/fs/read_write.c
index 431a0ed610c8..5d431bacbea9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
9#include <linux/fcntl.h> 9#include <linux/fcntl.h>
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/smp_lock.h>
13#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
14#include <linux/security.h> 13#include <linux/security.h>
15#include <linux/module.h> 14#include <linux/module.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 41656d40dc5c..0bae036831e2 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -8,7 +8,6 @@
8#include <linux/reiserfs_acl.h> 8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h> 9#include <linux/reiserfs_xattr.h>
10#include <linux/exportfs.h> 10#include <linux/exportfs.h>
11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 11#include <linux/pagemap.h>
13#include <linux/highmem.h> 12#include <linux/highmem.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index adf22b485cea..79265fdc317a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,6 @@
9#include <linux/time.h> 9#include <linux/time.h>
10#include <asm/uaccess.h> 10#include <asm/uaccess.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/smp_lock.h>
13#include <linux/compat.h> 12#include <linux/compat.h>
14 13
15/* 14/*
@@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
184 return 0; 183 return 0;
185 } 184 }
186 185
187 /* we need to make sure nobody is changing the file size beneath
188 ** us
189 */
190 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
191 depth = reiserfs_write_lock_once(inode->i_sb); 186 depth = reiserfs_write_lock_once(inode->i_sb);
192 187
188 /* we need to make sure nobody is changing the file size beneath us */
189 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
190
193 write_from = inode->i_size & (blocksize - 1); 191 write_from = inode->i_size & (blocksize - 1);
194 /* if we are on a block boundary, we are already unpacked. */ 192 /* if we are on a block boundary, we are already unpacked. */
195 if (write_from == 0) { 193 if (write_from == 0) {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b194682..d31bce1a9f90 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -43,7 +43,6 @@
43#include <linux/fcntl.h> 43#include <linux/fcntl.h>
44#include <linux/stat.h> 44#include <linux/stat.h>
45#include <linux/string.h> 45#include <linux/string.h>
46#include <linux/smp_lock.h>
47#include <linux/buffer_head.h> 46#include <linux/buffer_head.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3bf7a6457f4d..2575682a9ead 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,7 +28,6 @@
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/crc32.h> 30#include <linux/crc32.h>
31#include <linux/smp_lock.h>
32 31
33struct file_system_type reiserfs_fs_type; 32struct file_system_type reiserfs_fs_type;
34 33
@@ -530,11 +529,18 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
530 return &ei->vfs_inode; 529 return &ei->vfs_inode;
531} 530}
532 531
533static void reiserfs_destroy_inode(struct inode *inode) 532static void reiserfs_i_callback(struct rcu_head *head)
534{ 533{
534 struct inode *inode = container_of(head, struct inode, i_rcu);
535 INIT_LIST_HEAD(&inode->i_dentry);
535 kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); 536 kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
536} 537}
537 538
539static void reiserfs_destroy_inode(struct inode *inode)
540{
541 call_rcu(&inode->i_rcu, reiserfs_i_callback);
542}
543
538static void init_once(void *foo) 544static void init_once(void *foo)
539{ 545{
540 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; 546 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5d04a7828e7a..3cfb2e933644 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -870,11 +870,14 @@ out:
870 return err; 870 return err;
871} 871}
872 872
873static int reiserfs_check_acl(struct inode *inode, int mask) 873static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags)
874{ 874{
875 struct posix_acl *acl; 875 struct posix_acl *acl;
876 int error = -EAGAIN; /* do regular unix permission checks by default */ 876 int error = -EAGAIN; /* do regular unix permission checks by default */
877 877
878 if (flags & IPERM_FLAG_RCU)
879 return -ECHILD;
880
878 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 881 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
879 882
880 if (acl) { 883 if (acl) {
@@ -951,8 +954,10 @@ static int xattr_mount_check(struct super_block *s)
951 return 0; 954 return 0;
952} 955}
953 956
954int reiserfs_permission(struct inode *inode, int mask) 957int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
955{ 958{
959 if (flags & IPERM_FLAG_RCU)
960 return -ECHILD;
956 /* 961 /*
957 * We don't do permission checks on the internal objects. 962 * We don't do permission checks on the internal objects.
958 * Permissions are determined by the "owning" object. 963 * Permissions are determined by the "owning" object.
@@ -965,13 +970,16 @@ int reiserfs_permission(struct inode *inode, int mask)
965 * Stat data v1 doesn't support ACLs. 970 * Stat data v1 doesn't support ACLs.
966 */ 971 */
967 if (get_inode_sd_version(inode) != STAT_DATA_V1) 972 if (get_inode_sd_version(inode) != STAT_DATA_V1)
968 return generic_permission(inode, mask, reiserfs_check_acl); 973 return generic_permission(inode, mask, flags,
974 reiserfs_check_acl);
969#endif 975#endif
970 return generic_permission(inode, mask, NULL); 976 return generic_permission(inode, mask, flags, NULL);
971} 977}
972 978
973static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
974{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
975 return -EPERM; 983 return -EPERM;
976} 984}
977 985
@@ -990,7 +998,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
990 strlen(PRIVROOT_NAME)); 998 strlen(PRIVROOT_NAME));
991 if (!IS_ERR(dentry)) { 999 if (!IS_ERR(dentry)) {
992 REISERFS_SB(s)->priv_root = dentry; 1000 REISERFS_SB(s)->priv_root = dentry;
993 dentry->d_op = &xattr_lookup_poison_ops; 1001 d_set_d_op(dentry, &xattr_lookup_poison_ops);
994 if (dentry->d_inode) 1002 if (dentry->d_inode)
995 dentry->d_inode->i_flags |= S_PRIVATE; 1003 dentry->d_inode->i_flags |= S_PRIVATE;
996 } else 1004 } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 536d697a8a28..90d2fcb67a31 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode)
472 struct reiserfs_transaction_handle th; 472 struct reiserfs_transaction_handle th;
473 size_t size = reiserfs_xattr_nblocks(inode, 473 size_t size = reiserfs_xattr_nblocks(inode,
474 reiserfs_acl_size(clone->a_count)); 474 reiserfs_acl_size(clone->a_count));
475 reiserfs_write_lock(inode->i_sb); 475 int depth;
476
477 depth = reiserfs_write_lock_once(inode->i_sb);
476 error = journal_begin(&th, inode->i_sb, size * 2); 478 error = journal_begin(&th, inode->i_sb, size * 2);
477 if (!error) { 479 if (!error) {
478 int error2; 480 int error2;
@@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode)
482 if (error2) 484 if (error2)
483 error = error2; 485 error = error2;
484 } 486 }
485 reiserfs_write_unlock(inode->i_sb); 487 reiserfs_write_unlock_once(inode->i_sb, depth);
486 } 488 }
487 posix_acl_release(clone); 489 posix_acl_release(clone);
488 return error; 490 return error;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 6647f90e55cd..2305e3121cb1 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -400,11 +400,18 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
400/* 400/*
401 * return a spent inode to the slab cache 401 * return a spent inode to the slab cache
402 */ 402 */
403static void romfs_destroy_inode(struct inode *inode) 403static void romfs_i_callback(struct rcu_head *head)
404{ 404{
405 struct inode *inode = container_of(head, struct inode, i_rcu);
406 INIT_LIST_HEAD(&inode->i_dentry);
405 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); 407 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
406} 408}
407 409
410static void romfs_destroy_inode(struct inode *inode)
411{
412 call_rcu(&inode->i_rcu, romfs_i_callback);
413}
414
408/* 415/*
409 * get filesystem statistics 416 * get filesystem statistics
410 */ 417 */
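
romfs is one of many filesystems converted here from freeing the inode directly in ->destroy_inode to deferring the free through call_rcu(), so that lockless (RCU-walk) lookups dereferencing a cached inode pointer never see its memory recycled before a grace period has passed. Reduced to its essentials, the conversion looks like this (foofs_* names are placeholders):

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

/* Placeholder per-filesystem inode container and its slab cache. */
struct foofs_inode_info {
        struct inode vfs_inode;
};
static struct kmem_cache *foofs_inode_cachep;

static inline struct foofs_inode_info *FOOFS_I(struct inode *inode)
{
        return container_of(inode, struct foofs_inode_info, vfs_inode);
}

/*
 * Runs only after an RCU grace period, so no lockless walker can still
 * be looking at this inode.  The callbacks in this diff reinitialise
 * i_dentry before freeing; mirror that here.
 */
static void foofs_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(foofs_inode_cachep, FOOFS_I(inode));
}

/* ->destroy_inode now only queues the free; the release is deferred. */
static void foofs_destroy_inode(struct inode *inode)
{
        call_rcu(&inode->i_rcu, foofs_i_callback);
}
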
diff --git a/fs/splice.c b/fs/splice.c
index 8f1dfaecc8f0..ce2f02579e35 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1311static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1312 struct pipe_inode_info *opipe, 1312 struct pipe_inode_info *opipe,
1313 size_t len, unsigned int flags); 1313 size_t len, unsigned int flags);
1314/*
1315 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1316 * location, so checking ->i_pipe is not enough to verify that this is a
1317 * pipe.
1318 */
1319static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1320{
1321 if (S_ISFIFO(inode->i_mode))
1322 return inode->i_pipe;
1323
1324 return NULL;
1325}
1326 1314
1327/* 1315/*
1328 * Determine where to splice to/from. 1316 * Determine where to splice to/from.
@@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1336 loff_t offset, *off; 1324 loff_t offset, *off;
1337 long ret; 1325 long ret;
1338 1326
1339 ipipe = pipe_info(in->f_path.dentry->d_inode); 1327 ipipe = get_pipe_info(in);
1340 opipe = pipe_info(out->f_path.dentry->d_inode); 1328 opipe = get_pipe_info(out);
1341 1329
1342 if (ipipe && opipe) { 1330 if (ipipe && opipe) {
1343 if (off_in || off_out) 1331 if (off_in || off_out)
@@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1555 int error; 1543 int error;
1556 long ret; 1544 long ret;
1557 1545
1558 pipe = pipe_info(file->f_path.dentry->d_inode); 1546 pipe = get_pipe_info(file);
1559 if (!pipe) 1547 if (!pipe)
1560 return -EBADF; 1548 return -EBADF;
1561 1549
@@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1642 }; 1630 };
1643 long ret; 1631 long ret;
1644 1632
1645 pipe = pipe_info(file->f_path.dentry->d_inode); 1633 pipe = get_pipe_info(file);
1646 if (!pipe) 1634 if (!pipe)
1647 return -EBADF; 1635 return -EBADF;
1648 1636
@@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
2022static long do_tee(struct file *in, struct file *out, size_t len, 2010static long do_tee(struct file *in, struct file *out, size_t len,
2023 unsigned int flags) 2011 unsigned int flags)
2024{ 2012{
2025 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 2013 struct pipe_inode_info *ipipe = get_pipe_info(in);
2026 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 2014 struct pipe_inode_info *opipe = get_pipe_info(out);
2027 int ret = -EINVAL; 2015 int ret = -EINVAL;
2028 2016
2029 /* 2017 /*
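
The splice.c hunks drop the local pipe_info(inode) helper in favour of a shared get_pipe_info(file). The removed body above shows the logic that moves; presumably the shared helper wraps the same S_ISFIFO check behind a file-based interface, roughly as below (a sketch, not the verbatim header definition):

#include <linux/fs.h>
#include <linux/pipe_fs_i.h>

/*
 * Sketch of a file-based replacement for the removed pipe_info():
 * i_pipe shares storage with i_bdev/i_cdev, so the S_ISFIFO test is
 * what actually proves the inode is a pipe.
 */
static inline struct pipe_inode_info *get_pipe_info_sketch(struct file *file)
{
        struct inode *inode = file->f_path.dentry->d_inode;

        return S_ISFIFO(inode->i_mode) ? inode->i_pipe : NULL;
}
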
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 24de30ba34c1..20700b9f2b4c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -440,11 +440,18 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb)
440} 440}
441 441
442 442
443static void squashfs_destroy_inode(struct inode *inode) 443static void squashfs_i_callback(struct rcu_head *head)
444{ 444{
445 struct inode *inode = container_of(head, struct inode, i_rcu);
446 INIT_LIST_HEAD(&inode->i_dentry);
445 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); 447 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
446} 448}
447 449
450static void squashfs_destroy_inode(struct inode *inode)
451{
452 call_rcu(&inode->i_rcu, squashfs_i_callback);
453}
454
448 455
449static struct file_system_type squashfs_fs_type = { 456static struct file_system_type squashfs_fs_type = {
450 .owner = THIS_MODULE, 457 .owner = THIS_MODULE,
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9a..823e061faa87 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
30#include <linux/idr.h> 30#include <linux/idr.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h>
33#include "internal.h" 34#include "internal.h"
34 35
35 36
@@ -71,7 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
72#endif 73#endif
73 INIT_LIST_HEAD(&s->s_instances); 74 INIT_LIST_HEAD(&s->s_instances);
74 INIT_HLIST_HEAD(&s->s_anon); 75 INIT_HLIST_BL_HEAD(&s->s_anon);
75 INIT_LIST_HEAD(&s->s_inodes); 76 INIT_LIST_HEAD(&s->s_inodes);
76 INIT_LIST_HEAD(&s->s_dentry_lru); 77 INIT_LIST_HEAD(&s->s_dentry_lru);
77 init_rwsem(&s->s_umount); 78 init_rwsem(&s->s_umount);
@@ -1139,7 +1140,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1139 return mnt; 1140 return mnt;
1140 1141
1141 err: 1142 err:
1142 mntput(mnt); 1143 mntput_long(mnt);
1143 return ERR_PTR(err); 1144 return ERR_PTR(err);
1144} 1145}
1145 1146
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7e54bac8c4b0..ea9120a830d8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -231,7 +231,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
231 goto repeat; 231 goto repeat;
232} 232}
233 233
234static int sysfs_dentry_delete(struct dentry *dentry) 234static int sysfs_dentry_delete(const struct dentry *dentry)
235{ 235{
236 struct sysfs_dirent *sd = dentry->d_fsdata; 236 struct sysfs_dirent *sd = dentry->d_fsdata;
237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED); 237 return !!(sd->s_flags & SYSFS_FLAG_REMOVED);
@@ -239,9 +239,13 @@ static int sysfs_dentry_delete(struct dentry *dentry)
239 239
240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) 240static int sysfs_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
241{ 241{
242 struct sysfs_dirent *sd = dentry->d_fsdata; 242 struct sysfs_dirent *sd;
243 int is_dir; 243 int is_dir;
244 244
245 if (nd->flags & LOOKUP_RCU)
246 return -ECHILD;
247
248 sd = dentry->d_fsdata;
245 mutex_lock(&sysfs_mutex); 249 mutex_lock(&sysfs_mutex);
246 250
247 /* The sysfs dirent has been deleted */ 251 /* The sysfs dirent has been deleted */
@@ -701,7 +705,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
701 /* instantiate and hash dentry */ 705 /* instantiate and hash dentry */
702 ret = d_find_alias(inode); 706 ret = d_find_alias(inode);
703 if (!ret) { 707 if (!ret) {
704 dentry->d_op = &sysfs_dentry_ops; 708 d_set_d_op(dentry, &sysfs_dentry_ops);
705 dentry->d_fsdata = sysfs_get(sd); 709 dentry->d_fsdata = sysfs_get(sd);
706 d_add(dentry, inode); 710 d_add(dentry, inode);
707 } else { 711 } else {
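
Like the other filesystems touched by this patch, sysfs now installs dentry operations through d_set_d_op() rather than assigning ->d_op directly. Given the RCU-walk work elsewhere in the series, the accessor most likely exists so the dcache can note which operations a dentry actually provides and let the lockless lookup fast path test cached flags instead of chasing the d_op pointer. A speculative sketch of that idea (the flag handling is illustrative and may not match fs/dcache.c):

#include <linux/dcache.h>

/*
 * Speculative sketch: record "which ops does this dentry have?" in
 * d_flags so hot paths can branch on a flag test.  The real helper in
 * fs/dcache.c may do more (and may warn on re-setting ->d_op).
 */
static void d_set_d_op_sketch(struct dentry *dentry,
                              const struct dentry_operations *op)
{
        dentry->d_op = op;
        if (!op)
                return;
        if (op->d_hash)
                dentry->d_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                dentry->d_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                dentry->d_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_delete)
                dentry->d_flags |= DCACHE_OP_DELETE;
}
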
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index cffb1fd8ba33..30ac27345586 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -348,13 +348,18 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
348 return -ENOENT; 348 return -ENOENT;
349} 349}
350 350
351int sysfs_permission(struct inode *inode, int mask) 351int sysfs_permission(struct inode *inode, int mask, unsigned int flags)
352{ 352{
353 struct sysfs_dirent *sd = inode->i_private; 353 struct sysfs_dirent *sd;
354
355 if (flags & IPERM_FLAG_RCU)
356 return -ECHILD;
357
358 sd = inode->i_private;
354 359
355 mutex_lock(&sysfs_mutex); 360 mutex_lock(&sysfs_mutex);
356 sysfs_refresh_inode(sd, inode); 361 sysfs_refresh_inode(sd, inode);
357 mutex_unlock(&sysfs_mutex); 362 mutex_unlock(&sysfs_mutex);
358 363
359 return generic_permission(inode, mask, NULL); 364 return generic_permission(inode, mask, flags, NULL);
360} 365}
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d9be60a2e956..ffaaa816bfba 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -200,7 +200,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
201void sysfs_evict_inode(struct inode *inode); 201void sysfs_evict_inode(struct inode *inode);
202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
203int sysfs_permission(struct inode *inode, int mask); 203int sysfs_permission(struct inode *inode, int mask, unsigned int flags);
204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index de44d067b9e6..0630eb969a28 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -333,11 +333,18 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
333 return &si->vfs_inode; 333 return &si->vfs_inode;
334} 334}
335 335
336static void sysv_destroy_inode(struct inode *inode) 336static void sysv_i_callback(struct rcu_head *head)
337{ 337{
338 struct inode *inode = container_of(head, struct inode, i_rcu);
339 INIT_LIST_HEAD(&inode->i_dentry);
338 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); 340 kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
339} 341}
340 342
343static void sysv_destroy_inode(struct inode *inode)
344{
345 call_rcu(&inode->i_rcu, sysv_i_callback);
346}
347
341static void init_once(void *p) 348static void init_once(void *p)
342{ 349{
343 struct sysv_inode_info *si = (struct sysv_inode_info *)p; 350 struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e7f7d11cd0..b5e68da2db32 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,7 +27,8 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(struct dentry *dentry, struct qstr *qstr) 30static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
31 struct qstr *qstr)
31{ 32{
32 /* Truncate the name in place, avoids having to define a compare 33 /* Truncate the name in place, avoids having to define a compare
33 function. */ 34 function. */
@@ -47,7 +48,7 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
47 struct inode * inode = NULL; 48 struct inode * inode = NULL;
48 ino_t ino; 49 ino_t ino;
49 50
50 dentry->d_op = dir->i_sb->s_root->d_op; 51 d_set_d_op(dentry, dir->i_sb->s_root->d_op);
51 if (dentry->d_name.len > SYSV_NAMELEN) 52 if (dentry->d_name.len > SYSV_NAMELEN)
52 return ERR_PTR(-ENAMETOOLONG); 53 return ERR_PTR(-ENAMETOOLONG);
53 ino = sysv_inode_by_name(dentry); 54 ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3d9c62be0c10..76712aefc4ab 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -346,7 +346,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
346 if (sbi->s_forced_ro) 346 if (sbi->s_forced_ro)
347 sb->s_flags |= MS_RDONLY; 347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate) 348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations; 349 d_set_d_op(sb->s_root, &sysv_dentry_operations);
350 return 1; 350 return 1;
351} 351}
352 352
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e3..6e11c2975dcf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -272,12 +272,20 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
272 return &ui->vfs_inode; 272 return &ui->vfs_inode;
273}; 273};
274 274
275static void ubifs_i_callback(struct rcu_head *head)
276{
277 struct inode *inode = container_of(head, struct inode, i_rcu);
278 struct ubifs_inode *ui = ubifs_inode(inode);
279 INIT_LIST_HEAD(&inode->i_dentry);
280 kmem_cache_free(ubifs_inode_slab, ui);
281}
282
275static void ubifs_destroy_inode(struct inode *inode) 283static void ubifs_destroy_inode(struct inode *inode)
276{ 284{
277 struct ubifs_inode *ui = ubifs_inode(inode); 285 struct ubifs_inode *ui = ubifs_inode(inode);
278 286
279 kfree(ui->data); 287 kfree(ui->data);
280 kmem_cache_free(ubifs_inode_slab, inode); 288 call_rcu(&inode->i_rcu, ubifs_i_callback);
281} 289}
282 290
283/* 291/*
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4a5c7c61836a..b539d53320fb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -139,11 +139,18 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
139 return &ei->vfs_inode; 139 return &ei->vfs_inode;
140} 140}
141 141
142static void udf_destroy_inode(struct inode *inode) 142static void udf_i_callback(struct rcu_head *head)
143{ 143{
144 struct inode *inode = container_of(head, struct inode, i_rcu);
145 INIT_LIST_HEAD(&inode->i_dentry);
144 kmem_cache_free(udf_inode_cachep, UDF_I(inode)); 146 kmem_cache_free(udf_inode_cachep, UDF_I(inode));
145} 147}
146 148
149static void udf_destroy_inode(struct inode *inode)
150{
151 call_rcu(&inode->i_rcu, udf_i_callback);
152}
153
147static void init_once(void *foo) 154static void init_once(void *foo)
148{ 155{
149 struct udf_inode_info *ei = (struct udf_inode_info *)foo; 156 struct udf_inode_info *ei = (struct udf_inode_info *)foo;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c47daed56da..2c61ac5d4e48 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1412,11 +1412,18 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
1412 return &ei->vfs_inode; 1412 return &ei->vfs_inode;
1413} 1413}
1414 1414
1415static void ufs_destroy_inode(struct inode *inode) 1415static void ufs_i_callback(struct rcu_head *head)
1416{ 1416{
1417 struct inode *inode = container_of(head, struct inode, i_rcu);
1418 INIT_LIST_HEAD(&inode->i_dentry);
1417 kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); 1419 kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
1418} 1420}
1419 1421
1422static void ufs_destroy_inode(struct inode *inode)
1423{
1424 call_rcu(&inode->i_rcu, ufs_i_callback);
1425}
1426
1420static void init_once(void *foo) 1427static void init_once(void *foo)
1421{ 1428{
1422 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; 1429 struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c9af48fffcd7..691f61223ed6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -934,7 +934,6 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 934 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 935 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 936 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 937
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 938 if (!xfs_is_delayed_page(page, IO_DELAY))
940 goto out_invalidate; 939 goto out_invalidate;
@@ -949,58 +948,14 @@ xfs_aops_discard_page(
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 948 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 949 bh = head = page_buffers(page);
951 do { 950 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 951 int error;
957 xfs_fsblock_t firstblock; 952 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 953
960 if (!buffer_delay(bh)) 954 if (!buffer_delay(bh))
961 goto next_buffer; 955 goto next_buffer;
962 956
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 957 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 958 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 959 if (error) {
1005 /* something screwed, just bail */ 960 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 961 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -1010,7 +965,7 @@ xfs_aops_discard_page(
1010 break; 965 break;
1011 } 966 }
1012next_buffer: 967next_buffer:
1013 offset += len; 968 offset += 1 << inode->i_blkbits;
1014 969
1015 } while ((bh = bh->b_this_page) != head); 970 } while ((bh = bh->b_this_page) != head);
1016 971
@@ -1111,11 +1066,12 @@ xfs_vm_writepage(
1111 uptodate = 0; 1066 uptodate = 0;
1112 1067
1113 /* 1068 /*
1114 * A hole may still be marked uptodate because discard_buffer 1069 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 1070 * of their state. The dirty state however is entirely
1071 * meaningless for holes (!mapped && uptodate), so skip
1072 * buffers covering holes here.
1116 */ 1073 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 1074 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 1075 imap_valid = 0;
1120 continue; 1076 continue;
1121 } 1077 }
@@ -1504,11 +1460,42 @@ xfs_vm_write_failed(
1504 struct inode *inode = mapping->host; 1460 struct inode *inode = mapping->host;
1505 1461
1506 if (to > inode->i_size) { 1462 if (to > inode->i_size) {
1507 struct iattr ia = { 1463 /*
1508 .ia_valid = ATTR_SIZE | ATTR_FORCE, 1464 * punch out the delalloc blocks we have already allocated. We
1509 .ia_size = inode->i_size, 1465 * don't call xfs_setattr() to do this as we may be in the
1510 }; 1466 * middle of a multi-iovec write and so the vfs inode->i_size
1511 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); 1467 * will not match the xfs ip->i_size and so it will zero too
1468 * much. Hence we jus truncate the page cache to zero what is
1469 * necessary and punch the delalloc blocks directly.
1470 */
1471 struct xfs_inode *ip = XFS_I(inode);
1472 xfs_fileoff_t start_fsb;
1473 xfs_fileoff_t end_fsb;
1474 int error;
1475
1476 truncate_pagecache(inode, to, inode->i_size);
1477
1478 /*
1479 * Check if there are any blocks that are outside of i_size
1480 * that need to be trimmed back.
1481 */
1482 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
1483 end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
1484 if (end_fsb <= start_fsb)
1485 return;
1486
1487 xfs_ilock(ip, XFS_ILOCK_EXCL);
1488 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1489 end_fsb - start_fsb);
1490 if (error) {
1491 /* something screwed, just bail */
1492 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1493 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1494 "xfs_vm_write_failed: unable to clean up ino %lld",
1495 ip->i_ino);
1496 }
1497 }
1498 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1512 } 1499 }
1513} 1500}
1514 1501
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 63fd2c07cb57..4c5deb6e9e31 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -488,29 +488,16 @@ found:
488 spin_unlock(&pag->pag_buf_lock); 488 spin_unlock(&pag->pag_buf_lock);
489 xfs_perag_put(pag); 489 xfs_perag_put(pag);
490 490
491 /* Attempt to get the semaphore without sleeping, 491 if (xfs_buf_cond_lock(bp)) {
492 * if this does not work then we need to drop the 492 /* failed, so wait for the lock if requested. */
493 * spinlock and do a hard attempt on the semaphore.
494 */
495 if (down_trylock(&bp->b_sema)) {
496 if (!(flags & XBF_TRYLOCK)) { 493 if (!(flags & XBF_TRYLOCK)) {
497 /* wait for buffer ownership */
498 xfs_buf_lock(bp); 494 xfs_buf_lock(bp);
499 XFS_STATS_INC(xb_get_locked_waited); 495 XFS_STATS_INC(xb_get_locked_waited);
500 } else { 496 } else {
501 /* We asked for a trylock and failed, no need
502 * to look at file offset and length here, we
503 * know that this buffer at least overlaps our
504 * buffer and is locked, therefore our buffer
505 * either does not exist, or is this buffer.
506 */
507 xfs_buf_rele(bp); 497 xfs_buf_rele(bp);
508 XFS_STATS_INC(xb_busy_locked); 498 XFS_STATS_INC(xb_busy_locked);
509 return NULL; 499 return NULL;
510 } 500 }
511 } else {
512 /* trylock worked */
513 XB_SET_OWNER(bp);
514 } 501 }
515 502
516 if (bp->b_flags & XBF_STALE) { 503 if (bp->b_flags & XBF_STALE) {
@@ -876,10 +863,18 @@ xfs_buf_rele(
876 */ 863 */
877 864
878/* 865/*
879 * Locks a buffer object, if it is not already locked. 866 * Locks a buffer object, if it is not already locked. Note that this in
880 * Note that this in no way locks the underlying pages, so it is only 867 * no way locks the underlying pages, so it is only useful for
881 * useful for synchronizing concurrent use of buffer objects, not for 868 * synchronizing concurrent use of buffer objects, not for synchronizing
882 * synchronizing independent access to the underlying pages. 869 * independent access to the underlying pages.
870 *
871 * If we come across a stale, pinned, locked buffer, we know that we are
872 * being asked to lock a buffer that has been reallocated. Because it is
873 * pinned, we know that the log has not been pushed to disk and hence it
874 * will still be locked. Rather than continuing to have trylock attempts
875 * fail until someone else pushes the log, push it ourselves before
876 * returning. This means that the xfsaild will not get stuck trying
877 * to push on stale inode buffers.
883 */ 878 */
884int 879int
885xfs_buf_cond_lock( 880xfs_buf_cond_lock(
@@ -890,6 +885,8 @@ xfs_buf_cond_lock(
890 locked = down_trylock(&bp->b_sema) == 0; 885 locked = down_trylock(&bp->b_sema) == 0;
891 if (locked) 886 if (locked)
892 XB_SET_OWNER(bp); 887 XB_SET_OWNER(bp);
888 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
889 xfs_log_force(bp->b_target->bt_mount, 0);
893 890
894 trace_xfs_buf_cond_lock(bp, _RET_IP_); 891 trace_xfs_buf_cond_lock(bp, _RET_IP_);
895 return locked ? 0 : -EBUSY; 892 return locked ? 0 : -EBUSY;
@@ -1781,7 +1778,6 @@ xfs_buf_delwri_split(
1781 INIT_LIST_HEAD(list); 1778 INIT_LIST_HEAD(list);
1782 spin_lock(dwlk); 1779 spin_lock(dwlk);
1783 list_for_each_entry_safe(bp, n, dwq, b_list) { 1780 list_for_each_entry_safe(bp, n, dwq, b_list) {
1784 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1785 ASSERT(bp->b_flags & XBF_DELWRI); 1781 ASSERT(bp->b_flags & XBF_DELWRI);
1786 1782
1787 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1783 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1795,6 +1791,7 @@ xfs_buf_delwri_split(
1795 _XBF_RUN_QUEUES); 1791 _XBF_RUN_QUEUES);
1796 bp->b_flags |= XBF_WRITE; 1792 bp->b_flags |= XBF_WRITE;
1797 list_move_tail(&bp->b_list, list); 1793 list_move_tail(&bp->b_list, list);
1794 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1798 } else 1795 } else
1799 skipped++; 1796 skipped++;
1800 } 1797 }
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 2ea238f6d38e..ad442d9e392e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -416,7 +416,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 416 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 417 return PTR_ERR(dentry);
418 418
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 419 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 420 if (!kbuf)
421 goto out_dput; 421 goto out_dput;
422 422
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 96107efc0c61..94d5fd6a2973 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -762,7 +762,8 @@ xfs_setup_inode(
762 inode->i_state = I_NEW; 762 inode->i_state = I_NEW;
763 763
764 inode_sb_list_add(inode); 764 inode_sb_list_add(inode);
765 insert_inode_hash(inode); 765 /* make the inode look hashed for the writeback code */
766 hlist_add_fake(&inode->i_hash);
766 767
767 inode->i_mode = ip->i_d.di_mode; 768 inode->i_mode = ip->i_d.di_mode;
768 inode->i_nlink = ip->i_d.di_nlink; 769 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae4..064f964d4f3c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -353,9 +353,6 @@ xfs_parseargs(
353 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 353 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 354 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 355 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 cmn_err(CE_WARN,
357 "Enabling EXPERIMENTAL delayed logging feature "
358 "- use at your own risk.\n");
359 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
360 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
361 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 37d33254981d..afb0d7cfad1c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -853,6 +853,7 @@ restart:
853 if (trylock) { 853 if (trylock) {
854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 854 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
855 skipped++; 855 skipped++;
856 xfs_perag_put(pag);
856 continue; 857 continue;
857 } 858 }
858 first_index = pag->pag_ici_reclaim_cursor; 859 first_index = pag->pag_ici_reclaim_cursor;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
44#ifdef CONFIG_XFS_POSIX_ACL 44#ifdef CONFIG_XFS_POSIX_ACL
45extern int xfs_check_acl(struct inode *inode, int mask); 45extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
48extern int xfs_acl_chmod(struct inode *inode); 48extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8abd12e32e13..4111cd3966c7 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5471,8 +5471,13 @@ xfs_getbmap(
5471 if (error) 5471 if (error)
5472 goto out_unlock_iolock; 5472 goto out_unlock_iolock;
5473 } 5473 }
5474 5474 /*
5475 ASSERT(ip->i_delayed_blks == 0); 5475 * even after flushing the inode, there can still be delalloc
5476 * blocks on the inode beyond EOF due to speculative
5477 * preallocation. These are not removed until the release
5478 * function is called or the inode is inactivated. Hence we
5479 * cannot assert here that ip->i_delayed_blks == 0.
5480 */
5476 } 5481 }
5477 5482
5478 lock = xfs_ilock_map_shared(ip); 5483 lock = xfs_ilock_map_shared(ip);
@@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
6070 *count += xfs_bmbt_disk_get_blockcount(frp); 6075 *count += xfs_bmbt_disk_get_blockcount(frp);
6071 } 6076 }
6072} 6077}
6078
6079/*
 6080 * dead simple method of punching delayed allocation blocks from a range in
6081 * the inode. Walks a block at a time so will be slow, but is only executed in
 6082 * rare error cases so the overhead is not critical. This will always punch out
6083 * both the start and end blocks, even if the ranges only partially overlap
6084 * them, so it is up to the caller to ensure that partial blocks are not
6085 * passed in.
6086 */
6087int
6088xfs_bmap_punch_delalloc_range(
6089 struct xfs_inode *ip,
6090 xfs_fileoff_t start_fsb,
6091 xfs_fileoff_t length)
6092{
6093 xfs_fileoff_t remaining = length;
6094 int error = 0;
6095
6096 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6097
6098 do {
6099 int done;
6100 xfs_bmbt_irec_t imap;
6101 int nimaps = 1;
6102 xfs_fsblock_t firstblock;
6103 xfs_bmap_free_t flist;
6104
6105 /*
6106 * Map the range first and check that it is a delalloc extent
6107 * before trying to unmap the range. Otherwise we will be
6108 * trying to remove a real extent (which requires a
6109 * transaction) or a hole, which is probably a bad idea...
6110 */
6111 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6112 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6113 &nimaps, NULL);
6114
6115 if (error) {
6116 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb);
6121 }
6122 break;
6123 }
6124 if (!nimaps) {
6125 /* nothing there */
6126 goto next_block;
6127 }
6128 if (imap.br_startblock != DELAYSTARTBLOCK) {
6129 /* been converted, ignore */
6130 goto next_block;
6131 }
6132 WARN_ON(imap.br_blockcount == 0);
6133
6134 /*
6135 * Note: while we initialise the firstblock/flist pair, they
6136 * should never be used because blocks should never be
6137 * allocated or freed for a delalloc extent and hence we need
6138 * don't cancel or finish them after the xfs_bunmapi() call.
6139 */
6140 xfs_bmap_init(&flist, &firstblock);
6141 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6142 &flist, &done);
6143 if (error)
6144 break;
6145
6146 ASSERT(!flist.xbf_count && !flist.xbf_first);
6147next_block:
6148 start_fsb++;
6149 remaining--;
 6150 } while (remaining > 0);
6151
6152 return error;
6153}
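
Both callers of the new helper added earlier in this diff, xfs_aops_discard_page() and xfs_vm_write_failed(), drive it the same way. A condensed sketch of that calling convention (error handling trimmed; note that the helper punches whole filesystem blocks, so partially covered blocks at either end are discarded too):

/* Sketch against the XFS internals used in this diff; not a drop-in. */
static void punch_delalloc_sketch(struct xfs_inode *ip, xfs_off_t start,
                                  xfs_off_t end)
{
        xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(ip->i_mount, start);
        xfs_fileoff_t end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
        int error;

        if (end_fsb <= start_fsb)
                return;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                                              end_fsb - start_fsb);
        if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
                xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
                        "failed to punch delalloc range on ino %lld",
                        ip->i_ino);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
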
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 71ec9b6ecdfc..3651191daea1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
394 int whichfork, 394 int whichfork,
395 int *count); 395 int *count);
396 396
397int
398xfs_bmap_punch_delalloc_range(
399 struct xfs_inode *ip,
400 xfs_fileoff_t start_fsb,
401 xfs_fileoff_t length);
397#endif /* __KERNEL__ */ 402#endif /* __KERNEL__ */
398 403
399#endif /* __XFS_BMAP_H__ */ 404#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a22..e60490bc00a6 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -377,6 +377,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 378 tip->i_d.di_format = tmp;
379 379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that when the fork is truncated away when the
386 * temporary inode is unlinked we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
380 ilf_fields = XFS_ILOG_CORE; 393 ilf_fields = XFS_ILOG_CORE;
381 394
382 switch(ip->i_d.di_format) { 395 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed9990267661..c78cc6a3d87c 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
@@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb82..f338847f80b8 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
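
The xfs_error.h change puts a global xfs_error_test_active counter in front of the per-tag lookup, presumably so that DEBUG kernels with no error injection configured pay for one global load per XFS_TEST_ERROR() site instead of a function call. The pattern in isolation, with placeholder names:

/*
 * Cheap guard in front of an expensive test: the counter is non-zero
 * only while at least one injection tag is registered, so the table
 * walk is skipped entirely in the common case.
 */
static int error_test_active;           /* bumped on add, dropped on clear */

static int expensive_error_lookup(int tag)
{
        /* walk the table of registered tags, match the filesystem, ... */
        return 0;
}

#define TEST_ERROR(expr, tag) \
        ((expr) || (error_test_active && expensive_error_lookup(tag)))

/* Example expansion site: only evaluates the lookup when armed. */
static int sketch_io_error(int hw_failed)
{
        return TEST_ERROR(hw_failed, 1);
}
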
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8e..d7de5a3f7867 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,6 +91,17 @@ xfs_inode_alloc(
91 return ip; 91 return ip;
92} 92}
93 93
94STATIC void
95xfs_inode_free_callback(
96 struct rcu_head *head)
97{
98 struct inode *inode = container_of(head, struct inode, i_rcu);
99 struct xfs_inode *ip = XFS_I(inode);
100
101 INIT_LIST_HEAD(&inode->i_dentry);
102 kmem_zone_free(xfs_inode_zone, ip);
103}
104
94void 105void
95xfs_inode_free( 106xfs_inode_free(
96 struct xfs_inode *ip) 107 struct xfs_inode *ip)
@@ -134,7 +145,7 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 145 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 146 ASSERT(completion_done(&ip->i_flush));
136 147
137 kmem_zone_free(xfs_inode_zone, ip); 148 call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback);
138} 149}
139 150
140/* 151/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index c7ac020705df..7c8d30c453c3 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -657,18 +657,37 @@ xfs_inode_item_unlock(
657} 657}
658 658
659/* 659/*
660 * This is called to find out where the oldest active copy of the 660 * This is called to find out where the oldest active copy of the inode log
661 * inode log item in the on disk log resides now that the last log 661 * item in the on disk log resides now that the last log write of it completed
662 * write of it completed at the given lsn. Since we always re-log 662 * at the given lsn. Since we always re-log all dirty data in an inode, the
663 * all dirty data in an inode, the latest copy in the on disk log 663 * latest copy in the on disk log is the only one that matters. Therefore,
664 * is the only one that matters. Therefore, simply return the 664 * simply return the given lsn.
665 * given lsn. 665 *
666 * If the inode has been marked stale because the cluster is being freed, we
667 * don't want to (re-)insert this inode into the AIL. There is a race condition
668 * where the cluster buffer may be unpinned before the inode is inserted into
669 * the AIL during transaction committed processing. If the buffer is unpinned
670 * before the inode item has been committed and inserted, then it is possible
 671 * for the buffer to be written and IO completion to run before the inode is inserted
672 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
673 * AIL which will never get removed. It will, however, get reclaimed which
 674 * triggers an assert in xfs_inode_free() complaining about freeing an inode
675 * still in the AIL.
676 *
677 * To avoid this, return a lower LSN than the one passed in so that the
678 * transaction committed code will not move the inode forward in the AIL but
679 * will still unpin it properly.
666 */ 680 */
667STATIC xfs_lsn_t 681STATIC xfs_lsn_t
668xfs_inode_item_committed( 682xfs_inode_item_committed(
669 struct xfs_log_item *lip, 683 struct xfs_log_item *lip,
670 xfs_lsn_t lsn) 684 xfs_lsn_t lsn)
671{ 685{
686 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
687 struct xfs_inode *ip = iip->ili_inode;
688
689 if (xfs_iflags_test(ip, XFS_ISTALE))
690 return lsn - 1;
672 return lsn; 691 return lsn;
673} 692}
674 693
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b1498ab5a399..19e9dfa1c254 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -275,6 +275,7 @@ xfs_free_perag(
275 pag = radix_tree_delete(&mp->m_perag_tree, agno); 275 pag = radix_tree_delete(&mp->m_perag_tree, agno);
276 spin_unlock(&mp->m_perag_lock); 276 spin_unlock(&mp->m_perag_lock);
277 ASSERT(pag); 277 ASSERT(pag);
278 ASSERT(atomic_read(&pag->pag_ref) == 0);
278 call_rcu(&pag->rcu_head, __xfs_free_perag); 279 call_rcu(&pag->rcu_head, __xfs_free_perag);
279 } 280 }
280} 281}
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..edfa178bafb6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..9bb6eda4cd21 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
356 struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
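
Several of the !CONFIG_XFS_QUOTA stubs above change from object-like macros returning (0) to static inline functions. The usual motivation for this conversion (not spelled out in the patch) is that an inline stub type-checks its arguments and counts them as used, so callers build cleanly with identical code whether or not the feature is configured. In miniature, with made-up names:

/* Stand-in types for illustration only. */
struct foo_mount;
struct foo_inode;

#ifdef CONFIG_FOO_FEATURE
int foo_reserve_blocks(struct foo_mount *mp, struct foo_inode *ip,
                       long nblks, unsigned int flags);
#else
/* Inline stub: same prototype, no side effects, arguments type-checked. */
static inline int foo_reserve_blocks(struct foo_mount *mp,
                                     struct foo_inode *ip,
                                     long nblks, unsigned int flags)
{
        return 0;
}
#endif
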
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a6..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
297 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
298 */ 298 */
299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
300 301
301 /* 302 /*
302 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when