77 files changed, 2985 insertions, 2078 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf61631..51545529637 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
 #include <linux/posix_acl_xattr.h>
 #include "xattr.h"
 #include "acl.h"
-#include "v9fs_vfs.h"
 #include "v9fs.h"
+#include "v9fs_vfs.h"
 static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
        struct v9fs_session_info *v9ses;
        v9ses = v9fs_inode2v9ses(inode);
-        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+        if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+                        ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
                set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
                set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
                return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
        if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
                set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
                set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
-                posix_acl_release(dacl);
-                posix_acl_release(pacl);
        } else
                retval = -EIO;
+        if (!IS_ERR(dacl))
+                posix_acl_release(dacl);
+        if (!IS_ERR(pacl))
+                posix_acl_release(pacl);
        return retval;
 }
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
                return -ECHILD;
        v9ses = v9fs_inode2v9ses(inode);
-        if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+        if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+                        ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
                /*
-                 * On access = client mode get the acl
+                 * Only with access = client and acl = on do we get the acl
                 * values from the server
                 */
                return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
        struct inode *inode = dentry->d_inode;
        set_cached_acl(inode, type, acl);
+        if (!acl)
+                return 0;
        /* Set a setxattr request to server */
        size = posix_acl_xattr_size(acl->a_count);
        buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
 int v9fs_set_create_acl(struct dentry *dentry,
                        struct posix_acl *dpacl, struct posix_acl *pacl)
 {
-        if (dpacl)
+        v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
-                v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+        v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
-        if (pacl)
-                v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
        posix_acl_release(dpacl);
        posix_acl_release(pacl);
        return 0;
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac..5b335c5086a 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
 #define CACHETAG_LEN  11
-struct kmem_cache *vcookie_cache;
 struct fscache_netfs v9fs_cache_netfs = {
        .name           = "9p",
        .version        = 0,
 };
-static void init_once(void *foo)
-{
-        struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
-        vcookie->fscache = NULL;
-        vcookie->qid = NULL;
-        inode_init_once(&vcookie->inode);
-}
-/**
- * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
- *                          vcookie to inode mapping
- *
- * Returns 0 on success.
- */
-static int v9fs_init_vcookiecache(void)
-{
-        vcookie_cache = kmem_cache_create("vcookie_cache",
-                                          sizeof(struct v9fs_cookie),
-                                          0, (SLAB_RECLAIM_ACCOUNT|
-                                              SLAB_MEM_SPREAD),
-                                          init_once);
-        if (!vcookie_cache)
-                return -ENOMEM;
-        return 0;
-}
-/**
- * v9fs_destroy_vcookiecache - destroy the cache of vcookies
- *
- */
-static void v9fs_destroy_vcookiecache(void)
-{
-        kmem_cache_destroy(vcookie_cache);
-}
-int __v9fs_cache_register(void)
-{
-        int ret;
-        ret = v9fs_init_vcookiecache();
-        if (ret < 0)
-                return ret;
-        return fscache_register_netfs(&v9fs_cache_netfs);
-}
-void __v9fs_cache_unregister(void)
-{
-        v9fs_destroy_vcookiecache();
-        fscache_unregister_netfs(&v9fs_cache_netfs);
-}
 /**
 * v9fs_random_cachetag - Generate a random tag to be associated
 *                        with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 }
 const struct fscache_cookie_def v9fs_cache_session_index_def = {
-        .name           = "9P.session",
+        .name           = "9P.session",
-        .type           = FSCACHE_COOKIE_TYPE_INDEX,
+        .type           = FSCACHE_COOKIE_TYPE_INDEX,
-        .get_key        = v9fs_cache_session_get_key,
+        .get_key        = v9fs_cache_session_get_key,
 };
 void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
 static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
                                         void *buffer, uint16_t bufmax)
 {
-        const struct v9fs_cookie *vcookie = cookie_netfs_data;
+        const struct v9fs_inode *v9inode = cookie_netfs_data;
-        memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+        memcpy(buffer, &v9inode->fscache_key->path,
+               sizeof(v9inode->fscache_key->path));
-        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
-                   vcookie->qid->path);
+                   v9inode->fscache_key->path);
-        return sizeof(vcookie->qid->path);
+        return sizeof(v9inode->fscache_key->path);
 }
 static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
                                      uint64_t *size)
 {
-        const struct v9fs_cookie *vcookie = cookie_netfs_data;
+        const struct v9fs_inode *v9inode = cookie_netfs_data;
-        *size = i_size_read(&vcookie->inode);
+        *size = i_size_read(&v9inode->vfs_inode);
-        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
                   *size);
 }
 static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
                                         void *buffer, uint16_t buflen)
 {
-        const struct v9fs_cookie *vcookie = cookie_netfs_data;
+        const struct v9fs_inode *v9inode = cookie_netfs_data;
-        memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+        memcpy(buffer, &v9inode->fscache_key->version,
+               sizeof(v9inode->fscache_key->version));
-        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
-                   vcookie->qid->version);
+                   v9inode->fscache_key->version);
-        return sizeof(vcookie->qid->version);
+        return sizeof(v9inode->fscache_key->version);
 }
 static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
                                            const void *buffer,
                                            uint16_t buflen)
 {
-        const struct v9fs_cookie *vcookie = cookie_netfs_data;
+        const struct v9fs_inode *v9inode = cookie_netfs_data;
-        if (buflen != sizeof(vcookie->qid->version))
+        if (buflen != sizeof(v9inode->fscache_key->version))
                return FSCACHE_CHECKAUX_OBSOLETE;
-        if (memcmp(buffer, &vcookie->qid->version,
+        if (memcmp(buffer, &v9inode->fscache_key->version,
-                   sizeof(vcookie->qid->version)))
+                   sizeof(v9inode->fscache_key->version)))
                return FSCACHE_CHECKAUX_OBSOLETE;
        return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
 static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
 {
-        struct v9fs_cookie *vcookie = cookie_netfs_data;
+        struct v9fs_inode *v9inode = cookie_netfs_data;
        struct pagevec pvec;
        pgoff_t first;
        int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
        first = 0;
        for (;;) {
-                nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+                nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
                                          first,
                                          PAGEVEC_SIZE - pagevec_count(&pvec));
                if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
 void v9fs_cache_inode_get_cookie(struct inode *inode)
 {
-        struct v9fs_cookie *vcookie;
+        struct v9fs_inode *v9inode;
        struct v9fs_session_info *v9ses;
        if (!S_ISREG(inode->i_mode))
                return;
-        vcookie = v9fs_inode2cookie(inode);
+        v9inode = V9FS_I(inode);
-        if (vcookie->fscache)
+        if (v9inode->fscache)
                return;
        v9ses = v9fs_inode2v9ses(inode);
-        vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+        v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
                                                  &v9fs_cache_inode_index_def,
-                                                  vcookie);
+                                                  v9inode);
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
-                   vcookie->fscache);
+                   v9inode->fscache);
 }
 void v9fs_cache_inode_put_cookie(struct inode *inode)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return;
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
-                   vcookie->fscache);
+                   v9inode->fscache);
-        fscache_relinquish_cookie(vcookie->fscache, 0);
+        fscache_relinquish_cookie(v9inode->fscache, 0);
-        vcookie->fscache = NULL;
+        v9inode->fscache = NULL;
 }
 void v9fs_cache_inode_flush_cookie(struct inode *inode)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return;
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
-                   vcookie->fscache);
+                   v9inode->fscache);
-        fscache_relinquish_cookie(vcookie->fscache, 1);
+        fscache_relinquish_cookie(v9inode->fscache, 1);
-        vcookie->fscache = NULL;
+        v9inode->fscache = NULL;
 }
 void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
        struct p9_fid *fid;
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return;
-        spin_lock(&vcookie->lock);
+        spin_lock(&v9inode->fscache_lock);
        fid = filp->private_data;
        if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
                v9fs_cache_inode_flush_cookie(inode);
        else
                v9fs_cache_inode_get_cookie(inode);
-        spin_unlock(&vcookie->lock);
+        spin_unlock(&v9inode->fscache_lock);
 }
 void v9fs_cache_inode_reset_cookie(struct inode *inode)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
        struct v9fs_session_info *v9ses;
        struct fscache_cookie *old;
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return;
-        old = vcookie->fscache;
+        old = v9inode->fscache;
-        spin_lock(&vcookie->lock);
+        spin_lock(&v9inode->fscache_lock);
-        fscache_relinquish_cookie(vcookie->fscache, 1);
+        fscache_relinquish_cookie(v9inode->fscache, 1);
        v9ses = v9fs_inode2v9ses(inode);
-        vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+        v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
                                                  &v9fs_cache_inode_index_def,
-                                                  vcookie);
+                                                  v9inode);
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
-                   inode, old, vcookie->fscache);
+                   inode, old, v9inode->fscache);
-        spin_unlock(&vcookie->lock);
+        spin_unlock(&v9inode->fscache_lock);
 }
 int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
 {
        struct inode *inode = page->mapping->host;
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        BUG_ON(!vcookie->fscache);
+        BUG_ON(!v9inode->fscache);
-        return fscache_maybe_release_page(vcookie->fscache, page, gfp);
+        return fscache_maybe_release_page(v9inode->fscache, page, gfp);
 }
 void __v9fs_fscache_invalidate_page(struct page *page)
 {
        struct inode *inode = page->mapping->host;
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        BUG_ON(!vcookie->fscache);
+        BUG_ON(!v9inode->fscache);
        if (PageFsCache(page)) {
-                fscache_wait_on_page_write(vcookie->fscache, page);
+                fscache_wait_on_page_write(v9inode->fscache, page);
                BUG_ON(!PageLocked(page));
-                fscache_uncache_page(vcookie->fscache, page);
+                fscache_uncache_page(v9inode->fscache, page);
        }
 }
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
 int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 {
        int ret;
-        const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        const struct v9fs_inode *v9inode = V9FS_I(inode);
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return -ENOBUFS;
-        ret = fscache_read_or_alloc_page(vcookie->fscache,
+        ret = fscache_read_or_alloc_page(v9inode->fscache,
                                         page,
                                         v9fs_vfs_readpage_complete,
                                         NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
                                  unsigned *nr_pages)
 {
        int ret;
-        const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        const struct v9fs_inode *v9inode = V9FS_I(inode);
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
-        if (!vcookie->fscache)
+        if (!v9inode->fscache)
                return -ENOBUFS;
-        ret = fscache_read_or_alloc_pages(vcookie->fscache,
+        ret = fscache_read_or_alloc_pages(v9inode->fscache,
                                          mapping, pages, nr_pages,
                                          v9fs_vfs_readpage_complete,
                                          NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 {
        int ret;
-        const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        const struct v9fs_inode *v9inode = V9FS_I(inode);
        P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
-        ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+        ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
        P9_DPRINTK(P9_DEBUG_FSC, "ret =  %d", ret);
        if (ret != 0)
                v9fs_uncache_page(inode, page);
 }
+/*
+ * Wait for a page to complete writing to the cache.
+ *
+ * @inode: inode whose fscache cookie (v9inode->fscache) is waited on
+ * @page: page to wait for
+ *
+ * Apart from the debug print this is a no-op unless PageFsCache(page)
+ * is set on the page.
+ */
+void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
+{
+        const struct v9fs_inode *v9inode = V9FS_I(inode);
+        P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+        if (PageFsCache(page))
+                fscache_wait_on_page_write(v9inode->fscache, page);
+}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee..049507a5b01 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
 #include <linux/fscache.h>
 #include <linux/spinlock.h>
-extern struct kmem_cache *vcookie_cache;
-struct v9fs_cookie {
-        spinlock_t lock;
-        struct inode inode;
-        struct fscache_cookie *fscache;
-        struct p9_qid *qid;
-};
-static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
-{
-        return container_of(inode, struct v9fs_cookie, inode);
-}
 extern struct fscache_netfs v9fs_cache_netfs;
 extern const struct fscache_cookie_def v9fs_cache_session_index_def;
 extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
                                         struct list_head *pages,
                                         unsigned *nr_pages);
 extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
+                                              struct page *page);
-/**
- * v9fs_cache_register - Register v9fs file system with the cache
- */
-static inline int v9fs_cache_register(void)
-{
-        return __v9fs_cache_register();
-}
-/**
- * v9fs_cache_unregister - Unregister v9fs from the cache
- */
-static inline void v9fs_cache_unregister(void)
-{
-        __v9fs_cache_unregister();
-}
 static inline int v9fs_fscache_release_page(struct page *page,
                                            gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
 static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        fscache_uncache_page(vcookie->fscache, page);
+        fscache_uncache_page(v9inode->fscache, page);
        BUG_ON(PageFsCache(page));
 }
-static inline void v9fs_vcookie_set_qid(struct inode *inode,
+static inline void v9fs_fscache_set_key(struct inode *inode,
                                        struct p9_qid *qid)
 {
-        struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+        struct v9fs_inode *v9inode = V9FS_I(inode);
-        spin_lock(&vcookie->lock);
+        spin_lock(&v9inode->fscache_lock);
-        vcookie->qid = qid;
+        v9inode->fscache_key = qid;
-        spin_unlock(&vcookie->lock);
+        spin_unlock(&v9inode->fscache_lock);
 }
-#else /* CONFIG_9P_FSCACHE */
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
+                                                   struct page *page)
-static inline int v9fs_cache_register(void)
 {
-        return 1;
+        return __v9fs_fscache_wait_on_page_write(inode, page);
 }
-static inline void v9fs_cache_unregister(void) {}
+#else /* CONFIG_9P_FSCACHE */
 static inline int v9fs_fscache_release_page(struct page *page,
                                            gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
 static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
 {}
-static inline void v9fs_vcookie_set_qid(struct inode *inode,
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
-                                        struct p9_qid *qid)
+                                                   struct page *page)
-{}
+{
+        return;
+}
 #endif /* CONFIG_9P_FSCACHE */
 #endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d7..cd63e002d82 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
        return -ENOMEM;
 }
-/**
+static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
- * v9fs_fid_lookup - lookup for a fid, try to walk if not found
+                                               uid_t uid, int any)
- * @dentry: dentry to look for fid in
- *
- * Look for a fid in the specified dentry for the current user.
- * If no fid is found, try to create one walking from a fid from the parent
- * dentry (if it has one), or the root dentry. If the user haven't accessed
- * the fs yet, attach now and walk from the root.
- */
-struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
-        int i, n, l, clone, any, access;
-        u32 uid;
-        struct p9_fid *fid, *old_fid = NULL;
        struct dentry *ds;
-        struct v9fs_session_info *v9ses;
        char **wnames, *uname;
+        int i, n, l, clone, access;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid, *old_fid = NULL;
        v9ses = v9fs_inode2v9ses(dentry->d_inode);
        access = v9ses->flags & V9FS_ACCESS_MASK;
-        switch (access) {
-        case V9FS_ACCESS_SINGLE:
-        case V9FS_ACCESS_USER:
-        case V9FS_ACCESS_CLIENT:
-                uid = current_fsuid();
-                any = 0;
-                break;
-        case V9FS_ACCESS_ANY:
-                uid = v9ses->uid;
-                any = 1;
-                break;
-        default:
-                uid = ~0;
-                any = 0;
-                break;
-        }
        fid = v9fs_fid_find(dentry, uid, any);
        if (fid)
                return fid;
@@ -250,6 +221,45 @@ err_out:
        return fid;
 }
+/**
+ * v9fs_fid_lookup - lookup for a fid, try to walk if not found
+ * @dentry: dentry to look for fid in
+ *
+ * Look for a fid in the specified dentry for the current user.
+ * If no fid is found, try to create one walking from a fid from the parent
+ * dentry (if it has one), or the root dentry. If the user hasn't accessed
+ * the fs yet, attach now and walk from the root.
+ *
+ * The uid and the "any" flag are derived from the session's access mode
+ * and the lookup itself is delegated to v9fs_fid_lookup_with_uid().
+ */
+struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
+{
+        uid_t uid;
+        int  any, access;
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        access = v9ses->flags & V9FS_ACCESS_MASK;
+        switch (access) {
+        case V9FS_ACCESS_SINGLE:
+        case V9FS_ACCESS_USER:
+        case V9FS_ACCESS_CLIENT:
+                /* these modes look up a fid for the caller's fsuid */
+                uid = current_fsuid();
+                any = 0;
+                break;
+        case V9FS_ACCESS_ANY:
+                /* access=any: use the session uid and set the "any" flag */
+                uid = v9ses->uid;
+                any = 1;
+                break;
+        default:
+                /* unrecognised access mode: fall back to uid ~0 */
+                uid = ~0;
+                any = 0;
+                break;
+        }
+        return v9fs_fid_lookup_with_uid(dentry, uid, any);
+}
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 {
        struct p9_fid *fid, *ret;
@@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
        ret = p9_client_walk(fid, 0, NULL, 1);
        return ret;
 }
+/*
+ * v9fs_fid_clone_with_uid - look up the fid of @dentry for @uid and walk it
+ * @dentry: dentry whose fid is looked up
+ * @uid: uid passed to v9fs_fid_lookup_with_uid() (with "any" set to 0)
+ *
+ * Returns the error pointer from the lookup if it failed, otherwise
+ * whatever p9_client_walk(fid, 0, NULL, 1) returns. A walk with no path
+ * elements presumably yields a clone of the fid - confirm against
+ * p9_client_walk().
+ */
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+{
+        struct p9_fid *fid, *ret;
+        fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
+        if (IS_ERR(fid))
+                return fid;
+        ret = p9_client_walk(fid, 0, NULL, 1);
+        return ret;
+}
+/*
+ * v9fs_writeback_fid - obtain a fid for @dentry opened read-write as uid 0
+ * @dentry: dentry to obtain the fid for
+ *
+ * Returns the opened fid on success. If cloning the fid fails, the error
+ * pointer from the clone is returned. If the open fails, the fid is
+ * clunked and ERR_PTR(err) is returned. Callers must check with IS_ERR().
+ */
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
+{
+        int err;
+        struct p9_fid *fid;
+        /* the fid is always looked up for uid 0, not the calling user */
+        fid = v9fs_fid_clone_with_uid(dentry, 0);
+        if (IS_ERR(fid))
+                goto error_out;
+        /*
+         * writeback fid will only be used to write back the
+         * dirty pages. We always request for the open fid in read-write
+         * mode so that a partial page write which result in page
+         * read can work.
+         */
+        err = p9_client_open(fid, O_RDWR);
+        if (err < 0) {
+                /* release the cloned fid before reporting the failure */
+                p9_client_clunk(fid);
+                fid = ERR_PTR(err);
+                goto error_out;
+        }
+error_out:
+        return fid;
+}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996..bb0b6e7f58f 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
 *  Boston, MA  02111-1301  USA
 *
 */
+#ifndef FS_9P_FID_H
+#define FS_9P_FID_H
 #include <linux/list.h>
 /**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
 int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
+#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba8..c82b017f51f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
 static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
 static LIST_HEAD(v9fs_sessionlist);
+struct kmem_cache *v9fs_inode_cache;
 /*
 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
        /* Cache options */
        Opt_cache_loose, Opt_fscache,
        /* Access options */
-        Opt_access,
+        Opt_access, Opt_posixacl,
        /* Error token */
        Opt_err
 };
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
        {Opt_fscache, "fscache"},
        {Opt_cachetag, "cachetag=%s"},
        {Opt_access, "access=%s"},
+        {Opt_posixacl, "posixacl"},
        {Opt_err, NULL}
 };
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
                        else if (strcmp(s, "any") == 0)
                                v9ses->flags |= V9FS_ACCESS_ANY;
                        else if (strcmp(s, "client") == 0) {
-#ifdef CONFIG_9P_FS_POSIX_ACL
                                v9ses->flags |= V9FS_ACCESS_CLIENT;
-#else
-                                P9_DPRINTK(P9_DEBUG_ERROR,
-                                        "access=client option not supported\n");
-                                kfree(s);
-                                ret = -EINVAL;
-                                goto free_and_return;
-#endif
                        } else {
                                v9ses->flags |= V9FS_ACCESS_SINGLE;
                                v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
                        kfree(s);
                        break;
+                case Opt_posixacl:
+#ifdef CONFIG_9P_FS_POSIX_ACL
+                        v9ses->flags |= V9FS_POSIX_ACL;
+#else
+                        P9_DPRINTK(P9_DEBUG_ERROR,
+                                        "Not defined CONFIG_9P_FS_POSIX_ACL. "
+                                        "Ignoring posixacl option\n");
+#endif
+                        break;
                default:
                        continue;
                }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        list_add(&v9ses->slist, &v9fs_sessionlist);
        spin_unlock(&v9fs_sessionlist_lock);
-        v9ses->flags = V9FS_ACCESS_USER;
        strcpy(v9ses->uname, V9FS_DEFUSER);
        strcpy(v9ses->aname, V9FS_DEFANAME);
        v9ses->uid = ~0;
        v9ses->dfltuid = V9FS_DEFUID;
        v9ses->dfltgid = V9FS_DEFGID;
-        rc = v9fs_parse_options(v9ses, data);
-        if (rc < 0) {
-                retval = rc;
-                goto error;
-        }
        v9ses->clnt = p9_client_create(dev_name, data);
        if (IS_ERR(v9ses->clnt)) {
                retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
                goto error;
        }
-        if (p9_is_proto_dotl(v9ses->clnt))
+        v9ses->flags = V9FS_ACCESS_USER;
+        if (p9_is_proto_dotl(v9ses->clnt)) {
+                v9ses->flags = V9FS_ACCESS_CLIENT;
                v9ses->flags |= V9FS_PROTO_2000L;
-        else if (p9_is_proto_dotu(v9ses->clnt))
+        } else if (p9_is_proto_dotu(v9ses->clnt)) {
                v9ses->flags |= V9FS_PROTO_2000U;
+        }
+        rc = v9fs_parse_options(v9ses, data);
+        if (rc < 0) {
+                retval = rc;
+                goto error;
+        }
        v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
                v9ses->flags |= V9FS_ACCESS_ANY;
                v9ses->uid = ~0;
        }
+        if (!v9fs_proto_dotl(v9ses) ||
+                !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+                /*
+                 * We support ACL checks on client only if the protocol is
+                 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
+                 */
+                v9ses->flags &= ~V9FS_ACL_MASK;
+        }
        fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
                                                        v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
        kobject_put(v9fs_kobj);
 }
+/*
+ * v9fs_inode_init_once - constructor handed to kmem_cache_create() for
+ * the v9fs inode cache
+ * @foo: the struct v9fs_inode object to initialise
+ *
+ * Clears the fscache cookie and key (only with CONFIG_9P_FSCACHE) and
+ * runs inode_init_once() on the embedded VFS inode.
+ *
+ * NOTE(review): fscache_lock, cache_validity and writeback_fid are not
+ * initialised here - confirm they are set up where the inode is allocated.
+ */
+static void v9fs_inode_init_once(void *foo)
+{
+        struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
+#ifdef CONFIG_9P_FSCACHE
+        v9inode->fscache = NULL;
+        v9inode->fscache_key = NULL;
+#endif
+        inode_init_once(&v9inode->vfs_inode);
+}
+/**
+ * v9fs_init_inode_cache - initialize a cache for 9P inodes
+ *
+ * Creates the "v9fs_inode_cache" slab cache, sized for struct v9fs_inode,
+ * with v9fs_inode_init_once() as its constructor, and stores it in
+ * v9fs_inode_cache.
+ *
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+static int v9fs_init_inode_cache(void)
+{
+        v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
+                                          sizeof(struct v9fs_inode),
+                                          0, (SLAB_RECLAIM_ACCOUNT|
+                                              SLAB_MEM_SPREAD),
+                                          v9fs_inode_init_once);
+        if (!v9fs_inode_cache)
+                return -ENOMEM;
+        return 0;
+}
+/**
+ * v9fs_destroy_inode_cache - destroy the cache of 9P inode
+ *
+ * Passes v9fs_inode_cache to kmem_cache_destroy(); the pointer itself is
+ * left unchanged afterwards.
+ */
+static void v9fs_destroy_inode_cache(void)
+{
+        kmem_cache_destroy(v9fs_inode_cache);
+}
+/**
+ * v9fs_cache_register - create the 9P inode cache and register with fscache
+ *
+ * Creates the v9fs inode slab cache and, when CONFIG_9P_FSCACHE is set,
+ * registers v9fs_cache_netfs with fscache. If the netfs registration
+ * fails, the inode cache is destroyed again so that nothing is left
+ * allocated on the error path.
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+static int v9fs_cache_register(void)
+{
+        int ret;
+        ret = v9fs_init_inode_cache();
+        if (ret < 0)
+                return ret;
+#ifdef CONFIG_9P_FSCACHE
+        ret = fscache_register_netfs(&v9fs_cache_netfs);
+        /* do not leak the inode cache when netfs registration fails */
+        if (ret < 0)
+                v9fs_destroy_inode_cache();
+#endif
+        return ret;
+}
+/*
+ * v9fs_cache_unregister - undo v9fs_cache_register()
+ *
+ * Destroys the 9P inode cache and, when CONFIG_9P_FSCACHE is set,
+ * unregisters v9fs_cache_netfs from fscache. Note that the inode cache
+ * is destroyed before the netfs is unregistered.
+ */
+static void v9fs_cache_unregister(void)
+{
+        v9fs_destroy_inode_cache();
+#ifdef CONFIG_9P_FSCACHE
+        fscache_unregister_netfs(&v9fs_cache_netfs);
+#endif
+}
 /**
 * init_v9fs - Initialize module
 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0..bd8496db135 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
 *  Boston, MA  02111-1301  USA
 *
 */
+#ifndef FS_9P_V9FS_H
+#define FS_9P_V9FS_H
 #include <linux/backing-dev.h>
 /**
@@ -28,8 +31,10 @@
 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
+ * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
 * @V9FS_ACCESS_ANY: use a single attach for all users
 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
+ * @V9FS_POSIX_ACL: POSIX ACLs are enforced
 *
 * Session flags reflect options selected by users at mount time
 */
@@ -37,13 +42,15 @@
                         V9FS_ACCESS_USER |   \
                         V9FS_ACCESS_CLIENT)
 #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+#define V9FS_ACL_MASK V9FS_POSIX_ACL
 enum p9_session_flags {
        V9FS_PROTO_2000U        = 0x01,
        V9FS_PROTO_2000L        = 0x02,
        V9FS_ACCESS_SINGLE      = 0x04,
        V9FS_ACCESS_USER        = 0x08,
-        V9FS_ACCESS_CLIENT      = 0x10
+        V9FS_ACCESS_CLIENT      = 0x10,
+        V9FS_POSIX_ACL          = 0x20
 };
 /* possible values of ->cache */
@@ -109,8 +116,28 @@ struct v9fs_session_info {
        struct list_head slist; /* list of sessions registered with v9fs */
        struct backing_dev_info bdi;
        struct rw_semaphore rename_sem;
+        struct p9_fid *root_fid; /* Used for file system sync */
+};
+/* cache_validity flags */
+#define V9FS_INO_INVALID_ATTR 0x01
+/*
+ * Per-inode 9P state. The VFS inode is embedded as the vfs_inode member;
+ * V9FS_I() maps a struct inode back to the v9fs_inode that contains it.
+ */
+struct v9fs_inode {
+#ifdef CONFIG_9P_FSCACHE
+        /* fscache state, only present when CONFIG_9P_FSCACHE is set */
+        spinlock_t fscache_lock;
+        struct fscache_cookie *fscache;
+        struct p9_qid *fscache_key;
+#endif
+        /* cache_validity flags, e.g. V9FS_INO_INVALID_ATTR */
+        unsigned int cache_validity;
+        /* fid the page writeback paths use to write data to the server */
+        struct p9_fid *writeback_fid;
+        /* embedded VFS inode */
+        struct inode vfs_inode;
 };
+/* Return the v9fs_inode that embeds @inode as its vfs_inode member. */
+static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
+{
+        struct v9fs_inode *v9inode;
+        v9inode = container_of(inode, struct v9fs_inode, vfs_inode);
+        return v9inode;
+}
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
                                                                        char *);
 extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry);
 extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
                        void *p);
-extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
-                        struct p9_fid *fid,
+                                         struct p9_fid *fid,
-                        struct super_block *sb);
+                                         struct super_block *sb);
 extern const struct inode_operations v9fs_dir_inode_operations_dotl;
 extern const struct inode_operations v9fs_file_inode_operations_dotl;
 extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
-extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
-                        struct p9_fid *fid,
+                                              struct p9_fid *fid,
-                        struct super_block *sb);
+                                              struct super_block *sb);
 /* other default globals */
 #define V9FS_PORT       564
@@ -158,7 +184,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 }
 /**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * v9fs_get_inode_from_fid - Helper routine to populate an inode by
 * issuing a attribute request
 * @v9ses: session information
 * @fid: fid to issue attribute request for
@@ -166,11 +192,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 *
 */
 static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-                                struct super_block *sb)
+                        struct super_block *sb)
 {
        if (v9fs_proto_dotl(v9ses))
-                return v9fs_inode_dotl(v9ses, fid, sb);
+                return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
        else
-                return v9fs_inode(v9ses, fid, sb);
+                return v9fs_inode_from_fid(v9ses, fid, sb);
 }
+#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597e..4014160903a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
 *  Boston, MA  02111-1301  USA
 *
 */
+#ifndef FS_9P_V9FS_VFS_H
+#define FS_9P_V9FS_VFS_H
 /* plan9 semantics are that created files are implicitly opened.
 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
 * unlink calls remove, which is an implicit clunk. So we have to track
 * that kind of thing so that we don't try to clunk a dead fid.
 */
+#define P9_LOCK_TIMEOUT (30*HZ)
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct file_operations v9fs_cached_file_operations;
+extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern struct kmem_cache *v9fs_inode_cache;
-#ifdef CONFIG_9P_FSCACHE
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-#endif
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
+                    struct inode *inode, int mode);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 int v9fs_uflags2omode(int uflags, int extended);
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
+ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
 int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
 int v9fs_file_fsync_dotl(struct file *filp, int datasync);
+ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
-#define P9_LOCK_TIMEOUT (30*HZ)
+                                 const char __user *, size_t, loff_t *, int);
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
+/* Mark @inode's cached attributes stale by setting V9FS_INO_INVALID_ATTR. */
+static inline void v9fs_invalidate_inode_attr(struct inode *inode)
+{
+        V9FS_I(inode)->cache_validity |= V9FS_INO_INVALID_ATTR;
+}
+#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863..2524e4cbb8e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "cache.h"
+#include "fid.h"
 /**
- * v9fs_vfs_readpage - read an entire page in from 9P
+ * v9fs_fid_readpage - read an entire page in from 9P
 *
- * @filp: file being read
+ * @fid: fid being read
 * @page: structure to page
 *
 */
+static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
-static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 {
        int retval;
        loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
        buffer = kmap(page);
        offset = page_offset(page);
-        retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
+        retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
        if (retval < 0) {
                v9fs_uncache_page(inode, page);
                goto done;
@@ -87,6 +87,19 @@ done:
 }
 /**
+ * v9fs_vfs_readpage - read an entire page in from 9P
+ *
+ * @filp: file being read
+ * @page: structure to page
+ *
+ */
+static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+{
+        /* the fid of an open file is kept in its private_data */
+        struct p9_fid *fid = filp->private_data;
+        return v9fs_fid_readpage(fid, page);
+}
+/**
 * v9fs_vfs_readpages - read a set of pages from 9P
 *
 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
 {
        if (PagePrivate(page))
                return 0;
        return v9fs_fscache_release_page(page, gfp);
 }
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
 static void v9fs_invalidate_page(struct page *page, unsigned long offset)
 {
+        /*
+         * If called with zero offset, we should release
+         * the private state associated with the page
+         */
        if (offset == 0)
                v9fs_fscache_invalidate_page(page);
 }
+/*
+ * v9fs_vfs_writepage_locked - write one page cache page to the server
+ * @page: page to write
+ *
+ * Writes the page through the inode's writeback_fid. The page is marked
+ * under writeback for the duration of the write; it is not unlocked here.
+ * NOTE(review): the name suggests the caller holds the page lock - confirm.
+ *
+ * Returns 0 on success (a positive byte count from the write is folded to
+ * 0) or the negative error returned by v9fs_file_write_internal().
+ */
+static int v9fs_vfs_writepage_locked(struct page *page)
+{
+        char *buffer;
+        int retval, len;
+        loff_t offset, size;
+        mm_segment_t old_fs;
+        struct v9fs_inode *v9inode;
+        struct inode *inode = page->mapping->host;
+        v9inode = V9FS_I(inode);
+        size = i_size_read(inode);
+        /* the page holding EOF is only written up to i_size */
+        if (page->index == size >> PAGE_CACHE_SHIFT)
+                len = size & ~PAGE_CACHE_MASK;
+        else
+                len = PAGE_CACHE_SIZE;
+        set_page_writeback(page);
+        buffer = kmap(page);
+        offset = page_offset(page);
+        /*
+         * The write helper takes a __user pointer; switch the address
+         * limit so the kmap()ed kernel buffer can be passed to it.
+         */
+        old_fs = get_fs();
+        set_fs(get_ds());
+        /* We should have writeback_fid always set */
+        BUG_ON(!v9inode->writeback_fid);
+        /* invalidate == 0: no page invalidation or i_size update in helper */
+        retval = v9fs_file_write_internal(inode,
+                                          v9inode->writeback_fid,
+                                          (__force const char __user *)buffer,
+                                          len, &offset, 0);
+        if (retval > 0)
+                retval = 0;
+        set_fs(old_fs);
+        kunmap(page);
+        end_page_writeback(page);
+        return retval;
+}
+/*
+ * v9fs_vfs_writepage - ->writepage: write a dirty page and unlock it
+ * @page: page to write
+ * @wbc: writeback control, used to redirty the page on -EAGAIN
+ *
+ * Returns 0 on success or when the page was redirtied for a later retry,
+ * otherwise the negative error, which is also recorded on the page and
+ * its mapping.
+ */
+static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+        int err = v9fs_vfs_writepage_locked(page);
+        if (err >= 0) {
+                err = 0;
+        } else if (err == -EAGAIN) {
+                /* transient failure: leave the page dirty and retry later */
+                redirty_page_for_writepage(wbc, page);
+                err = 0;
+        } else {
+                SetPageError(page);
+                mapping_set_error(page->mapping, err);
+        }
+        unlock_page(page);
+        return err;
+}
 /**
 * v9fs_launder_page - Writeback a dirty page
- * Since the writes go directly to the server, we simply return a 0
- * here to indicate success.
- *
 * Returns 0 on success.
 */
 static int v9fs_launder_page(struct page *page)
 {
+        int retval;
+        struct inode *inode = page->mapping->host;
+        /* let a pending fscache store of this page finish first */
+        v9fs_fscache_wait_on_page_write(inode, page);
+        /* write the page out only if it was still dirty */
+        if (clear_page_dirty_for_io(page)) {
+                retval = v9fs_vfs_writepage_locked(page);
+                if (retval)
+                        return retval;
+        }
         return 0;
 }
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
 * with an error.
 *
 */
-ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+static ssize_t
-                loff_t pos, unsigned long nr_segs)
+v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+               loff_t pos, unsigned long nr_segs)
 {
+        /*
+         * FIXME
+         * Now that we do caching with cache mode enabled, We need
+         * to support direct IO
+         */
        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
                        "off/no(%lld/%lu) EINVAL\n",
                        iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
        return -EINVAL;
 }
+/*
+ * v9fs_write_begin - ->write_begin: get the page cache page for a write
+ *
+ * Grabs the page covering @pos. A page that is not up to date and is only
+ * partially overwritten (@len != PAGE_CACHE_SIZE) is first read in through
+ * the inode's writeback_fid, after which the page is looked up again.
+ *
+ * Returns 0 with the page stored in *@pagep, -ENOMEM if no page could be
+ * grabbed, or the error from reading the page.
+ */
+static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
+                            loff_t pos, unsigned len, unsigned flags,
+                            struct page **pagep, void **fsdata)
+{
+        int retval = 0;
+        struct page *page;
+        struct inode *inode = mapping->host;
+        struct v9fs_inode *v9inode = V9FS_I(inode);
+        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+        for (;;) {
+                page = grab_cache_page_write_begin(mapping, index, flags);
+                if (!page) {
+                        retval = -ENOMEM;
+                        break;
+                }
+                BUG_ON(!v9inode->writeback_fid);
+                /* nothing to read for an up-to-date or fully written page */
+                if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+                        break;
+                retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
+                page_cache_release(page);
+                if (retval)
+                        break;
+                /* read succeeded: grab the page again */
+        }
+        *pagep = page;
+        return retval;
+}
+/*
+ * v9fs_write_end - ->write_end: finish a buffered write into @page
+ *
+ * Zeroes whatever part of the requested range was not copied, marks the
+ * page up to date and dirty, grows i_size when the write went past it,
+ * then unlocks and releases the page.
+ *
+ * Returns @copied.
+ */
+static int v9fs_write_end(struct file *filp, struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata)
+{
+        struct inode *inode = page->mapping->host;
+        loff_t end_pos = pos + copied;
+        if (unlikely(copied < len)) {
+                /* short copy: clear the rest of the requested range */
+                unsigned in_page = pos & (PAGE_CACHE_SIZE - 1);
+                zero_user(page, in_page + copied, len - copied);
+                flush_dcache_page(page);
+        }
+        if (!PageUptodate(page))
+                SetPageUptodate(page);
+        /*
+         * i_size is read directly rather than with i_size_read(): it
+         * cannot change under us because we hold the i_mutex.
+         */
+        if (end_pos > inode->i_size) {
+                inode_add_bytes(inode, end_pos - inode->i_size);
+                i_size_write(inode, end_pos);
+        }
+        set_page_dirty(page);
+        unlock_page(page);
+        page_cache_release(page);
+        return copied;
+}
 const struct address_space_operations v9fs_addr_operations = {
-      .readpage = v9fs_vfs_readpage,
+        .readpage = v9fs_vfs_readpage,
-      .readpages = v9fs_vfs_readpages,
+        .readpages = v9fs_vfs_readpages,
-      .releasepage = v9fs_release_page,
+        .set_page_dirty = __set_page_dirty_nobuffers,
-      .invalidatepage = v9fs_invalidate_page,
+        .writepage = v9fs_vfs_writepage,
-      .launder_page = v9fs_launder_page,
+        .write_begin = v9fs_write_begin,
-      .direct_IO = v9fs_direct_IO,
+        .write_end = v9fs_write_end,
+        .releasepage = v9fs_release_page,
+        .invalidatepage = v9fs_invalidate_page,
+        .launder_page = v9fs_launder_page,
+        .direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5..b6a3b9f7fe4 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
 * @dentry:  dentry in question
 *
- * Only return 1 if our inode is invalid.  Only non-synthetic files
- * (ones without mtime == 0) should be calling this function.
- *
 */
 static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
-        struct inode *inode = dentry->d_inode;
+        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-        P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
+                   dentry->d_name.name, dentry);
-                                                                        dentry);
-        if(!inode)
+        /* Don't cache negative dentries */
+        if (!dentry->d_inode)
                return 1;
        return 0;
 }
@@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry)
        }
 }
+/**
+ * v9fs_lookup_revalidate - check whether a cached dentry is still usable
+ * @dentry: dentry to revalidate
+ * @nd: lookup context
+ *
+ * When the inode's attributes are flagged V9FS_INO_INVALID_ATTR they are
+ * refreshed from the server first.
+ *
+ * Returns 1 if the dentry is valid, 0 if the file no longer exists on the
+ * server (so the dcache can drop the entry), -ECHILD for RCU-walk lookups
+ * and a negative errno on any other failure.
+ */
+static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        struct p9_fid *fid;
+        struct inode *inode;
+        struct v9fs_inode *v9inode;
+        if (nd->flags & LOOKUP_RCU)
+                return -ECHILD;
+        inode = dentry->d_inode;
+        if (!inode)
+                goto out_valid;
+        v9inode = V9FS_I(inode);
+        if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+                int retval;
+                struct v9fs_session_info *v9ses;
+                fid = v9fs_fid_lookup(dentry);
+                if (IS_ERR(fid))
+                        return PTR_ERR(fid);
+                v9ses = v9fs_inode2v9ses(inode);
+                if (v9fs_proto_dotl(v9ses))
+                        retval = v9fs_refresh_inode_dotl(fid, inode);
+                else
+                        retval = v9fs_refresh_inode(fid, inode);
+                /* the file is gone on the server: report the dentry invalid */
+                if (retval == -ENOENT)
+                        return 0;
+                /*
+                 * Only a negative value is an error. A refresh result of 0
+                 * is treated as success (NOTE(review): assumes the refresh
+                 * helpers follow the 0/-errno convention) and must fall
+                 * through to "valid" instead of being returned as 0, which
+                 * would tell the VFS the dentry is invalid.
+                 */
+                if (retval < 0)
+                        return retval;
+        }
+out_valid:
+        return 1;
+}
 const struct dentry_operations v9fs_cached_dentry_operations = {
+        .d_revalidate = v9fs_lookup_revalidate,
        .d_delete = v9fs_cached_dentry_delete,
        .d_release = v9fs_dentry_release,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefe..9c2bdda5cd9 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
        P9_DPRINTK(P9_DEBUG_VFS,
                        "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
                        inode, filp, fid ? fid->fid : -1);
-        filemap_write_and_wait(inode->i_mapping);
        if (fid)
                p9_client_clunk(fid);
        return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c3067439..78bcb97c342 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
 #include "fid.h"
 #include "cache.h"
-static const struct file_operations v9fs_cached_file_operations;
+static const struct vm_operations_struct v9fs_file_vm_ops;
-static const struct file_operations v9fs_cached_file_operations_dotl;
 /**
 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
        int err;
+        struct v9fs_inode *v9inode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid;
        int omode;
        P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+        v9inode = V9FS_I(inode);
        v9ses = v9fs_inode2v9ses(inode);
        if (v9fs_proto_dotl(v9ses))
                omode = file->f_flags;
@@ -89,20 +90,30 @@ int v9fs_file_open(struct inode *inode, struct file *file)
        }
        file->private_data = fid;
-        if ((fid->qid.version) && (v9ses->cache)) {
+        if (v9ses->cache && !v9inode->writeback_fid) {
-                P9_DPRINTK(P9_DEBUG_VFS, "cached");
+                /*
-                /* enable cached file options */
+                 * clone a fid and add it to writeback_fid
-                if(file->f_op == &v9fs_file_operations)
+                 * we do it during open time instead of
-                        file->f_op = &v9fs_cached_file_operations;
+                 * page dirty time via write_begin/page_mkwrite
-                else if (file->f_op == &v9fs_file_operations_dotl)
+                 * because we want write after unlink usecase
-                        file->f_op = &v9fs_cached_file_operations_dotl;
+                 * to work.
+                 */
+                fid = v9fs_writeback_fid(file->f_path.dentry);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        goto out_error;
+                }
+                v9inode->writeback_fid = (void *) fid;
+        }
 #ifdef CONFIG_9P_FSCACHE
+        if (v9ses->cache)
                v9fs_cache_inode_set_cookie(inode, file);
 #endif
-        }
        return 0;
+out_error:
+        p9_client_clunk(file->private_data);
+        file->private_data = NULL;
+        return err;
 }
 /**
@@ -335,25 +346,22 @@ out_err:
 }
 /**
- * v9fs_file_readn - read from a file
+ * v9fs_fid_readn - read from a fid
- * @filp: file pointer to read
+ * @fid: fid to read
 * @data: data buffer to read data into
 * @udata: user data buffer to read data into
 * @count: size of buffer
 * @offset: offset at which to read data
 *
 */
 ssize_t
-v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
               u64 offset)
 {
        int n, total, size;
-        struct p9_fid *fid = filp->private_data;
        P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
-                                        (long long unsigned) offset, count);
+                   (long long unsigned) offset, count);
        n = 0;
        total = 0;
        size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +387,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 }
 /**
+ * v9fs_file_readn - read from a file
+ * @filp: file pointer to read
+ * @data: data buffer to read data into
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+ssize_t
+v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+               u64 offset)
+{
+        /* read through the fid attached to the open file */
+        struct p9_fid *fid = filp->private_data;
+        return v9fs_fid_readn(fid, data, udata, count, offset);
+}
+/**
 * v9fs_file_read - read from a file
 * @filp: file pointer to read
 * @udata: user data buffer to read data into
@@ -410,45 +434,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
        return ret;
 }
-/**
+ssize_t
- * v9fs_file_write - write to a file
+v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
- * @filp: file pointer to write
+                         const char __user *data, size_t count,
- * @data: data buffer to write data from
+                         loff_t *offset, int invalidate)
- * @count: size of buffer
- * @offset: offset at which to write data
- *
- */
-static ssize_t
-v9fs_file_write(struct file *filp, const char __user * data,
-                size_t count, loff_t * offset)
 {
-        ssize_t retval;
-        size_t total = 0;
        int n;
-        struct p9_fid *fid;
+        loff_t i_size;
+        size_t total = 0;
        struct p9_client *clnt;
-        struct inode *inode = filp->f_path.dentry->d_inode;
        loff_t origin = *offset;
        unsigned long pg_start, pg_end;
        P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
                (int)count, (int)*offset);
-        fid = filp->private_data;
        clnt = fid->clnt;
-        retval = generic_write_checks(filp, &origin, &count, 0);
-        if (retval)
-                goto out;
-        retval = -EINVAL;
-        if ((ssize_t) count < 0)
-                goto out;
-        retval = 0;
-        if (!count)
-                goto out;
        do {
                n = p9_client_write(fid, NULL, data+total, origin+total, count);
                if (n <= 0)
@@ -457,25 +458,60 @@ v9fs_file_write(struct file *filp, const char __user * data,
                total += n;
        } while (count > 0);
-        if (total > 0) {
+        if (invalidate && (total > 0)) {
                pg_start = origin >> PAGE_CACHE_SHIFT;
                pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
                if (inode->i_mapping && inode->i_mapping->nrpages)
                        invalidate_inode_pages2_range(inode->i_mapping,
                                                      pg_start, pg_end);
                *offset += total;
-                i_size_write(inode, i_size_read(inode) + total);
+                i_size = i_size_read(inode);
-                inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+                if (*offset > i_size) {
+                        inode_add_bytes(inode, *offset - i_size);
+                        i_size_write(inode, *offset);
+                }
        }
        if (n < 0)
-                retval = n;
+                return n;
-        else
-                retval = total;
+        return total;
+}
+/**
+ * v9fs_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ * The write is issued at the position produced by generic_write_checks(),
+ * which may differ from *@offset, and *@offset is only updated after a
+ * successful write.
+ *
+ * Returns the number of bytes written, 0 for an empty write, or a
+ * negative errno.
+ */
+static ssize_t
+v9fs_file_write(struct file *filp, const char __user * data,
+                size_t count, loff_t *offset)
+{
+        ssize_t retval = 0;
+        loff_t origin = *offset;
+        retval = generic_write_checks(filp, &origin, &count, 0);
+        if (retval)
+                goto out;
+        retval = -EINVAL;
+        if ((ssize_t) count < 0)
+                goto out;
+        retval = 0;
+        if (!count)
+                goto out;
+        /* write at the checked position; the helper advances origin */
+        retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+                                          filp->private_data,
+                                          data, count, &origin, 1);
+        /* update offset on successful write */
+        if (retval > 0)
+                *offset = origin;
 out:
         return retval;
 }
 static int v9fs_file_fsync(struct file *filp, int datasync)
 {
        struct p9_fid *fid;
@@ -505,28 +541,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
        return retval;
 }
-static const struct file_operations v9fs_cached_file_operations = {
+/*
+ * v9fs_file_mmap - ->mmap for cached files
+ *
+ * Sets up the mapping with generic_file_mmap() and, on success, installs
+ * v9fs_file_vm_ops on the vma. Returns 0 or the error from
+ * generic_file_mmap().
+ */
+static int
+v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        int err = generic_file_mmap(file, vma);
+        if (err)
+                return err;
+        vma->vm_ops = &v9fs_file_vm_ops;
+        return 0;
+}
+/*
+ * v9fs_vm_page_mkwrite - ->page_mkwrite handler of v9fs_file_vm_ops
+ * @vma: mapping in which the fault happened
+ * @vmf: fault description; vmf->page is the page being made writable
+ *
+ * Returns VM_FAULT_LOCKED with the page locked, or VM_FAULT_NOPAGE (page
+ * unlocked) when the page no longer belongs to this inode's mapping.
+ */
+static int
+v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        struct v9fs_inode *v9inode;
+        struct page *page = vmf->page;
+        struct file *filp = vma->vm_file;
+        struct inode *inode = filp->f_path.dentry->d_inode;
+        P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
+                   page, (unsigned long)filp->private_data);
+        v9inode = V9FS_I(inode);
+        /* make sure the cache has finished storing the page */
+        v9fs_fscache_wait_on_page_write(inode, page);
+        BUG_ON(!v9inode->writeback_fid);
+        lock_page(page);
+        /* the page was detached from this file while we waited for the lock */
+        if (page->mapping != inode->i_mapping)
+                goto out_unlock;
+        return VM_FAULT_LOCKED;
+out_unlock:
+        unlock_page(page);
+        return VM_FAULT_NOPAGE;
+}
+/*
+ * v9fs_direct_read - O_DIRECT read path for cached files
+ *
+ * Writes back dirty page cache data overlapping the requested range when
+ * the read starts inside the file, then reads with v9fs_file_read().
+ * Returns 0 for an empty read, otherwise the result of v9fs_file_read().
+ */
+static ssize_t
+v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
+                 loff_t *offsetp)
+{
+        struct address_space *mapping = filp->f_mapping;
+        struct inode *inode = mapping->host;
+        loff_t start = *offsetp;
+        if (!count)
+                return 0;
+        if (start < i_size_read(inode))
+                filemap_write_and_wait_range(mapping, start,
+                                             start + count - 1);
+        return v9fs_file_read(filp, udata, count, offsetp);
+}
+/**
+ * v9fs_cached_file_read - read from a file opened in cached mode
+ * @filp: file pointer to read
+ * @data: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ * O_DIRECT files are read with v9fs_direct_read(); all other files go
+ * through do_sync_read().
+ */
+static ssize_t
+v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
+                      loff_t *offset)
+{
+        if (!(filp->f_flags & O_DIRECT))
+                return do_sync_read(filp, data, count, offset);
+        return v9fs_direct_read(filp, data, count, offset);
+}
+/*
+ * v9fs_direct_write - O_DIRECT write path for cached files
+ *
+ * With i_mutex held: writes back dirty pages in the target range,
+ * invalidates the cached pages covering it and then writes with
+ * v9fs_file_write(). If a page cannot be invalidated (-EBUSY) the write
+ * is redone as a buffered write through do_sync_write() after dropping
+ * i_mutex.
+ *
+ * Returns 0 for an empty write, otherwise the result of the write that
+ * was performed or the negative error that stopped it.
+ */
+static ssize_t
+v9fs_direct_write(struct file *filp, const char __user * data,
+                  size_t count, loff_t *offsetp)
+{
+        loff_t offset;
+        ssize_t retval;
+        struct inode *inode;
+        struct address_space *mapping;
+        offset = *offsetp;
+        mapping = filp->f_mapping;
+        inode = mapping->host;
+        if (!count)
+                return 0;
+        mutex_lock(&inode->i_mutex);
+        /* flush dirty pages in the range before writing around the cache */
+        retval = filemap_write_and_wait_range(mapping, offset,
+                                              offset + count - 1);
+        if (retval)
+                goto err_out;
+        /*
+         * After a write we want buffered reads to be sure to go to disk to get
+         * the new data.  We invalidate clean cached page from the region we're
+         * about to write.  We do this *before* the write so that if we fail
+         * here we fall back to buffered write
+         */
+        if (mapping->nrpages) {
+                pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
+                pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+                retval = invalidate_inode_pages2_range(mapping,
+                                                        pg_start, pg_end);
+                /*
+                 * If a page can not be invalidated, fall back
+                 * to buffered write.
+                 */
+                if (retval) {
+                        if (retval == -EBUSY)
+                                goto buff_write;
+                        goto err_out;
+                }
+        }
+        retval = v9fs_file_write(filp, data, count, offsetp);
+err_out:
+        mutex_unlock(&inode->i_mutex);
+        return retval;
+buff_write:
+        /* i_mutex is dropped before the buffered write is issued */
+        mutex_unlock(&inode->i_mutex);
+        return do_sync_write(filp, data, count, offsetp);
+}
+/**
+ * v9fs_cached_file_write - write to a file opened in cached mode
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ * O_DIRECT files are written with v9fs_direct_write(); all other files
+ * go through do_sync_write().
+ */
+static ssize_t
+v9fs_cached_file_write(struct file *filp, const char __user * data,
+                       size_t count, loff_t *offset)
+{
+        if (!(filp->f_flags & O_DIRECT))
+                return do_sync_write(filp, data, count, offset);
+        return v9fs_direct_write(filp, data, count, offset);
+}
+/*
+ * vm operations installed by v9fs_file_mmap(): faults are served by
+ * filemap_fault() and write faults go through v9fs_vm_page_mkwrite().
+ */
+static const struct vm_operations_struct v9fs_file_vm_ops = {
+        .fault = filemap_fault,
+        .page_mkwrite = v9fs_vm_page_mkwrite,
+};
+const struct file_operations v9fs_cached_file_operations = {
        .llseek = generic_file_llseek,
-        .read = do_sync_read,
+        .read = v9fs_cached_file_read,
+        .write = v9fs_cached_file_write,
        .aio_read = generic_file_aio_read,
-        .write = v9fs_file_write,
+        .aio_write = generic_file_aio_write,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock,
-        .mmap = generic_file_readonly_mmap,
+        .mmap = v9fs_file_mmap,
        .fsync = v9fs_file_fsync,
 };
-static const struct file_operations v9fs_cached_file_operations_dotl = {
+const struct file_operations v9fs_cached_file_operations_dotl = {
        .llseek = generic_file_llseek,
-        .read = do_sync_read,
+        .read = v9fs_cached_file_read,
+        .write = v9fs_cached_file_write,
        .aio_read = generic_file_aio_read,
-        .write = v9fs_file_write,
+        .aio_write = generic_file_aio_write,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
        .lock = v9fs_file_lock_dotl,
        .flock = v9fs_file_flock_dotl,
-        .mmap = generic_file_readonly_mmap,
+        .mmap = v9fs_file_mmap,
        .fsync = v9fs_file_fsync_dotl,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c..8a2c232f708 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,25 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
        wstat->extension = NULL;
 }
-#ifdef CONFIG_9P_FSCACHE
 /**
 * v9fs_alloc_inode - helper function to allocate an inode
- * This callback is executed before setting up the inode so that we
- * can associate a vcookie with each inode.
 *
 */
 struct inode *v9fs_alloc_inode(struct super_block *sb)
 {
-        struct v9fs_cookie *vcookie;
+        struct v9fs_inode *v9inode;
-        vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+        v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
-                                                         GFP_KERNEL);
+                                                        GFP_KERNEL);
-        if (!vcookie)
+        if (!v9inode)
                return NULL;
+#ifdef CONFIG_9P_FSCACHE
-        vcookie->fscache = NULL;
+        v9inode->fscache = NULL;
-        vcookie->qid = NULL;
+        v9inode->fscache_key = NULL;
-        spin_lock_init(&vcookie->lock);
+        spin_lock_init(&v9inode->fscache_lock);
-        return &vcookie->inode;
+#endif
+        v9inode->writeback_fid = NULL;
+        v9inode->cache_validity = 0;
+        return &v9inode->vfs_inode;
 }
 /**
@@ -234,35 +233,18 @@ static void v9fs_i_callback(struct rcu_head *head)
 {
        struct inode *inode = container_of(head, struct inode, i_rcu);
        INIT_LIST_HEAD(&inode->i_dentry);
-        kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+        kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
 }
 void v9fs_destroy_inode(struct inode *inode)
 {
        call_rcu(&inode->i_rcu, v9fs_i_callback);
 }
-#endif
-/**
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
- * v9fs_get_inode - helper function to setup an inode
+                    struct inode *inode, int mode)
- * @sb: superblock
- * @mode: mode to setup inode with
- *
- */
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
-        int err;
+        int err = 0;
-        struct inode *inode;
-        struct v9fs_session_info *v9ses = sb->s_fs_info;
-        P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
-        inode = new_inode(sb);
-        if (!inode) {
-                P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
-                return ERR_PTR(-ENOMEM);
-        }
        inode_init_owner(inode, NULL, mode);
        inode->i_blocks = 0;
@@ -292,14 +274,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
        case S_IFREG:
                if (v9fs_proto_dotl(v9ses)) {
                        inode->i_op = &v9fs_file_inode_operations_dotl;
-                        inode->i_fop = &v9fs_file_operations_dotl;
+                        if (v9ses->cache)
+                                inode->i_fop =
+                                        &v9fs_cached_file_operations_dotl;
+                        else
+                                inode->i_fop = &v9fs_file_operations_dotl;
                } else {
                        inode->i_op = &v9fs_file_inode_operations;
-                        inode->i_fop = &v9fs_file_operations;
+                        if (v9ses->cache)
+                                inode->i_fop = &v9fs_cached_file_operations;
+                        else
+                                inode->i_fop = &v9fs_file_operations;
                }
                break;
        case S_IFLNK:
                if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
                        P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +323,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
                err = -EINVAL;
                goto error;
        }
+error:
+        return err;
-        return inode;
+}
-error:
+/**
-        iput(inode);
+ * v9fs_get_inode - helper function to setup an inode
-        return ERR_PTR(err);
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ */
+struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+{
+        int err;
+        struct inode *inode;
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+        inode = new_inode(sb);
+        if (!inode) {
+                P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        err = v9fs_init_inode(v9ses, inode, mode);
+        if (err) {
+                iput(inode);
+                return ERR_PTR(err);
+        }
+        return inode;
 }
 /*
@@ -403,6 +416,8 @@ error:
 */
 void v9fs_evict_inode(struct inode *inode)
 {
+        struct v9fs_inode *v9inode = V9FS_I(inode);
        truncate_inode_pages(inode->i_mapping, 0);
        end_writeback(inode);
        filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +425,67 @@ void v9fs_evict_inode(struct inode *inode)
 #ifdef CONFIG_9P_FSCACHE
        v9fs_cache_inode_put_cookie(inode);
 #endif
+        /* clunk the fid stashed in writeback_fid */
+        if (v9inode->writeback_fid) {
+                p9_client_clunk(v9inode->writeback_fid);
+                v9inode->writeback_fid = NULL;
+        }
 }
-struct inode *
+static struct inode *v9fs_qid_iget(struct super_block *sb,
-v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                                   struct p9_qid *qid,
-        struct super_block *sb)
+                                   struct p9_wstat *st)
 {
-        int err, umode;
+        int retval, umode;
-        struct inode *ret = NULL;
+        unsigned long i_ino;
-        struct p9_wstat *st;
+        struct inode *inode;
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
-        st = p9_client_stat(fid);
-        if (IS_ERR(st))
-                return ERR_CAST(st);
+        i_ino = v9fs_qid2ino(qid);
+        inode = iget_locked(sb, i_ino);
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        if (!(inode->i_state & I_NEW))
+                return inode;
+        /*
+         * initialize the inode with the stat info
+         * FIXME!! we may need support for stale inodes
+         * later.
+         */
        umode = p9mode2unixmode(v9ses, st->mode);
-        ret = v9fs_get_inode(sb, umode);
+        retval = v9fs_init_inode(v9ses, inode, umode);
-        if (IS_ERR(ret)) {
+        if (retval)
-                err = PTR_ERR(ret);
                goto error;
-        }
-        v9fs_stat2inode(st, ret, sb);
-        ret->i_ino = v9fs_qid2ino(&st->qid);
+        v9fs_stat2inode(st, inode, sb);
 #ifdef CONFIG_9P_FSCACHE
-        v9fs_vcookie_set_qid(ret, &st->qid);
+        v9fs_fscache_set_key(inode, &st->qid);
-        v9fs_cache_inode_get_cookie(ret);
+        v9fs_cache_inode_get_cookie(inode);
 #endif
-        p9stat_free(st);
+        unlock_new_inode(inode);
-        kfree(st);
+        return inode;
-        return ret;
 error:
+        unlock_new_inode(inode);
+        iput(inode);
+        return ERR_PTR(retval);
+}
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                    struct super_block *sb)
+{
+        struct p9_wstat *st;
+        struct inode *inode = NULL;
+        st = p9_client_stat(fid);
+        if (IS_ERR(st))
+                return ERR_CAST(st);
+        inode = v9fs_qid_iget(sb, &st->qid, st);
        p9stat_free(st);
        kfree(st);
-        return ERR_PTR(err);
+        return inode;
 }
 /**
@@ -458,8 +499,8 @@ error:
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
        int retval;
-        struct inode *file_inode;
        struct p9_fid *v9fid;
+        struct inode *file_inode;
        P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
                rmdir);
@@ -470,8 +511,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
                return PTR_ERR(v9fid);
        retval = p9_client_remove(v9fid);
-        if (!retval)
+        if (!retval) {
-                drop_nlink(file_inode);
+                /*
+                 * directories on unlink should have zero
+                 * link count
+                 */
+                if (rmdir) {
+                        clear_nlink(file_inode);
+                        drop_nlink(dir);
+                } else
+                        drop_nlink(file_inode);
+                v9fs_invalidate_inode_attr(file_inode);
+                v9fs_invalidate_inode_attr(dir);
+        }
        return retval;
 }
@@ -531,7 +584,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
        }
        /* instantiate inode and assign the unopened fid to the dentry */
-        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +623,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        int err;
        u32 perm;
        int flags;
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *fid;
        struct file *filp;
+        struct v9fs_inode *v9inode;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid, *inode_fid;
        err = 0;
        fid = NULL;
@@ -592,8 +646,25 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
                goto error;
        }
+        v9fs_invalidate_inode_attr(dir);
        /* if we are opening a file, assign the open fid to the file */
        if (nd && nd->flags & LOOKUP_OPEN) {
+                v9inode = V9FS_I(dentry->d_inode);
+                if (v9ses->cache && !v9inode->writeback_fid) {
+                        /*
+                         * Clone a fid and stash it in writeback_fid.
+                         * We do this at open time rather than at
+                         * page-dirty time (write_begin/page_mkwrite)
+                         * because we want the write-after-unlink
+                         * use case to work.
+                         */
+                        inode_fid = v9fs_writeback_fid(dentry);
+                        if (IS_ERR(inode_fid)) {
+                                err = PTR_ERR(inode_fid);
+                                goto error;
+                        }
+                        v9inode->writeback_fid = (void *) inode_fid;
+                }
                filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
                if (IS_ERR(filp)) {
                        err = PTR_ERR(filp);
@@ -601,6 +672,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
                }
                filp->private_data = fid;
+#ifdef CONFIG_9P_FSCACHE
+                if (v9ses->cache)
+                        v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
+#endif
        } else
                p9_client_clunk(fid);
@@ -625,8 +700,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
        int err;
        u32 perm;
-        struct v9fs_session_info *v9ses;
        struct p9_fid *fid;
+        struct v9fs_session_info *v9ses;
        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
        err = 0;
@@ -636,6 +711,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (IS_ERR(fid)) {
                err = PTR_ERR(fid);
                fid = NULL;
+        } else {
+                inc_nlink(dir);
+                v9fs_invalidate_inode_attr(dir);
        }
        if (fid)
@@ -687,7 +765,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
                return ERR_PTR(result);
        }
-        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
        if (IS_ERR(inode)) {
                result = PTR_ERR(inode);
                inode = NULL;
@@ -747,17 +825,19 @@ int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                struct inode *new_dir, struct dentry *new_dentry)
 {
+        int retval;
        struct inode *old_inode;
+        struct inode *new_inode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *oldfid;
        struct p9_fid *olddirfid;
        struct p9_fid *newdirfid;
        struct p9_wstat wstat;
-        int retval;
        P9_DPRINTK(P9_DEBUG_VFS, "\n");
        retval = 0;
        old_inode = old_dentry->d_inode;
+        new_inode = new_dentry->d_inode;
        v9ses = v9fs_inode2v9ses(old_inode);
        oldfid = v9fs_fid_lookup(old_dentry);
        if (IS_ERR(oldfid))
@@ -798,9 +878,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        retval = p9_client_wstat(oldfid, &wstat);
 clunk_newdir:
-        if (!retval)
+        if (!retval) {
+                if (new_inode) {
+                        if (S_ISDIR(new_inode->i_mode))
+                                clear_nlink(new_inode);
+                        else
+                                drop_nlink(new_inode);
+                        /*
+                         * Work around vfs rename rehash bug with
+                         * FS_RENAME_DOES_D_MOVE
+                         */
+                        v9fs_invalidate_inode_attr(new_inode);
+                }
+                if (S_ISDIR(old_inode->i_mode)) {
+                        if (!new_inode)
+                                inc_nlink(new_dir);
+                        drop_nlink(old_dir);
+                }
+                v9fs_invalidate_inode_attr(old_inode);
+                v9fs_invalidate_inode_attr(old_dir);
+                v9fs_invalidate_inode_attr(new_dir);
                /* successful rename */
                d_move(old_dentry, new_dentry);
+        }
        up_write(&v9ses->rename_sem);
        p9_client_clunk(newdirfid);
@@ -831,9 +932,10 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
        err = -EPERM;
        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                return simple_getattr(mnt, dentry, stat);
+                generic_fillattr(dentry->d_inode, stat);
+                return 0;
+        }
        fid = v9fs_fid_lookup(dentry);
        if (IS_ERR(fid))
                return PTR_ERR(fid);
@@ -891,17 +993,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
                if (iattr->ia_valid & ATTR_GID)
                        wstat.n_gid = iattr->ia_gid;
        }
-        retval = p9_client_wstat(fid, &wstat);
-        if (retval < 0)
-                return retval;
        if ((iattr->ia_valid & ATTR_SIZE) &&
            iattr->ia_size != i_size_read(dentry->d_inode)) {
                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
                if (retval)
                        return retval;
        }
+        /* Write all dirty data */
+        if (S_ISREG(dentry->d_inode->i_mode))
+                filemap_write_and_wait(dentry->d_inode->i_mapping);
+        retval = p9_client_wstat(fid, &wstat);
+        if (retval < 0)
+                return retval;
+        v9fs_invalidate_inode_attr(dentry->d_inode);
        setattr_copy(dentry->d_inode, iattr);
        mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1029,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
        char tag_name[14];
        unsigned int i_nlink;
        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        struct v9fs_inode *v9inode = V9FS_I(inode);
        inode->i_nlink = 1;
@@ -983,6 +1089,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
        /* not real number of blocks, but 512 byte ones ... */
        inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+        v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 /**
@@ -1115,8 +1222,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
        int mode, const char *extension)
 {
        u32 perm;
-        struct v9fs_session_info *v9ses;
        struct p9_fid *fid;
+        struct v9fs_session_info *v9ses;
        v9ses = v9fs_inode2v9ses(dir);
        if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1237,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(fid))
                return PTR_ERR(fid);
+        v9fs_invalidate_inode_attr(dir);
        p9_client_clunk(fid);
        return 0;
 }
@@ -1166,8 +1274,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
              struct dentry *dentry)
 {
        int retval;
-        struct p9_fid *oldfid;
        char *name;
+        struct p9_fid *oldfid;
        P9_DPRINTK(P9_DEBUG_VFS,
                " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1294,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
        sprintf(name, "%d\n", oldfid->fid);
        retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
        __putname(name);
+        if (!retval) {
+                v9fs_refresh_inode(oldfid, old_dentry->d_inode);
+                v9fs_invalidate_inode_attr(dir);
+        }
 clunk_fid:
        p9_client_clunk(oldfid);
        return retval;
@@ -1237,6 +1348,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
+{
+        loff_t i_size;
+        struct p9_wstat *st;
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(inode);
+        st = p9_client_stat(fid);
+        if (IS_ERR(st))
+                return PTR_ERR(st);
+        spin_lock(&inode->i_lock);
+        /*
+         * When caching is enabled, keep the current inode->i_size
+         * across the refresh, because we may have cached data.
+         */
+        i_size = inode->i_size;
+        v9fs_stat2inode(st, inode, inode->i_sb);
+        if (v9ses->cache)
+                inode->i_size = i_size;
+        spin_unlock(&inode->i_lock);
+        p9stat_free(st);
+        kfree(st);
+        return 0;
+}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace..67c138e94fe 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
        return dentry;
 }
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+                                        struct p9_qid *qid,
+                                        struct p9_fid *fid,
+                                        struct p9_stat_dotl *st)
+{
+        int retval;
+        unsigned long i_ino;
+        struct inode *inode;
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        i_ino = v9fs_qid2ino(qid);
+        inode = iget_locked(sb, i_ino);
+        if (!inode)
+                return ERR_PTR(-ENOMEM);
+        if (!(inode->i_state & I_NEW))
+                return inode;
+        /*
+         * initialize the inode with the stat info
+         * FIXME!! we may need support for stale inodes
+         * later.
+         */
+        retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+        if (retval)
+                goto error;
+        v9fs_stat2inode_dotl(st, inode);
+#ifdef CONFIG_9P_FSCACHE
+        v9fs_fscache_set_key(inode, &st->qid);
+        v9fs_cache_inode_get_cookie(inode);
+#endif
+        retval = v9fs_get_acl(inode, fid);
+        if (retval)
+                goto error;
+        unlock_new_inode(inode);
+        return inode;
+error:
+        unlock_new_inode(inode);
+        iput(inode);
+        return ERR_PTR(retval);
+}
 struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-        struct super_block *sb)
+                         struct super_block *sb)
 {
-        struct inode *ret = NULL;
-        int err;
        struct p9_stat_dotl *st;
+        struct inode *inode = NULL;
        st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
        if (IS_ERR(st))
                return ERR_CAST(st);
-        ret = v9fs_get_inode(sb, st->st_mode);
+        inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
-        if (IS_ERR(ret)) {
-                err = PTR_ERR(ret);
-                goto error;
-        }
-        v9fs_stat2inode_dotl(st, ret);
-        ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-        v9fs_vcookie_set_qid(ret, &st->qid);
-        v9fs_cache_inode_get_cookie(ret);
-#endif
-        err = v9fs_get_acl(ret, fid);
-        if (err) {
-                iput(ret);
-                goto error;
-        }
-        kfree(st);
-        return ret;
-error:
        kfree(st);
-        return ERR_PTR(err);
+        return inode;
 }
 /**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                struct nameidata *nd)
 {
        int err = 0;
-        char *name = NULL;
        gid_t gid;
        int flags;
        mode_t mode;
-        struct v9fs_session_info *v9ses;
+        char *name = NULL;
-        struct p9_fid *fid = NULL;
-        struct p9_fid *dfid, *ofid;
        struct file *filp;
        struct p9_qid qid;
        struct inode *inode;
+        struct p9_fid *fid = NULL;
+        struct v9fs_inode *v9inode;
+        struct p9_fid *dfid, *ofid, *inode_fid;
+        struct v9fs_session_info *v9ses;
        struct posix_acl *pacl = NULL, *dacl = NULL;
        v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                                err);
                goto error;
        }
+        v9fs_invalidate_inode_attr(dir);
        /* instantiate inode and assign the unopened fid to the dentry */
        fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                fid = NULL;
                goto error;
        }
-        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,22 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
        /* Now set the ACL based on the default value */
        v9fs_set_create_acl(dentry, dacl, pacl);
+        v9inode = V9FS_I(inode);
+        if (v9ses->cache && !v9inode->writeback_fid) {
+                /*
+                 * Clone a fid and stash it in writeback_fid.
+                 * We do this at open time rather than at
+                 * page-dirty time (write_begin/page_mkwrite)
+                 * because we want the write-after-unlink
+                 * use case to work.
+                 */
+                inode_fid = v9fs_writeback_fid(dentry);
+                if (IS_ERR(inode_fid)) {
+                        err = PTR_ERR(inode_fid);
+                        goto error;
+                }
+                v9inode->writeback_fid = (void *) inode_fid;
+        }
        /* Since we are opening a file, assign the open fid to the file */
        filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
        if (IS_ERR(filp)) {
@@ -226,6 +267,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
                return PTR_ERR(filp);
        }
        filp->private_data = ofid;
+#ifdef CONFIG_9P_FSCACHE
+        if (v9ses->cache)
+                v9fs_cache_inode_set_cookie(inode, filp);
+#endif
        return 0;
 error:
@@ -300,7 +345,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
                        goto error;
                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +372,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
        }
        /* Now set the ACL based on the default value */
        v9fs_set_create_acl(dentry, dacl, pacl);
+        inc_nlink(dir);
+        v9fs_invalidate_inode_attr(dir);
 error:
        if (fid)
                p9_client_clunk(fid);
@@ -346,9 +392,10 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
        err = -EPERM;
        v9ses = v9fs_inode2v9ses(dentry->d_inode);
-        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-                return simple_getattr(mnt, dentry, stat);
+                generic_fillattr(dentry->d_inode, stat);
+                return 0;
+        }
        fid = v9fs_fid_lookup(dentry);
        if (IS_ERR(fid))
                return PTR_ERR(fid);
@@ -406,16 +453,20 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
        if (IS_ERR(fid))
                return PTR_ERR(fid);
-        retval = p9_client_setattr(fid, &p9attr);
-        if (retval < 0)
-                return retval;
        if ((iattr->ia_valid & ATTR_SIZE) &&
            iattr->ia_size != i_size_read(dentry->d_inode)) {
                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
                if (retval)
                        return retval;
        }
+        /* Write all dirty data */
+        if (S_ISREG(dentry->d_inode->i_mode))
+                filemap_write_and_wait(dentry->d_inode->i_mapping);
+        retval = p9_client_setattr(fid, &p9attr);
+        if (retval < 0)
+                return retval;
+        v9fs_invalidate_inode_attr(dentry->d_inode);
        setattr_copy(dentry->d_inode, iattr);
        mark_inode_dirty(dentry->d_inode);
@@ -439,6 +490,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+        struct v9fs_inode *v9inode = V9FS_I(inode);
        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
                inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +549,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
         * because the inode structure does not have fields for them.
         */
+        v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 static int
 v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
                const char *symname)
 {
-        struct v9fs_session_info *v9ses;
-        struct p9_fid *dfid;
-        struct p9_fid *fid = NULL;
-        struct inode *inode;
-        struct p9_qid qid;
-        char *name;
        int err;
        gid_t gid;
+        char *name;
+        struct p9_qid qid;
+        struct inode *inode;
+        struct p9_fid *dfid;
+        struct p9_fid *fid = NULL;
+        struct v9fs_session_info *v9ses;
        name = (char *) dentry->d_name.name;
        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +587,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
                goto error;
        }
+        v9fs_invalidate_inode_attr(dir);
        if (v9ses->cache) {
                /* Now walk from the parent so we can get an unopened fid. */
                fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +600,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
                }
                /* instantiate inode and assign the unopened fid to dentry */
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +642,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                struct dentry *dentry)
 {
        int err;
-        struct p9_fid *dfid, *oldfid;
        char *name;
-        struct v9fs_session_info *v9ses;
        struct dentry *dir_dentry;
+        struct p9_fid *dfid, *oldfid;
+        struct v9fs_session_info *v9ses;
        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
                        dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +670,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                return err;
        }
+        v9fs_invalidate_inode_attr(dir);
        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
                /* Get the latest stat info from server. */
                struct p9_fid *fid;
-                struct p9_stat_dotl *st;
                fid = v9fs_fid_lookup(old_dentry);
                if (IS_ERR(fid))
                        return PTR_ERR(fid);
-                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+                v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
-                if (IS_ERR(st))
-                        return PTR_ERR(st);
-                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-                kfree(st);
-        } else {
-                /* Caching disabled. No need to get upto date stat info.
-                 * This dentry will be released immediately. So, just hold the
-                 * inode
-                 */
-                ihold(old_dentry->d_inode);
        }
+        ihold(old_dentry->d_inode);
        d_instantiate(dentry, old_dentry->d_inode);
        return err;
@@ -657,12 +699,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
                dev_t rdev)
 {
        int err;
+        gid_t gid;
        char *name;
        mode_t mode;
        struct v9fs_session_info *v9ses;
        struct p9_fid *fid = NULL, *dfid = NULL;
        struct inode *inode;
-        gid_t gid;
        struct p9_qid qid;
        struct dentry *dir_dentry;
        struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +741,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
        if (err < 0)
                goto error;
+        v9fs_invalidate_inode_attr(dir);
        /* instantiate inode and assign the unopened fid to the dentry */
        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
                fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +753,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
                        goto error;
                }
-                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -782,6 +825,31 @@ ndset:
        return NULL;
 }
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
+{
+        loff_t i_size;
+        struct p9_stat_dotl *st;
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(inode);
+        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+        if (IS_ERR(st))
+                return PTR_ERR(st);
+        spin_lock(&inode->i_lock);
+        /*
+         * When caching is enabled, keep the current inode->i_size
+         * across the refresh, because we may have cached data.
+         */
+        i_size = inode->i_size;
+        v9fs_stat2inode_dotl(st, inode);
+        if (v9ses->cache)
+                inode->i_size = i_size;
+        spin_unlock(&inode->i_lock);
+        kfree(st);
+        return 0;
+}
 const struct inode_operations v9fs_dir_inode_operations_dotl = {
        .create = v9fs_vfs_create_dotl,
        .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b813..09fd08d1606 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        } else
                sb->s_op = &v9fs_super_ops;
        sb->s_bdi = &v9ses->bdi;
+        if (v9ses->cache)
+                sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
-        sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
+        sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
-            MS_NOATIME;
+        if (!v9ses->cache)
+                sb->s_flags |= MS_SYNCHRONOUS;
 #ifdef CONFIG_9P_FS_POSIX_ACL
-        if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+        if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
                sb->s_flags |= MS_POSIXACL;
 #endif
@@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
                retval = PTR_ERR(inode);
                goto release_sb;
        }
        root = d_alloc_root(inode);
        if (!root) {
                iput(inode);
@@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
                        retval = PTR_ERR(st);
                        goto release_sb;
                }
+                root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
                v9fs_stat2inode_dotl(st, root->d_inode);
                kfree(st);
        } else {
@@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
                p9stat_free(st);
                kfree(st);
        }
+        v9fs_fid_add(root, fid);
        retval = v9fs_get_acl(inode, fid);
        if (retval)
                goto release_sb;
-        v9fs_fid_add(root, fid);
+        /*
+         * Add the root fid to session info. This is used
+         * for file system sync. We want a cloned fid here
+         * so that we can do a sync_filesystem after a
+         * shrink_dcache_for_umount
+         */
+        v9ses->root_fid = v9fs_fid_clone(root);
+        if (IS_ERR(v9ses->root_fid)) {
+                retval = PTR_ERR(v9ses->root_fid);
+                goto release_sb;
+        }
        P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
        return dget(sb->s_root);
@@ -197,15 +210,11 @@ close_session:
        v9fs_session_close(v9ses);
        kfree(v9ses);
        return ERR_PTR(retval);
 release_sb:
        /*
-         * we will do the session_close and root dentry release
+         * we will do the session_close and root dentry
-         * in the below call. But we need to clunk fid, because we haven't
+         * release in the below call.
-         * attached the fid to dentry so it won't get clunked
-         * automatically.
         */
-        p9_client_clunk(fid);
        deactivate_locked_super(sb);
        return ERR_PTR(retval);
 }
@@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s)
        P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
        kill_anon_super(s);
+        p9_client_clunk(v9ses->root_fid);
        v9fs_session_cancel(v9ses);
        v9fs_session_close(v9ses);
        kfree(v9ses);
@@ -276,11 +285,31 @@ done:
        return res;
 }
+/*
+ * ->sync_fs handler used by v9fs_super_ops_dotl: passes the request
+ * on to p9_client_sync_fs() using the root fid stored in the session
+ * info, and returns its result. @wait is not consulted.
+ */
+static int v9fs_sync_fs(struct super_block *sb, int wait)
+{
+        struct v9fs_session_info *v9ses = sb->s_fs_info;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
+        return p9_client_sync_fs(v9ses->root_fid);
+}
+/*
+ * ->drop_inode handler used by v9fs_super_ops_dotl. With caching
+ * enabled the decision is left to generic_drop_inode(); otherwise
+ * it always returns 1.
+ */
+static int v9fs_drop_inode(struct inode *inode)
+{
+        struct v9fs_session_info *v9ses;
+        v9ses = v9fs_inode2v9ses(inode);
+        if (v9ses->cache)
+                return generic_drop_inode(inode);
+        /*
+         * In non-cached mode always drop the inode, because
+         * we want the inode attributes to always match those
+         * on the server.
+         */
+        return 1;
+}
 static const struct super_operations v9fs_super_ops = {
-#ifdef CONFIG_9P_FSCACHE
        .alloc_inode = v9fs_alloc_inode,
        .destroy_inode = v9fs_destroy_inode,
-#endif
        .statfs = simple_statfs,
        .evict_inode = v9fs_evict_inode,
        .show_options = generic_show_options,
@@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = {
 };
 static const struct super_operations v9fs_super_ops_dotl = {
-#ifdef CONFIG_9P_FSCACHE
        .alloc_inode = v9fs_alloc_inode,
        .destroy_inode = v9fs_destroy_inode,
-#endif
+        .sync_fs = v9fs_sync_fs,
        .statfs = v9fs_statfs,
+        .drop_inode = v9fs_drop_inode,
        .evict_inode = v9fs_evict_inode,
        .show_options = generic_show_options,
        .umount_begin = v9fs_umount_begin,
@@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = {
        .mount = v9fs_mount,
        .kill_sb = v9fs_kill_super,
        .owner = THIS_MODULE,
-        .fs_flags = FS_RENAME_DOES_D_MOVE,
+        .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
 };
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57ed..7cb53aafac1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
        def_bool n
 config EXPORTFS
-        tristate
+        bool
 config FILE_LOCKING
        bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c..ba01202844c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)        += nfs_common/
 obj-$(CONFIG_GENERIC_ACL)       += generic_acl.o
+obj-$(CONFIG_FHANDLE)           += fhandle.o
 obj-y                           += quota/
 obj-$(CONFIG_PROC_FS)           += proc/
diff --git a/fs/aio.c b/fs/aio.c
index 26869cde395..7f54f43b8f7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -85,7 +85,7 @@ static int __init aio_setup(void)
        kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-        aio_wq = create_workqueue("aio");
+        aio_wq = alloc_workqueue("aio", 0, 1);  /* used to limit concurrency */
        abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
        BUG_ON(!aio_wq || !abe_pool);
@@ -577,7 +577,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
                spin_lock(&fput_lock);
                list_add(&req->ki_list, &fput_head);
                spin_unlock(&fput_lock);
-                queue_work(aio_wq, &fput_work);
+                schedule_work(&fput_work);
        } else {
                req->ki_filp = NULL;
                really_put_req(ctx, req);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b..b4ffad859ad 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        int len = *max_len;
        int type;
-        if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
+        if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
-            (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+                *max_len = BTRFS_FID_SIZE_CONNECTABLE;
                return 255;
+        } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+                *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+                return 255;
+        }
        len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
        type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 93323ac26b0..512c3d1da08 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4807,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        int err;
        int drop_inode = 0;
-        if (inode->i_nlink == 0)
-                return -ENOENT;
        /* do not allow sys_link's with other subvols of the same device */
        if (root->objectid != BTRFS_I(inode)->root->objectid)
                return -EPERM;
diff --git a/fs/compat.c b/fs/compat.c
index 691c3fd8ce1..c6d31a3bab8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-        struct path path;
+        struct kstatfs tmp;
-        int error;
+        int error = user_statfs(pathname, &tmp);
+        if (!error)
-        error = user_path(pathname, &path);
+                error = put_compat_statfs(buf, &tmp);
-        if (!error) {
-                struct kstatfs tmp;
-                error = vfs_statfs(&path, &tmp);
-                if (!error)
-                        error = put_compat_statfs(buf, &tmp);
-                path_put(&path);
-        }
        return error;
 }
 asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
 {
-        struct file * file;
        struct kstatfs tmp;
-        int error;
+        int error = fd_statfs(fd, &tmp);
-        error = -EBADF;
-        file = fget(fd);
-        if (!file)
-                goto out;
-        error = vfs_statfs(&file->f_path, &tmp);
        if (!error)
                error = put_compat_statfs(buf, &tmp);
-        fput(file);
-out:
        return error;
 }
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-        struct path path;
+        struct kstatfs tmp;
        int error;
        if (sz != sizeof(*buf))
                return -EINVAL;
-        error = user_path(pathname, &path);
+        error = user_statfs(pathname, &tmp);
-        if (!error) {
+        if (!error)
-                struct kstatfs tmp;
+                error = put_compat_statfs64(buf, &tmp);
-                error = vfs_statfs(&path, &tmp);
-                if (!error)
-                        error = put_compat_statfs64(buf, &tmp);
-                path_put(&path);
-        }
        return error;
 }
 asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-        struct file * file;
        struct kstatfs tmp;
        int error;
        if (sz != sizeof(*buf))
                return -EINVAL;
-        error = -EBADF;
+        error = fd_statfs(fd, &tmp);
-        file = fget(fd);
-        if (!file)
-                goto out;
-        error = vfs_statfs(&file->f_path, &tmp);
        if (!error)
                error = put_compat_statfs64(buf, &tmp);
-        fput(file);
-out:
        return error;
 }
@@ -2312,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 #endif /* CONFIG_TIMERFD */
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/fhandle.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag: @flags is handed to
+ * do_handle_open() unchanged.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+                             struct file_handle __user *handle, int flags)
+{
+        return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe928c0..a39fe47c466 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
        __releases(parent->d_lock)
        __releases(dentry->d_inode->i_lock)
 {
-        dentry->d_parent = NULL;
        list_del(&dentry->d_u.d_child);
+        /*
+         * Inform try_to_ascend() that we are no longer attached to the
+         * dentry tree
+         */
+        dentry->d_flags |= DCACHE_DISCONNECTED;
        if (parent)
                spin_unlock(&parent->d_lock);
        dentry_iput(dentry);
@@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
 }
 /*
+ * This tries to ascend one level of parenthood, but
+ * we can race with renaming, so we need to re-check
+ * the parenthood after dropping the lock and check
+ * that the sequence number still matches.
+ *
+ * old->d_lock is released here, so the caller is expected to hold
+ * it on entry. On success the parent of @old is returned with its
+ * d_lock held. NULL is returned, with the parent's d_lock released
+ * again, when the parent changed, when @old has been flagged
+ * DCACHE_DISCONNECTED, or when @locked is zero and
+ * read_seqretry(&rename_lock, @seq) reports a change.
+ */
+static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+{
+        struct dentry *new = old->d_parent;
+        rcu_read_lock();
+        spin_unlock(&old->d_lock);
+        spin_lock(&new->d_lock);
+        /*
+         * might go back up the wrong parent if we have had a rename
+         * or deletion
+         */
+        if (new != old->d_parent ||
+                 (old->d_flags & DCACHE_DISCONNECTED) ||
+                 (!locked && read_seqretry(&rename_lock, seq))) {
+                spin_unlock(&new->d_lock);
+                new = NULL;
+        }
+        rcu_read_unlock();
+        return new;
+}
+/*
 * Search for at least 1 mount point in the dentry's subdirs.
 * We descend to the next level whenever the d_subdirs
 * list is non-empty and continue searching.
@@ -1066,24 +1099,10 @@ resume:
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                struct dentry *tmp;
+                struct dentry *child = this_parent;
-                struct dentry *child;
+                this_parent = try_to_ascend(this_parent, locked, seq);
+                if (!this_parent)
-                tmp = this_parent->d_parent;
-                rcu_read_lock();
-                spin_unlock(&this_parent->d_lock);
-                child = this_parent;
-                this_parent = tmp;
-                spin_lock(&this_parent->d_lock);
-                /* might go back up the wrong parent if we have had a rename
-                 * or deletion */
-                if (this_parent != child->d_parent ||
-                         (!locked && read_seqretry(&rename_lock, seq))) {
-                        spin_unlock(&this_parent->d_lock);
-                        rcu_read_unlock();
                        goto rename_retry;
-                }
-                rcu_read_unlock();
                next = child->d_u.d_child.next;
                goto resume;
        }
@@ -1181,24 +1200,10 @@ resume:
         * All done at this level ... ascend and resume the search.
         */
        if (this_parent != parent) {
-                struct dentry *tmp;
+                struct dentry *child = this_parent;
-                struct dentry *child;
+                this_parent = try_to_ascend(this_parent, locked, seq);
+                if (!this_parent)
-                tmp = this_parent->d_parent;
-                rcu_read_lock();
-                spin_unlock(&this_parent->d_lock);
-                child = this_parent;
-                this_parent = tmp;
-                spin_lock(&this_parent->d_lock);
-                /* might go back up the wrong parent if we have had a rename
-                 * or deletion */
-                if (this_parent != child->d_parent ||
-                        (!locked && read_seqretry(&rename_lock, seq))) {
-                        spin_unlock(&this_parent->d_lock);
-                        rcu_read_unlock();
                        goto rename_retry;
-                }
-                rcu_read_unlock();
                next = child->d_u.d_child.next;
                goto resume;
        }
@@ -2942,28 +2947,14 @@ resume:
                spin_unlock(&dentry->d_lock);
        }
        if (this_parent != root) {
-                struct dentry *tmp;
+                struct dentry *child = this_parent;
-                struct dentry *child;
-                tmp = this_parent->d_parent;
                if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
                        this_parent->d_flags |= DCACHE_GENOCIDE;
                        this_parent->d_count--;
                }
-                rcu_read_lock();
+                this_parent = try_to_ascend(this_parent, locked, seq);
-                spin_unlock(&this_parent->d_lock);
+                if (!this_parent)
-                child = this_parent;
-                this_parent = tmp;
-                spin_lock(&this_parent->d_lock);
-                /* might go back up the wrong parent if we have had a rename
-                 * or deletion */
-                if (this_parent != child->d_parent ||
-                         (!locked && read_seqretry(&rename_lock, seq))) {
-                        spin_unlock(&this_parent->d_lock);
-                        rcu_read_unlock();
                        goto rename_retry;
-                }
-                rcu_read_unlock();
                next = child->d_u.d_child.next;
                goto resume;
        }
diff --git a/fs/exec.c b/fs/exec.c
index 52a447d9b6a..ba99e1abb1a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
        struct file *file;
        char *tmp = getname(library);
        int error = PTR_ERR(tmp);
+        static const struct open_flags uselib_flags = {
+                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+                .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+                .intent = LOOKUP_OPEN
+        };
        if (IS_ERR(tmp))
                goto out;
-        file = do_filp_open(AT_FDCWD, tmp,
+        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
-                                O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-                                MAY_READ | MAY_EXEC | MAY_OPEN);
        putname(tmp);
        error = PTR_ERR(file);
        if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
 {
        struct file *file;
        int err;
+        static const struct open_flags open_exec_flags = {
+                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+                .acc_mode = MAY_EXEC | MAY_OPEN,
+                .intent = LOOKUP_OPEN
+        };
-        file = do_filp_open(AT_FDCWD, name,
+        file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
-                                O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-                                MAY_EXEC | MAY_OPEN);
        if (IS_ERR(file))
                goto out;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd..b05acb79613 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
        struct inode * inode = dentry->d_inode;
        int len = *max_len;
        int type = FILEID_INO32_GEN;
-        
-        if (len < 2 || (connectable && len < 4))
+        if (connectable && (len < 4)) {
+                *max_len = 4;
+                return 255;
+        } else if (len < 2) {
+                *max_len = 2;
                return 255;
+        }
        len = 2;
        fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
        /*
         * Try to get any dentry for the given file handle from the filesystem.
         */
+        if (!nop || !nop->fh_to_dentry)
+                return ERR_PTR(-ESTALE);
        result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
        if (!result)
                result = ERR_PTR(-ESTALE);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 9dba3bd69d9..0521a007ae6 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
        dquot_initialize(dir);
-        /*
-         * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-         * otherwise has the potential to corrupt the orphan inode list.
-         */
-        if (inode->i_nlink == 0)
-                return -ENOENT;
 retry:
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f247..9cc19a1dea8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sb->s_qcop = &ext3_qctl_operations;
        sb->dq_op = &ext3_quota_operations;
 #endif
+        memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
        mutex_init(&sbi->s_orphan_lock);
        mutex_init(&sbi->s_resize_lock);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c..e781b7ea563 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
        dquot_initialize(dir);
-        /*
-         * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-         * otherwise has the potential to corrupt the orphan inode list.
-         */
-        if (inode->i_nlink == 0)
-                return -ENOENT;
 retry:
        handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f836b..203f9e4a70b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_qcop = &ext4_qctl_operations;
        sb->dq_op = &ext4_quota_operations;
 #endif
+        memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
        mutex_init(&sbi->s_orphan_lock);
        mutex_init(&sbi->s_resize_lock);
@@ -3509,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
 no_journal:
-        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+        /*
+         * The maximum number of concurrent works can be high and
+         * concurrency isn't really necessary.  Limit it to 1.
+         */
+        EXT4_SB(sb)->dio_unwritten_wq =
+                alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
                goto failed_mount_wq;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd..0e277ec4b61 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
        struct inode *inode =  de->d_inode;
        u32 ipos_h, ipos_m, ipos_l;
-        if (len < 5)
+        if (len < 5) {
+                *lenp = 5;
                return 255; /* no room */
+        }
        ipos_h = MSDOS_I(inode)->i_pos >> 8;
        ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb1026181bd..6c82e5bac03 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
        int ret = -EBADF;
-        struct file *file = fget(fildes);
+        struct file *file = fget_raw(fildes);
        if (file) {
                ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
        return err;
 }
+/*
+ * Returns 1 if @cmd is one of the fcntl commands listed in the
+ * switch below, 0 otherwise. The fcntl and fcntl64 entry points
+ * consult this for files that have FMODE_PATH set and fail with
+ * -EBADF when it returns 0.
+ */
+static int check_fcntl_cmd(unsigned cmd)
+{
+        switch (cmd) {
+        case F_DUPFD:
+        case F_DUPFD_CLOEXEC:
+        case F_GETFD:
+        case F_SETFD:
+        case F_GETFL:
+                return 1;
+        }
+        return 0;
+}
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {       
        struct file *filp;
        long err = -EBADF;
-        filp = fget(fd);
+        filp = fget_raw(fd);
        if (!filp)
                goto out;
+        if (unlikely(filp->f_mode & FMODE_PATH)) {
+                if (!check_fcntl_cmd(cmd)) {
+                        fput(filp);
+                        goto out;
+                }
+        }
        err = security_file_fcntl(filp, cmd, arg);
        if (err) {
                fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
        long err;
        err = -EBADF;
-        filp = fget(fd);
+        filp = fget_raw(fd);
        if (!filp)
                goto out;
+        if (unlikely(filp->f_mode & FMODE_PATH)) {
+                if (!check_fcntl_cmd(cmd)) {
+                        fput(filp);
+                        goto out;
+                }
+        }
        err = security_file_fcntl(filp, cmd, arg);
        if (err) {
                fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         */
-        BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+        BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
                O_RDONLY        | O_WRONLY      | O_RDWR        |
                O_CREAT         | O_EXCL        | O_NOCTTY      |
                O_TRUNC         | O_APPEND      | /* O_NONBLOCK | */
                __O_SYNC        | O_DSYNC       | FASYNC        |
                O_DIRECT        | O_LARGEFILE   | O_DIRECTORY   |
                O_NOFOLLOW      | O_NOATIME     | O_CLOEXEC     |
-                __FMODE_EXEC
+                __FMODE_EXEC    | O_PATH
                ));
        fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 00000000000..bf93ad2bee0
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,265 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+/*
+ * Encode a non-connectable file handle for @path and copy it, together
+ * with the id of the mount @path lives on, to user space.
+ *
+ * @ufh->handle_bytes tells how much room user space provided for the
+ * variable part of the handle. On return it holds the size the file
+ * system reported, which in the overflow case is the size needed.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the file system cannot decode
+ * handles, -EINVAL if the requested size exceeds MAX_HANDLE_SZ,
+ * -ENOMEM on allocation failure, -EOVERFLOW if the handle does not fit
+ * (only the fixed header is copied back), or -EFAULT if user memory
+ * cannot be accessed.
+ */
+static long do_sys_name_to_handle(struct path *path,
+                                  struct file_handle __user *ufh,
+                                  int __user *mnt_id)
+{
+        long retval;
+        struct file_handle f_handle;
+        int handle_dwords, handle_bytes;
+        struct file_handle *handle = NULL;
+        /*
+         * We need to make sure that the file system
+         * supports decoding of the file handle
+         */
+        if (!path->mnt->mnt_sb->s_export_op ||
+            !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+                return -EOPNOTSUPP;
+        if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+                return -EFAULT;
+        if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+                return -EINVAL;
+        /*
+         * Use a zeroed buffer: it is copied to user space below, and
+         * nothing here guarantees that the file system's encoder
+         * writes every byte of the size it reports.
+         */
+        handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+                         GFP_KERNEL);
+        if (!handle)
+                return -ENOMEM;
+        /* convert handle size to multiple of sizeof(u32) */
+        handle_dwords = f_handle.handle_bytes >> 2;
+        /* we ask for a non connected handle */
+        retval = exportfs_encode_fh(path->dentry,
+                                    (struct fid *)handle->f_handle,
+                                    &handle_dwords,  0);
+        handle->handle_type = retval;
+        /* convert handle size to bytes */
+        handle_bytes = handle_dwords * sizeof(u32);
+        handle->handle_bytes = handle_bytes;
+        if ((handle->handle_bytes > f_handle.handle_bytes) ||
+            (retval == 255) || (retval == -ENOSPC)) {
+                /* As per old exportfs_encode_fh documentation
+                 * we could return ENOSPC to indicate overflow
+                 * But file system returned 255 always. So handle
+                 * both the values
+                 */
+                /*
+                 * set the handle size to zero so we copy only
+                 * non variable part of the file_handle
+                 */
+                handle_bytes = 0;
+                retval = -EOVERFLOW;
+        } else
+                retval = 0;
+        /* copy the mount id */
+        if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+            copy_to_user(ufh, handle,
+                         sizeof(struct file_handle) + handle_bytes))
+                retval = -EFAULT;
+        kfree(handle);
+        return retval;
+}
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlink or not
+ *
+ * @handle->handle_bytes indicates the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ *
+ * Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in @flag;
+ * any other bit results in -EINVAL.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+                struct file_handle __user *, handle, int __user *, mnt_id,
+                int, flag)
+{
+        struct path path;
+        int lookup_flags;
+        int err;
+        if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+                return -EINVAL;
+        lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+        if (flag & AT_EMPTY_PATH)
+                lookup_flags |= LOOKUP_EMPTY;
+        err = user_path_at(dfd, name, lookup_flags, &path);
+        if (!err) {
+                err = do_sys_name_to_handle(&path, handle, mnt_id);
+                path_put(&path);
+        }
+        return err;
+}
+/*
+ * Return the vfsmount that @fd refers to, with a reference taken via
+ * mntget(). For AT_FDCWD this is the mount of the current working
+ * directory, read under fs->lock; otherwise it is the mount of the
+ * file looked up with fget_light(). Returns ERR_PTR(-EBADF) when
+ * fget_light() finds no file for @fd.
+ */
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+        struct path path;
+        if (fd == AT_FDCWD) {
+                struct fs_struct *fs = current->fs;
+                spin_lock(&fs->lock);
+                path = fs->pwd;
+                mntget(path.mnt);
+                spin_unlock(&fs->lock);
+        } else {
+                int fput_needed;
+                struct file *file = fget_light(fd, &fput_needed);
+                if (!file)
+                        return ERR_PTR(-EBADF);
+                path = file->f_path;
+                mntget(path.mnt);
+                fput_light(file, fput_needed);
+        }
+        return path.mnt;
+}
+/*
+ * Acceptance callback handed to exportfs_decode_fh() by
+ * do_handle_to_path(): every dentry is accepted, @context is unused.
+ */
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+        return 1;
+}
+/*
+ * Resolve the kernel copy of a file handle to a path.
+ *
+ * The mount is taken from @mountdirfd and the dentry is obtained by
+ * decoding @handle with exportfs_decode_fh(). On success 0 is
+ * returned and @path holds the mount and dentry. On failure the
+ * error from the failing step is returned and the mount reference,
+ * if one was taken, is dropped again.
+ */
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+                             struct path *path)
+{
+        int retval = 0;
+        int handle_dwords;
+        path->mnt = get_vfsmount_from_fd(mountdirfd);
+        if (IS_ERR(path->mnt)) {
+                retval = PTR_ERR(path->mnt);
+                goto out_err;
+        }
+        /* change the handle size to multiple of sizeof(u32) */
+        handle_dwords = handle->handle_bytes >> 2;
+        path->dentry = exportfs_decode_fh(path->mnt,
+                                          (struct fid *)handle->f_handle,
+                                          handle_dwords, handle->handle_type,
+                                          vfs_dentry_acceptable, NULL);
+        if (IS_ERR(path->dentry)) {
+                retval = PTR_ERR(path->dentry);
+                goto out_mnt;
+        }
+        return 0;
+out_mnt:
+        mntput(path->mnt);
+out_err:
+        return retval;
+}
+/*
+ * Copy a file handle in from user space and resolve it to a path.
+ *
+ * Returns 0 with @path filled in on success, -EPERM if the caller
+ * lacks CAP_DAC_READ_SEARCH, -EFAULT if the handle cannot be read,
+ * -EINVAL if handle_bytes is zero or exceeds MAX_HANDLE_SZ, -ENOMEM
+ * on allocation failure, or the error from do_handle_to_path().
+ */
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+                   struct path *path)
+{
+        int retval = 0;
+        struct file_handle f_handle;
+        struct file_handle *handle = NULL;
+        /*
+         * With handle we don't look at the execute bit on
+         * the directory. Ideally we would like CAP_DAC_SEARCH.
+         * But we don't have that
+         */
+        if (!capable(CAP_DAC_READ_SEARCH)) {
+                retval = -EPERM;
+                goto out_err;
+        }
+        if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+                retval = -EFAULT;
+                goto out_err;
+        }
+        if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+            (f_handle.handle_bytes == 0)) {
+                retval = -EINVAL;
+                goto out_err;
+        }
+        handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+                         GFP_KERNEL);
+        if (!handle) {
+                retval = -ENOMEM;
+                goto out_err;
+        }
+        /*
+         * Copy the full handle. The header comes from the copy that
+         * was validated above instead of being read from user space
+         * a second time; otherwise handle_bytes could be changed
+         * after the size check and end up larger than the buffer.
+         */
+        *handle = f_handle;
+        if (copy_from_user(&handle->f_handle,
+                           &ufh->f_handle,
+                           f_handle.handle_bytes)) {
+                retval = -EFAULT;
+                goto out_handle;
+        }
+        retval = do_handle_to_path(mountdirfd, handle, path);
+out_handle:
+        kfree(handle);
+out_err:
+        return retval;
+}
+/*
+ * Open the file described by the user-space handle @ufh, resolved
+ * against the mount referred to by @mountdirfd.
+ *
+ * Returns the new file descriptor on success. On failure returns the
+ * error from handle_to_path(), get_unused_fd_flags() or
+ * file_open_root(), whichever failed first.
+ */
+long do_handle_open(int mountdirfd,
+                    struct file_handle __user *ufh, int open_flag)
+{
+        long retval = 0;
+        struct path path;
+        struct file *file;
+        int fd;
+        retval = handle_to_path(mountdirfd, ufh, &path);
+        if (retval)
+                return retval;
+        fd = get_unused_fd_flags(open_flag);
+        if (fd < 0) {
+                path_put(&path);
+                return fd;
+        }
+        file = file_open_root(path.dentry, path.mnt, "", open_flag);
+        if (IS_ERR(file)) {
+                put_unused_fd(fd);
+                retval =  PTR_ERR(file);
+        } else {
+                retval = fd;
+                fsnotify_open(file);
+                fd_install(fd, file);
+        }
+        /* the path reference from handle_to_path() is dropped either way */
+        path_put(&path);
+        return retval;
+}
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flags: open flags.
+ *
+ * @mountdirfd indicates the directory file descriptor
+ * of the mount point. The file handle is decoded relative
+ * to the vfsmount pointed to by @mountdirfd. The @flags
+ * value is the same as the open(2) flags; O_LARGEFILE is
+ * added when force_o_largefile() is true.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+                struct file_handle __user *, handle,
+                int, flags)
+{
+        long ret;
+        if (force_o_largefile())
+                flags |= O_LARGEFILE;
+        ret = do_handle_open(mountdirfd, handle, flags);
+        return ret;
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index cbeec70ee31..bfab973c6c5 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -279,11 +279,10 @@ struct file *fget(unsigned int fd)
        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
-                if (!atomic_long_inc_not_zero(&file->f_count)) {
+                /* File object ref couldn't be taken */
-                        /* File object ref couldn't be taken */
+                if (file->f_mode & FMODE_PATH ||
-                        rcu_read_unlock();
+                    !atomic_long_inc_not_zero(&file->f_count))
-                        return NULL;
+                        file = NULL;
-                }
        }
        rcu_read_unlock();
@@ -292,6 +291,25 @@ struct file *fget(unsigned int fd)
 EXPORT_SYMBOL(fget);
+struct file *fget_raw(unsigned int fd)
+{
+        struct file *file;
+        struct files_struct *files = current->files;
+        rcu_read_lock();
+        file = fcheck_files(files, fd);
+        if (file) {
+                /* File object ref couldn't be taken */
+                if (!atomic_long_inc_not_zero(&file->f_count))
+                        file = NULL;
+        }
+        rcu_read_unlock();
+        return file;
+}
+EXPORT_SYMBOL(fget_raw);
 /*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
@@ -316,6 +334,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
        *fput_needed = 0;
        if (atomic_read(&files->count) == 1) {
                file = fcheck_files(files, fd);
+                if (file && (file->f_mode & FMODE_PATH))
+                        file = NULL;
+        } else {
+                rcu_read_lock();
+                file = fcheck_files(files, fd);
+                if (file) {
+                        if (!(file->f_mode & FMODE_PATH) &&
+                            atomic_long_inc_not_zero(&file->f_count))
+                                *fput_needed = 1;
+                        else
+                                /* O_PATH file, or the reference could not be taken */
+                                file = NULL;
+                }
+                rcu_read_unlock();
+        }
+        return file;
+}
+struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+{
+        struct file *file;
+        struct files_struct *files = current->files;
+        *fput_needed = 0;
+        if (atomic_read(&files->count) == 1) {
+                file = fcheck_files(files, fd);
        } else {
                rcu_read_lock();
                file = fcheck_files(files, fd);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd..051b1a08452 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        u64 nodeid;
        u32 generation;
-        if (*max_len < len)
+        if (*max_len < len) {
+                *max_len = len;
                return  255;
+        }
        nodeid = get_fuse_inode(inode)->nodeid;
        generation = inode->i_generation;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a..cbc07155b1a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
        struct posix_acl *acl;
        int error;
-        if (flags & IPERM_FLAG_RCU)
+        if (flags & IPERM_FLAG_RCU) {
-                return -ECHILD;
+                if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                        return -ECHILD;
+                return -EAGAIN;
+        }
        acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
        if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9..aad77e4f61b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
        if (error == 0)
                return 0;
+        unlock_page(page);
        page_cache_release(page);
        gfs2_trans_end(sdp);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef..ef3dc4b9fae 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
 #include "meta_io.h"
 #include "quota.h"
 #include "rgrp.h"
+#include "super.h"
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrp_list rlist;
        u64 bn, bstart;
-        u32 blen;
+        u32 blen, btotal;
        __be64 *p;
        unsigned int rg_blocks = 0;
        int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        bstart = 0;
        blen = 0;
+        btotal = 0;
        for (p = top; p < bottom; p++) {
                if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
                else {
                        if (bstart) {
                                if (metadata)
-                                        gfs2_free_meta(ip, bstart, blen);
+                                        __gfs2_free_meta(ip, bstart, blen);
                                else
-                                        gfs2_free_data(ip, bstart, blen);
+                                        __gfs2_free_data(ip, bstart, blen);
+                                btotal += blen;
                        }
                        bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        }
        if (bstart) {
                if (metadata)
-                        gfs2_free_meta(ip, bstart, blen);
+                        __gfs2_free_meta(ip, bstart, blen);
                else
-                        gfs2_free_data(ip, bstart, blen);
+                        __gfs2_free_data(ip, bstart, blen);
+                btotal += blen;
        }
+        gfs2_statfs_change(sdp, 0, +btotal, 0);
+        gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+                          ip->i_inode.i_gid);
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
        gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f..b5a5e60df0d 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
        struct super_block *sb = inode->i_sb;
        struct gfs2_inode *ip = GFS2_I(inode);
-        if (*len < GFS2_SMALL_FH_SIZE ||
+        if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
-            (connectable && *len < GFS2_LARGE_FH_SIZE))
+                *len = GFS2_LARGE_FH_SIZE;
                return 255;
+        } else if (*len < GFS2_SMALL_FH_SIZE) {
+                *len = GFS2_SMALL_FH_SIZE;
+                return 255;
+        }
        fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
        fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb91336..4074b952b05 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-        if (!(file->f_flags & O_NOATIME)) {
+        if (!(file->f_flags & O_NOATIME) &&
+            !IS_NOATIME(&ip->i_inode)) {
                struct gfs2_holder i_gh;
                int error;
-                gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+                gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                error = gfs2_glock_nq(&i_gh);
-                file_accessed(file);
+                if (error == 0) {
-                if (error == 0)
+                        file_accessed(file);
-                        gfs2_glock_dq_uninit(&i_gh);
+                        gfs2_glock_dq(&i_gh);
+                }
+                gfs2_holder_uninit(&i_gh);
+                if (error)
+                        return error;
        }
        vma->vm_ops = &gfs2_vm_ops;
        vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
 {
        struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-        page_zero_new_buffers(page, from, to);
+        zero_user(page, from, to-from);
-        flush_dcache_page(page);
        mark_page_accessed(page);
        if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
        block_commit_write(page, from, to);
 }
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int needs_empty_write(sector_t block, struct inode *inode)
 {
-        unsigned start, end, next;
-        struct buffer_head *bh, *head;
        int error;
+        struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-        if (!page_has_buffers(page)) {
+        bh_map.b_size = 1 << inode->i_blkbits;
-                error = __block_write_begin(page, from, to - from, gfs2_block_map);
+        error = gfs2_block_map(inode, block, &bh_map, 0);
-                if (unlikely(error))
+        if (unlikely(error))
-                        return error;
+                return error;
+        return !buffer_mapped(&bh_map);
+}
-                empty_write_end(page, from, to);
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
-                return 0;
+{
-        }
+        struct inode *inode = page->mapping->host;
+        unsigned start, end, next, blksize;
+        sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        int ret;
-        bh = head = page_buffers(page);
+        blksize = 1 << inode->i_blkbits;
        next = end = 0;
        while (next < from) {
-                next += bh->b_size;
+                next += blksize;
-                bh = bh->b_this_page;
+                block++;
        }
        start = next;
        do {
-                next += bh->b_size;
+                next += blksize;
-                if (buffer_mapped(bh)) {
+                ret = needs_empty_write(block, inode);
+                if (unlikely(ret < 0))
+                        return ret;
+                if (ret == 0) {
                        if (end) {
-                                error = __block_write_begin(page, start, end - start,
+                                ret = __block_write_begin(page, start, end - start,
-                                                            gfs2_block_map);
+                                                          gfs2_block_map);
-                                if (unlikely(error))
+                                if (unlikely(ret))
-                                        return error;
+                                        return ret;
                                empty_write_end(page, start, end);
                                end = 0;
                        }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
                }
                else
                        end = next;
-                bh = bh->b_this_page;
+                block++;
        } while (next < to);
        if (end) {
-                error = __block_write_begin(page, start, end - start, gfs2_block_map);
+                ret = __block_write_begin(page, start, end - start, gfs2_block_map);
-                if (unlikely(error))
+                if (unlikely(ret))
-                        return error;
+                        return ret;
                empty_write_end(page, start, end);
        }
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
        mutex_lock(&fp->f_fl_mutex);
        flock_lock_file_wait(file, fl);
-        if (fl_gh->gh_gl)
+        if (fl_gh->gh_gl) {
-                gfs2_glock_dq_uninit(fl_gh);
+                gfs2_glock_dq_wait(fl_gh);
+                gfs2_holder_uninit(fl_gh);
+        }
        mutex_unlock(&fp->f_fl_mutex);
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7cd9a5a68d5..e2431313491 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
 #include <linux/freezer.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -41,10 +44,6 @@
 #define CREATE_TRACE_POINTS
 #include "trace_gfs2.h"
-struct gfs2_gl_hash_bucket {
-        struct hlist_head hb_list;
-};
 struct gfs2_glock_iter {
        int hash;                       /* hash bucket index         */
        struct gfs2_sbd *sdp;           /* incore superblock         */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
-static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
 static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
-static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
 static struct dentry *gfs2_root;
-/*
- * Despite what you might think, the numbers below are not arbitrary :-)
- * They are taken from the ipv4 routing hash code, which is well tested
- * and thus should be nearly optimal. Later on we might tweek the numbers
- * but for now this should be fine.
- *
- * The reason for putting the locks in a separate array from the list heads
- * is that we can have fewer locks than list heads and save memory. We use
- * the same hash function for both, but with a different hash mask.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-        defined(CONFIG_PROVE_LOCKING)
-#ifdef CONFIG_LOCKDEP
-# define GL_HASH_LOCK_SZ        256
-#else
-# if NR_CPUS >= 32
-#  define GL_HASH_LOCK_SZ       4096
-# elif NR_CPUS >= 16
-#  define GL_HASH_LOCK_SZ       2048
-# elif NR_CPUS >= 8
-#  define GL_HASH_LOCK_SZ       1024
-# elif NR_CPUS >= 4
-#  define GL_HASH_LOCK_SZ       512
-# else
-#  define GL_HASH_LOCK_SZ       256
-# endif
-#endif
-/* We never want more locks than chains */
-#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
-# undef GL_HASH_LOCK_SZ
-# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
-#endif
-static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
-static inline rwlock_t *gl_lock_addr(unsigned int x)
-{
-        return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
-}
-#else /* not SMP, so no spinlocks required */
-static inline rwlock_t *gl_lock_addr(unsigned int x)
-{
-        return NULL;
-}
-#endif
 /**
 * gl_hash() - Turn glock number into hash bucket number
 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
        return h;
 }
-/**
+static inline void spin_lock_bucket(unsigned int hash)
- * glock_free() - Perform a few checks and then release struct gfs2_glock
+{
- * @gl: The glock to release
+        struct hlist_bl_head *bl = &gl_hash_table[hash];
- *
+        bit_spin_lock(0, (unsigned long *)bl);
- * Also calls lock module to release its internal structure for this glock.
+}
- *
- */
-static void glock_free(struct gfs2_glock *gl)
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+        struct hlist_bl_head *bl = &gl_hash_table[hash];
+        __bit_spin_unlock(0, (unsigned long *)bl);
+}
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
+{
+        struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+        if (gl->gl_ops->go_flags & GLOF_ASPACE)
+                kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+        else
+                kmem_cache_free(gfs2_glock_cachep, gl);
+}
+void gfs2_glock_free(struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        struct address_space *mapping = gfs2_glock2aspace(gl);
-        struct kmem_cache *cachep = gfs2_glock_cachep;
-        GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+        call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
-        trace_gfs2_glock_put(gl);
+        if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-        if (mapping)
+                wake_up(&sdp->sd_glock_wait);
-                cachep = gfs2_glock_aspace_cachep;
-        sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
 }
 /**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
+        /* assert_spin_locked(&gl->gl_spin); */
        if (gl->gl_state == LM_ST_UNLOCKED)
                return 0;
-        if (!list_empty(&gl->gl_holders))
+        if (test_bit(GLF_LFLUSH, &gl->gl_flags))
+                return 0;
+        if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
+            !list_empty(&gl->gl_holders))
                return 0;
        if (glops->go_demote_ok)
                return glops->go_demote_ok(gl);
        return 1;
 }
 /**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
+ * If the glock is demotable, then we add it (or move it) to the end
+ * of the glock LRU list.
 */
-static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
-        int may_reclaim;
+        if (demote_ok(gl)) {
-        may_reclaim = (demote_ok(gl) &&
+                spin_lock(&lru_lock);
-                       (atomic_read(&gl->gl_ref) == 1 ||
-                        (gl->gl_name.ln_type == LM_TYPE_INODE &&
+                if (!list_empty(&gl->gl_lru))
-                         atomic_read(&gl->gl_ref) <= 2)));
+                        list_del_init(&gl->gl_lru);
-        spin_lock(&lru_lock);
+                else
-        if (list_empty(&gl->gl_lru) && may_reclaim) {
+                        atomic_inc(&lru_count);
                list_add_tail(&gl->gl_lru, &lru_list);
-                atomic_inc(&lru_count);
+                spin_unlock(&lru_lock);
        }
-        spin_unlock(&lru_lock);
+}
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+        spin_lock(&gl->gl_spin);
+        __gfs2_glock_schedule_for_reclaim(gl);
+        spin_unlock(&gl->gl_spin);
 }
 /**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
 {
        if (atomic_dec_and_test(&gl->gl_ref))
                GLOCK_BUG_ON(gl, 1);
-        gfs2_glock_schedule_for_reclaim(gl);
 }
 /**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
 *
 */
-int gfs2_glock_put(struct gfs2_glock *gl)
+void gfs2_glock_put(struct gfs2_glock *gl)
 {
-        int rv = 0;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct address_space *mapping = gfs2_glock2aspace(gl);
-        write_lock(gl_lock_addr(gl->gl_hash));
+        if (atomic_dec_and_test(&gl->gl_ref)) {
-        if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
+                spin_lock_bucket(gl->gl_hash);
-                hlist_del(&gl->gl_list);
+                hlist_bl_del_rcu(&gl->gl_list);
+                spin_unlock_bucket(gl->gl_hash);
+                spin_lock(&lru_lock);
                if (!list_empty(&gl->gl_lru)) {
                        list_del_init(&gl->gl_lru);
                        atomic_dec(&lru_count);
                }
                spin_unlock(&lru_lock);
-                write_unlock(gl_lock_addr(gl->gl_hash));
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-                glock_free(gl);
+                GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
-                rv = 1;
+                trace_gfs2_glock_put(gl);
-                goto out;
+                sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
        }
-        spin_lock(&gl->gl_spin);
-        gfs2_glock_schedule_for_reclaim(gl);
-        spin_unlock(&gl->gl_spin);
-        write_unlock(gl_lock_addr(gl->gl_hash));
-out:
-        return rv;
 }
 /**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
                                        const struct lm_lockname *name)
 {
        struct gfs2_glock *gl;
-        struct hlist_node *h;
+        struct hlist_bl_node *h;
-        hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
+        hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
                if (!lm_name_equal(&gl->gl_name, name))
                        continue;
                if (gl->gl_sbd != sdp)
                        continue;
+                if (atomic_inc_not_zero(&gl->gl_ref))
-                atomic_inc(&gl->gl_ref);
+                        return gl;
-                return gl;
        }
        return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        struct gfs2_glock *gl, *tmp;
        unsigned int hash = gl_hash(sdp, &name);
        struct address_space *mapping;
+        struct kmem_cache *cachep;
-        read_lock(gl_lock_addr(hash));
+        rcu_read_lock();
        gl = search_bucket(hash, sdp, &name);
-        read_unlock(gl_lock_addr(hash));
+        rcu_read_unlock();
        *glp = gl;
        if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                return -ENOENT;
        if (glops->go_flags & GLOF_ASPACE)
-                gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
+                cachep = gfs2_glock_aspace_cachep;
        else
-                gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+                cachep = gfs2_glock_cachep;
+        gl = kmem_cache_alloc(cachep, GFP_KERNEL);
        if (!gl)
                return -ENOMEM;
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                mapping->writeback_index = 0;
        }
-        write_lock(gl_lock_addr(hash));
+        spin_lock_bucket(hash);
        tmp = search_bucket(hash, sdp, &name);
        if (tmp) {
-                write_unlock(gl_lock_addr(hash));
+                spin_unlock_bucket(hash);
-                glock_free(gl);
+                kmem_cache_free(cachep, gl);
+                atomic_dec(&sdp->sd_glock_disposal);
                gl = tmp;
        } else {
-                hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
+                hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
-                write_unlock(gl_lock_addr(hash));
+                spin_unlock_bucket(hash);
        }
        *glp = gl;
@@ -1007,13 +978,13 @@ fail:
                        insert_pt = &gh2->gh_list;
        }
        set_bit(GLF_QUEUED, &gl->gl_flags);
+        trace_gfs2_glock_queue(gh, 1);
        if (likely(insert_pt == NULL)) {
                list_add_tail(&gh->gh_list, &gl->gl_holders);
                if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
                        goto do_cancel;
                return;
        }
-        trace_gfs2_glock_queue(gh, 1);
        list_add_tail(&gh->gh_list, insert_pt);
 do_cancel:
        gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
                    !test_bit(GLF_DEMOTE, &gl->gl_flags))
                        fast_path = 1;
        }
+        __gfs2_glock_schedule_for_reclaim(gl);
        trace_gfs2_glock_queue(gh, 0);
        spin_unlock(&gl->gl_spin);
        if (likely(fast_path))
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 {
-        unsigned int x;
+        while (num_gh--)
+                gfs2_glock_dq(&ghs[num_gh]);
-        for (x = 0; x < num_gh; x++)
-                gfs2_glock_dq(&ghs[x]);
 }
 /**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 {
-        unsigned int x;
+        while (num_gh--)
+                gfs2_glock_dq_uninit(&ghs[num_gh]);
-        for (x = 0; x < num_gh; x++)
-                gfs2_glock_dq_uninit(&ghs[x]);
 }
 void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
 * @sdp: the filesystem
 * @bucket: the bucket
 *
- * Returns: 1 if the bucket has entries
 */
-static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
                          unsigned int hash)
 {
-        struct gfs2_glock *gl, *prev = NULL;
+        struct gfs2_glock *gl;
-        int has_entries = 0;
+        struct hlist_bl_head *head = &gl_hash_table[hash];
-        struct hlist_head *head = &gl_hash_table[hash].hb_list;
+        struct hlist_bl_node *pos;
-        read_lock(gl_lock_addr(hash));
+        rcu_read_lock();
-        /* Can't use hlist_for_each_entry - don't want prefetch here */
+        hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-        if (hlist_empty(head))
+                if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
-                goto out;
-        gl = list_entry(head->first, struct gfs2_glock, gl_list);
-        while(1) {
-                if (!sdp || gl->gl_sbd == sdp) {
-                        gfs2_glock_hold(gl);
-                        read_unlock(gl_lock_addr(hash));
-                        if (prev)
-                                gfs2_glock_put(prev);
-                        prev = gl;
                        examiner(gl);
-                        has_entries = 1;
-                        read_lock(gl_lock_addr(hash));
-                }
-                if (gl->gl_list.next == NULL)
-                        break;
-                gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
        }
-out:
+        rcu_read_unlock();
-        read_unlock(gl_lock_addr(hash));
-        if (prev)
-                gfs2_glock_put(prev);
        cond_resched();
-        return has_entries;
+}
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
+{
+        unsigned x;
+        for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+                examine_bucket(examiner, sdp, x);
 }
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
 void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 {
-        unsigned x;
+        glock_hash_walk(thaw_glock, sdp);
+}
-        for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
-                examine_bucket(thaw_glock, sdp, x);
+{
+        int ret;
+        spin_lock(&gl->gl_spin);
+        ret = __dump_glock(seq, gl);
+        spin_unlock(&gl->gl_spin);
+        return ret;
+}
+static void dump_glock_func(struct gfs2_glock *gl)
+{
+        dump_glock(NULL, gl);
 }
 /**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-        unsigned int x;
+        glock_hash_walk(clear_glock, sdp);
-        for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-                examine_bucket(clear_glock, sdp, x);
        flush_workqueue(glock_workqueue);
        wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
-        gfs2_dump_lockstate(sdp);
+        glock_hash_walk(dump_glock_func, sdp);
 }
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,66 +1681,15 @@ out:
        return error;
 }
-static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
-{
-        int ret;
-        spin_lock(&gl->gl_spin);
-        ret = __dump_glock(seq, gl);
-        spin_unlock(&gl->gl_spin);
-        return ret;
-}
-/**
- * gfs2_dump_lockstate - print out the current lockstate
- * @sdp: the filesystem
- * @ub: the buffer to copy the information into
- *
- * If @ub is NULL, dump the lockstate to the console.
- *
- */
-static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
-{
-        struct gfs2_glock *gl;
-        struct hlist_node *h;
-        unsigned int x;
-        int error = 0;
-        for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
-                read_lock(gl_lock_addr(x));
-                hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
-                        if (gl->gl_sbd != sdp)
-                                continue;
-                        error = dump_glock(NULL, gl);
-                        if (error)
-                                break;
-                }
-                read_unlock(gl_lock_addr(x));
-                if (error)
-                        break;
-        }
-        return error;
-}
 int __init gfs2_glock_init(void)
 {
        unsigned i;
        for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
-                INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
+                INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
-        }
-#ifdef GL_HASH_LOCK_SZ
-        for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
-                rwlock_init(&gl_hash_locks[i]);
        }
-#endif
        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZABLE, 0);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
        destroy_workqueue(gfs2_delete_workqueue);
 }
+static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+{
+        return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
+                              struct gfs2_glock, gl_list);
+}
+static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
+{
+        return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
+                              struct gfs2_glock, gl_list);
+}
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
        struct gfs2_glock *gl;
-restart:
+        do {
-        read_lock(gl_lock_addr(gi->hash));
+                gl = gi->gl;
-        gl = gi->gl;
+                if (gl) {
-        if (gl) {
+                        gi->gl = glock_hash_next(gl);
-                gi->gl = hlist_entry(gl->gl_list.next,
+                } else {
-                                     struct gfs2_glock, gl_list);
+                        gi->gl = glock_hash_chain(gi->hash);
-        } else {
+                }
-                gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+                while (gi->gl == NULL) {
-                                     struct gfs2_glock, gl_list);
+                        gi->hash++;
-        }
+                        if (gi->hash >= GFS2_GL_HASH_SIZE) {
-        if (gi->gl)
+                                rcu_read_unlock();
-                gfs2_glock_hold(gi->gl);
+                                return 1;
-        read_unlock(gl_lock_addr(gi->hash));
+                        }
-        if (gl)
+                        gi->gl = glock_hash_chain(gi->hash);
-                gfs2_glock_put(gl);
+                }
-        while (gi->gl == NULL) {
+        /* Skip entries for other sb and dead entries */
-                gi->hash++;
+        } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
-                if (gi->hash >= GFS2_GL_HASH_SIZE)
-                        return 1;
-                read_lock(gl_lock_addr(gi->hash));
-                gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
-                                     struct gfs2_glock, gl_list);
-                if (gi->gl)
-                        gfs2_glock_hold(gi->gl);
-                read_unlock(gl_lock_addr(gi->hash));
-        }
-        if (gi->sdp != gi->gl->gl_sbd)
-                goto restart;
        return 0;
 }
-static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
-{
-        if (gi->gl)
-                gfs2_glock_put(gi->gl);
-        gi->gl = NULL;
-}
 static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
        struct gfs2_glock_iter *gi = seq->private;
        loff_t n = *pos;
        gi->hash = 0;
+        rcu_read_lock();
        do {
-                if (gfs2_glock_iter_next(gi)) {
+                if (gfs2_glock_iter_next(gi))
-                        gfs2_glock_iter_free(gi);
                        return NULL;
-                }
        } while (n--);
        return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
        (*pos)++;
-        if (gfs2_glock_iter_next(gi)) {
+        if (gfs2_glock_iter_next(gi))
-                gfs2_glock_iter_free(gi);
                return NULL;
-        }
        return gi->gl;
 }
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
 static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
        struct gfs2_glock_iter *gi = seq->private;
-        gfs2_glock_iter_free(gi);
+        if (gi->gl)
+                rcu_read_unlock();
+        gi->gl = NULL;
 }
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb61..aea160690e9 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
        int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
        void (*lm_unmount) (struct gfs2_sbd *sdp);
        void (*lm_withdraw) (struct gfs2_sbd *sdp);
-        void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
+        void (*lm_put_lock) (struct gfs2_glock *gl);
        int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
                        unsigned int flags);
        void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
                   int create, struct gfs2_glock **glp);
 void gfs2_glock_hold(struct gfs2_glock *gl);
 void gfs2_glock_put_nolock(struct gfs2_glock *gl);
-int gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
                      struct gfs2_holder *gh);
 void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
        return error;
 }
-/*  Lock Value Block functions  */
+extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-int gfs2_lvb_hold(struct gfs2_glock *gl);
+extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_lvb_unhold(struct gfs2_glock *gl);
+extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
-void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+extern void gfs2_glock_free(struct gfs2_glock *gl);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
+extern int __init gfs2_glock_init(void);
-void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+extern void gfs2_glock_exit(void);
-int __init gfs2_glock_init(void);
+extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
-void gfs2_glock_exit(void);
+extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+extern int gfs2_register_debugfs(void);
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_unregister_debugfs(void);
-void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-int gfs2_register_debugfs(void);
-void gfs2_unregister_debugfs(void);
 extern const struct lm_lockops gfs2_dlm_ops;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a5..3754e3cbf02 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
        BUG_ON(current->journal_info);
        current->journal_info = &tr;
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata,
                                bd_ail_gl_list);
                bh = bd->bd_bh;
                gfs2_remove_from_ail(bd);
+                spin_unlock(&sdp->sd_ail_lock);
                bd->bd_bh = NULL;
                bh->b_private = NULL;
                bd->bd_blkno = bh->b_blocknr;
+                gfs2_log_lock(sdp);
                gfs2_assert_withdraw(sdp, !buffer_busy(bh));
                gfs2_trans_add_revoke(sdp, bd);
+                gfs2_log_unlock(sdp);
+                spin_lock(&sdp->sd_ail_lock);
        }
        gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
-        gfs2_log_unlock(sdp);
+        spin_unlock(&sdp->sd_ail_lock);
        gfs2_trans_end(sdp);
        gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
+        struct gfs2_holder *gh;
        if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
                return 0;
+        if (!list_empty(&gl->gl_holders)) {
+                gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+                if (gh->gh_list.next != &gl->gl_holders)
+                        return 0;
+        }
        return 1;
 }
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 }
 /**
- * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
-{
-        const struct address_space *mapping = (const struct address_space *)(gl + 1);
-        return !mapping->nrpages;
-}
-/**
 * rgrp_go_lock - operation done after an rgrp lock is locked by
 *    a first holder on this node.
 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
        .go_xmote_th = rgrp_go_sync,
        .go_inval = rgrp_go_inval,
-        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
        .go_unlock = rgrp_go_unlock,
        .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c0627..870a89d6d4d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
 #include <linux/workqueue.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
 #define DIO_WAIT        0x00000010
 #define DIO_METADATA    0x00000020
@@ -201,7 +203,7 @@ enum {
 };
 struct gfs2_glock {
-        struct hlist_node gl_list;
+        struct hlist_bl_node gl_list;
        unsigned long gl_flags;         /* GLF_... */
        struct lm_lockname gl_name;
        atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
        atomic_t gl_ail_count;
        struct delayed_work gl_work;
        struct work_struct gl_delete;
+        struct rcu_head gl_rcu;
 };
 #define GFS2_MIN_LVB_SIZE 32    /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
        QDF_USER                = 0,
        QDF_CHANGE              = 1,
        QDF_LOCKED              = 2,
+        QDF_REFRESH             = 3,
 };
 struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
        unsigned int sd_log_flush_head;
        u64 sd_log_flush_wrapped;
+        spinlock_t sd_ail_lock;
        struct list_head sd_ail1_list;
        struct list_head sd_ail2_list;
        u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f..98c80d8c2a6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
 {
        struct gfs2_glock *gl = arg;
        unsigned ret = gl->gl_state;
-        struct gfs2_sbd *sdp = gl->gl_sbd;
        BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
        switch (gl->gl_lksb.sb_status) {
        case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-                if (gl->gl_ops->go_flags & GLOF_ASPACE)
+                gfs2_glock_free(gl);
-                        kmem_cache_free(gfs2_glock_aspace_cachep, gl);
-                else
-                        kmem_cache_free(gfs2_glock_cachep, gl);
-                if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-                        wake_up(&sdp->sd_glock_wait);
                return;
        case -DLM_ECANCEL: /* Cancel while getting lock */
                ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
 }
-static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
+static void gdlm_put_lock(struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
        int error;
        if (gl->gl_lksb.sb_lkid == 0) {
-                kmem_cache_free(cachep, gl);
+                gfs2_glock_free(gl);
-                if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-                        wake_up(&sdp->sd_glock_wait);
                return;
        }
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e1..e7ed31f858d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 * @mapping: The associated mapping (maybe NULL)
 * @bd: The gfs2_bufdata to remove
 *
- * The log lock _must_ be held when calling this function
+ * The ail lock _must_ be held when calling this function
 *
 */
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 */
 static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-__releases(&sdp->sd_log_lock)
+__releases(&sdp->sd_ail_lock)
-__acquires(&sdp->sd_log_lock)
+__acquires(&sdp->sd_ail_lock)
 {
        struct gfs2_bufdata *bd, *s;
        struct buffer_head *bh;
@@ -117,7 +117,7 @@ __acquires(&sdp->sd_log_lock)
                        list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
                        get_bh(bh);
-                        gfs2_log_unlock(sdp);
+                        spin_unlock(&sdp->sd_ail_lock);
                        lock_buffer(bh);
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
@@ -126,7 +126,7 @@ __acquires(&sdp->sd_log_lock)
                                unlock_buffer(bh);
                                brelse(bh);
                        }
-                        gfs2_log_lock(sdp);
+                        spin_lock(&sdp->sd_ail_lock);
                        retry = 1;
                        break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
        struct gfs2_ail *ai;
        int done = 0;
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        head = &sdp->sd_ail1_list;
        if (list_empty(head)) {
-                gfs2_log_unlock(sdp);
+                spin_unlock(&sdp->sd_ail_lock);
                return;
        }
        sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
                        if (ai->ai_sync_gen >= sync_gen)
                                continue;
                        ai->ai_sync_gen = sync_gen;
-                        gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+                        gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
                        done = 0;
                        break;
                }
        }
-        gfs2_log_unlock(sdp);
+        spin_unlock(&sdp->sd_ail_lock);
 }
 static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
        struct gfs2_ail *ai, *s;
        int ret;
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
                if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
        ret = list_empty(&sdp->sd_ail1_list);
-        gfs2_log_unlock(sdp);
+        spin_unlock(&sdp->sd_ail_lock);
        return ret;
 }
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
        int wrap = (new_tail < old_tail);
        int a, b, rm;
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
                a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
                kfree(ai);
        }
-        gfs2_log_unlock(sdp);
+        spin_unlock(&sdp->sd_ail_lock);
 }
 /**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
        struct gfs2_ail *ai;
        unsigned int tail;
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        if (list_empty(&sdp->sd_ail1_list)) {
                tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
                tail = ai->ai_first;
        }
-        gfs2_log_unlock(sdp);
+        spin_unlock(&sdp->sd_ail_lock);
        return tail;
 }
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
        sdp->sd_log_commited_databuf = 0;
        sdp->sd_log_commited_revoke = 0;
+        spin_lock(&sdp->sd_ail_lock);
        if (!list_empty(&ai->ai_ail1_list)) {
                list_add(&ai->ai_list, &sdp->sd_ail1_list);
                ai = NULL;
        }
+        spin_unlock(&sdp->sd_ail_lock);
        gfs2_log_unlock(sdp);
        trace_gfs2_log_flush(sdp, 0);
        up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058..e919abf25ec 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
        /* If this buffer is in the AIL and it has already been written
         * to in-place disk block, remove it from the AIL.
         */
+        spin_lock(&sdp->sd_ail_lock);
        if (bd->bd_ail)
                list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+        spin_unlock(&sdp->sd_ail_lock);
        get_bh(bh);
        atomic_inc(&sdp->sd_log_pinned);
        trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
        mark_buffer_dirty(bh);
        clear_buffer_pinned(bh);
-        gfs2_log_lock(sdp);
+        spin_lock(&sdp->sd_ail_lock);
        if (bd->bd_ail) {
                list_del(&bd->bd_ail_st_list);
                brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
        }
        bd->bd_ail = ai;
        list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
-        clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+        spin_unlock(&sdp->sd_ail_lock);
+        if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
+                gfs2_glock_schedule_for_reclaim(bd->bd_gl);
        trace_gfs2_pin(bd, 0);
-        gfs2_log_unlock(sdp);
        unlock_buffer(bh);
        atomic_dec(&sdp->sd_log_pinned);
 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 72c31a315d9..888a5f5a1a5 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
 #include <asm/atomic.h>
 #include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
 {
        struct gfs2_glock *gl = foo;
-        INIT_HLIST_NODE(&gl->gl_list);
+        INIT_HLIST_BL_NODE(&gl->gl_list);
        spin_lock_init(&gl->gl_spin);
        INIT_LIST_HEAD(&gl->gl_holders);
        INIT_LIST_HEAD(&gl->gl_lru);
@@ -191,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
        unregister_filesystem(&gfs2meta_fs_type);
        destroy_workqueue(gfs_recovery_wq);
+        rcu_barrier();
        kmem_cache_destroy(gfs2_quotad_cachep);
        kmem_cache_destroy(gfs2_rgrpd_cachep);
        kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f..01d97f48655 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -326,6 +326,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
                brelse(bh);
        }
        if (bd) {
+                spin_lock(&sdp->sd_ail_lock);
                if (bd->bd_ail) {
                        gfs2_remove_from_ail(bd);
                        bh->b_private = NULL;
@@ -333,6 +334,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
                        bd->bd_blkno = bh->b_blocknr;
                        gfs2_trans_add_revoke(sdp, bd);
                }
+                spin_unlock(&sdp->sd_ail_lock);
        }
        clear_buffer_dirty(bh);
        clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f7..42ef24355af 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        init_waitqueue_head(&sdp->sd_log_waitq);
        init_waitqueue_head(&sdp->sd_logd_waitq);
+        spin_lock_init(&sdp->sd_ail_lock);
        INIT_LIST_HEAD(&sdp->sd_ail1_list);
        INIT_LIST_HEAD(&sdp->sd_ail2_list);
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
        { Opt_err, NULL },
 };
-static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        kmem_cache_free(cachep, gl);
-        if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-                wake_up(&sdp->sd_glock_wait);
-}
 static const struct lm_lockops nolock_ops = {
        .lm_proto_name = "lock_nolock",
-        .lm_put_lock = nolock_put_lock,
+        .lm_put_lock = gfs2_glock_free,
        .lm_tokens = &nolock_tokens,
 };
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20..09e436a5072 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 /**
 * gfs2_permission -
- * @inode:
+ * @inode: The inode
- * @mask:
+ * @mask: The mask to be tested
- * @nd: passed from Linux VFS, ignored by us
+ * @flags: Indicates whether this is an RCU path walk or not
 *
 * This may be called from the VFS directly, or from within GFS2 with the
 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
        int error;
        int unlock = 0;
-        if (flags & IPERM_FLAG_RCU)
-                return -ECHILD;
        ip = GFS2_I(inode);
        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+                if (flags & IPERM_FLAG_RCU)
+                        return -ECHILD;
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
                        return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963d..e23d9864c41 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                        goto out_end_trans;
                do_qc(qd, -qd->qd_change_sync);
+                set_bit(QDF_REFRESH, &qd->qd_flags);
        }
        error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
+        struct gfs2_quota_data *qd;
        unsigned int x;
        int error = 0;
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
             sort_qd, NULL);
        for (x = 0; x < al->al_qd_num; x++) {
-                error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
+                int force = NO_FORCE;
+                qd = al->al_qd[x];
+                if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+                        force = FORCE;
+                error = do_glock(qd, force, &al->al_qd_ghs[x]);
                if (error)
                        break;
        }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        offset = qd2offset(qd);
        alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
+        if (gfs2_is_stuffed(ip))
+                alloc_required = 1;
        if (alloc_required) {
                al = gfs2_alloc_get(ip);
                if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                blocks += gfs2_rg_blocks(al);
        }
-        error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+        /* Some quotas span block boundaries and can update two blocks,
+           so add an extra block to the transaction to handle such quotas */
+        error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
        if (error)
                goto out_release;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020..cf930cd9664 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
 *
 */
-void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
+}
+/**
+ * gfs2_free_data - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        __gfs2_free_data(ip, bstart, blen);
        gfs2_statfs_change(sdp, 0, +blen, 0);
        gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
 }
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 *
 */
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
+        gfs2_meta_wipe(ip, bstart, blen);
+}
+/**
+ * gfs2_free_meta - free a contiguous run of metadata block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        __gfs2_free_meta(ip, bstart, blen);
        gfs2_statfs_change(sdp, 0, +blen, 0);
        gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
-        gfs2_meta_wipe(ip, bstart, blen);
 }
 void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369..a80e3034ac4 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
+extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0647d80accf..9910c039f02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly;
 DEFINE_SPINLOCK(inode_lock);
 /*
- * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * iprune_sem provides exclusion between the icache shrinking and the
- * icache shrinking path, and the umount path.  Without this exclusion,
+ * umount path.
- * by the time prune_icache calls iput for the inode whose pages it has
- * been invalidating, or by the time it calls clear_inode & destroy_inode
- * from its final dispose_list, the struct super_block they refer to
- * (for inode->i_sb->s_op) may already have been freed and reused.
 *
- * We make this an rwsem because the fastpath is icache shrinking. In
+ * We don't actually need it to protect anything in the umount path,
- * some cases a filesystem may be doing a significant amount of work in
+ * but only need to cycle through it to make sure any inode that
- * its inode reclaim code, so this should improve parallelism.
+ * prune_icache took off the LRU list has been fully torn down by the
+ * time we are past evict_inodes.
 */
 static DECLARE_RWSEM(iprune_sem);
@@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb)
        struct inode *inode, *next;
        LIST_HEAD(dispose);
-        down_write(&iprune_sem);
        spin_lock(&inode_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                if (atomic_read(&inode->i_count))
                        continue;
+                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
-                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-                        WARN_ON(1);
                        continue;
-                }
                inode->i_state |= I_FREEING;
@@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb)
        spin_unlock(&inode_lock);
        dispose_list(&dispose);
+        /*
+         * Cycle through iprune_sem to make sure any inode that prune_icache
+         * moved off the list before we took the lock has been fully torn
+         * down.
+         */
+        down_write(&iprune_sem);
        up_write(&iprune_sem);
 }
@@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
        struct inode *inode, *next;
        LIST_HEAD(dispose);
-        down_write(&iprune_sem);
        spin_lock(&inode_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
@@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
        spin_unlock(&inode_lock);
        dispose_list(&dispose);
-        up_write(&iprune_sem);
        return busy;
 }
diff --git a/fs/internal.h b/fs/internal.h
index 9b976b57d7f..f3d15de44b1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,6 +106,19 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+struct open_flags {
+        int open_flag;
+        int mode;
+        int acc_mode;
+        int intent;
+};
+extern struct file *do_filp_open(int dfd, const char *pathname,
+                const struct open_flags *op, int lookup_flags);
+extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+                const char *, const struct open_flags *, int lookup_flags);
+extern long do_handle_open(int mountdirfd,
+                           struct file_handle __user *ufh, int open_flag);
 /*
 * inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb3847..dd4687ff30d 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
         * offset of the inode and the upper 16 bits of fh32[1] to
         * hold the offset of the parent.
         */
+        if (connectable && (len < 5)) {
-        if (len < 3 || (connectable && len < 5))
+                *max_len = 5;
+                return 255;
+        } else if (len < 3) {
+                *max_len = 3;
                return 255;
+        }
        len = 3;
        fh32[0] = ei->i_iget5_block;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c61600ece4f..eaaf2b511e8 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
        if (ip->i_nlink == JFS_LINK_MAX)
                return -EMLINK;
-        if (ip->i_nlink == 0)
-                return -ENOENT;
        dquot_initialize(dir);
        tid = txBegin(ip->i_sb, 0);
diff --git a/fs/namei.c b/fs/namei.c
index a4689eb2df2..b912b7abe74 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
        return retval;
 }
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
        char *tmp, *result;
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
                result = tmp;
                if (retval < 0) {
-                        __putname(tmp);
+                        if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-                        result = ERR_PTR(retval);
+                                __putname(tmp);
+                                result = ERR_PTR(retval);
+                        }
                }
        }
        audit_getname(result);
        return result;
 }
+char *getname(const char __user * filename)
+{
+        return getname_flags(filename, 0);
+}
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 {
        struct fs_struct *fs = current->fs;
        struct dentry *dentry = nd->path.dentry;
+        int want_root = 0;
        BUG_ON(!(nd->flags & LOOKUP_RCU));
-        if (nd->root.mnt) {
+        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+                want_root = 1;
                spin_lock(&fs->lock);
                if (nd->root.mnt != fs->root.mnt ||
                                nd->root.dentry != fs->root.dentry)
@@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
                goto err;
        BUG_ON(nd->inode != dentry->d_inode);
        spin_unlock(&dentry->d_lock);
-        if (nd->root.mnt) {
+        if (want_root) {
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }
@@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 err:
        spin_unlock(&dentry->d_lock);
 err_root:
-        if (nd->root.mnt)
+        if (want_root)
                spin_unlock(&fs->lock);
        return -ECHILD;
 }
@@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 {
        struct fs_struct *fs = current->fs;
        struct dentry *parent = nd->path.dentry;
+        int want_root = 0;
        BUG_ON(!(nd->flags & LOOKUP_RCU));
-        if (nd->root.mnt) {
+        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+                want_root = 1;
                spin_lock(&fs->lock);
                if (nd->root.mnt != fs->root.mnt ||
                                nd->root.dentry != fs->root.dentry)
@@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
        parent->d_count++;
        spin_unlock(&dentry->d_lock);
        spin_unlock(&parent->d_lock);
-        if (nd->root.mnt) {
+        if (want_root) {
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        }
@@ -490,7 +501,7 @@ err:
        spin_unlock(&dentry->d_lock);
        spin_unlock(&parent->d_lock);
 err_root:
-        if (nd->root.mnt)
+        if (want_root)
                spin_unlock(&fs->lock);
        return -ECHILD;
 }
@@ -498,8 +509,16 @@ err_root:
 /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
 static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
 {
-        if (nd->flags & LOOKUP_RCU)
+        if (nd->flags & LOOKUP_RCU) {
-                return nameidata_dentry_drop_rcu(nd, dentry);
+                if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
+                        nd->flags &= ~LOOKUP_RCU;
+                        if (!(nd->flags & LOOKUP_ROOT))
+                                nd->root.mnt = NULL;
+                        rcu_read_unlock();
+                        br_read_unlock(vfsmount_lock);
+                        return -ECHILD;
+                }
+        }
        return 0;
 }
@@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
        BUG_ON(!(nd->flags & LOOKUP_RCU));
        nd->flags &= ~LOOKUP_RCU;
-        nd->root.mnt = NULL;
+        if (!(nd->flags & LOOKUP_ROOT))
+                nd->root.mnt = NULL;
        spin_lock(&dentry->d_lock);
        if (!__d_rcu_to_refcount(dentry, nd->seq))
                goto err_unlock;
@@ -539,14 +559,6 @@ err_unlock:
        return -ECHILD;
 }
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
-{
-        if (likely(nd->flags & LOOKUP_RCU))
-                return nameidata_drop_rcu_last(nd);
-        return 0;
-}
 /**
 * release_open_intent - free up open intent resources
 * @nd: pointer to nameidata
@@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
        return dentry;
 }
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
-{
-        int status = d_revalidate(dentry, nd);
-        if (likely(status > 0))
-                return dentry;
-        if (status == -ECHILD) {
-                if (nameidata_dentry_drop_rcu(nd, dentry))
-                        return ERR_PTR(-ECHILD);
-                return do_revalidate(dentry, nd);
-        }
-        if (status < 0)
-                return ERR_PTR(status);
-        /* Don't d_invalidate in rcu-walk mode */
-        if (nameidata_dentry_drop_rcu(nd, dentry))
-                return ERR_PTR(-ECHILD);
-        if (!d_invalidate(dentry)) {
-                dput(dentry);
-                dentry = NULL;
-        }
-        return dentry;
-}
-static inline int need_reval_dot(struct dentry *dentry)
-{
-        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
-                return 0;
-        if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
-                return 0;
-        return 1;
-}
 /*
- * force_reval_path - force revalidation of a dentry
+ * handle_reval_path - force revalidation of a dentry
 *
 * In some situations the path walking code will trust dentries without
 * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry)
 * invalidate the dentry. It's up to the caller to handle putting references
 * to the path if necessary.
 */
-static int
+static inline int handle_reval_path(struct nameidata *nd)
-force_reval_path(struct path *path, struct nameidata *nd)
 {
+        struct dentry *dentry = nd->path.dentry;
        int status;
-        struct dentry *dentry = path->dentry;
-        /*
+        if (likely(!(nd->flags & LOOKUP_JUMPED)))
-         * only check on filesystems where it's possible for the dentry to
+                return 0;
-         * become stale.
-         */
+        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
-        if (!need_reval_dot(dentry))
+                return 0;
+        if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
                return 0;
+        /* Note: we do not d_invalidate() */
        status = d_revalidate(dentry, nd);
        if (status > 0)
                return 0;
-        if (!status) {
+        if (!status)
-                d_invalidate(dentry);
                status = -ESTALE;
-        }
        return status;
 }
@@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->root);
+                nd->flags |= LOOKUP_JUMPED;
        }
        nd->inode = nd->path.dentry->d_inode;
@@ -757,19 +737,42 @@ static inline void path_to_nameidata(const struct path *path,
        nd->path.dentry = path->dentry;
 }
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+        struct inode *inode = link->dentry->d_inode;
+        if (!IS_ERR(cookie) && inode->i_op->put_link)
+                inode->i_op->put_link(link->dentry, nd, cookie);
+        path_put(link);
+}
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
        int error;
        struct dentry *dentry = link->dentry;
        BUG_ON(nd->flags & LOOKUP_RCU);
+        if (link->mnt == nd->path.mnt)
+                mntget(link->mnt);
+        if (unlikely(current->total_link_count >= 40)) {
+                *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+                path_put(&nd->path);
+                return -ELOOP;
+        }
+        cond_resched();
+        current->total_link_count++;
        touch_atime(link->mnt, dentry);
        nd_set_link(nd, NULL);
-        if (link->mnt == nd->path.mnt)
+        error = security_inode_follow_link(link->dentry, nd);
-                mntget(link->mnt);
+        if (error) {
+                *p = ERR_PTR(error); /* no ->put_link(), please */
+                path_put(&nd->path);
+                return error;
+        }
        nd->last_type = LAST_BIND;
        *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -780,56 +783,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
                if (s)
                        error = __vfs_follow_link(nd, s);
                else if (nd->last_type == LAST_BIND) {
-                        error = force_reval_path(&nd->path, nd);
+                        nd->flags |= LOOKUP_JUMPED;
-                        if (error)
+                        nd->inode = nd->path.dentry->d_inode;
+                        if (nd->inode->i_op->follow_link) {
+                                /* stepped on a _really_ weird one */
                                path_put(&nd->path);
+                                error = -ELOOP;
+                        }
                }
        }
        return error;
 }
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
-{
-        void *cookie;
-        int err = -ELOOP;
-        /* We drop rcu-walk here */
-        if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-                return -ECHILD;
-        BUG_ON(inode != path->dentry->d_inode);
-        if (current->link_count >= MAX_NESTED_LINKS)
-                goto loop;
-        if (current->total_link_count >= 40)
-                goto loop;
-        BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-        cond_resched();
-        err = security_inode_follow_link(path->dentry, nd);
-        if (err)
-                goto loop;
-        current->link_count++;
-        current->total_link_count++;
-        nd->depth++;
-        err = __do_follow_link(path, nd, &cookie);
-        if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-                path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-        path_put(path);
-        current->link_count--;
-        nd->depth--;
-        return err;
-loop:
-        path_put_conditional(path, nd);
-        path_put(&nd->path);
-        return err;
-}
 static int follow_up_rcu(struct path *path)
 {
        struct vfsmount *parent;
@@ -1068,7 +1033,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
-                                return -ECHILD;
+                                goto failed;
                        inode = parent->d_inode;
                        nd->path.dentry = parent;
                        nd->seq = seq;
@@ -1081,8 +1046,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
        }
        __follow_mount_rcu(nd, &nd->path, &inode, true);
        nd->inode = inode;
        return 0;
+failed:
+        nd->flags &= ~LOOKUP_RCU;
+        if (!(nd->flags & LOOKUP_ROOT))
+                nd->root.mnt = NULL;
+        rcu_read_unlock();
+        br_read_unlock(vfsmount_lock);
+        return -ECHILD;
 }
 /*
@@ -1216,68 +1188,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
-        struct inode *dir;
+        int need_reval = 1;
+        int status = 1;
        int err;
        /*
-         * See if the low-level filesystem might want
-         * to use its own hash..
-         */
-        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-                err = parent->d_op->d_hash(parent, nd->inode, name);
-                if (err < 0)
-                        return err;
-        }
-        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, we're going to
         * do the non-racy lookup, below.
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                *inode = nd->inode;
                dentry = __d_lookup_rcu(parent, name, &seq, inode);
-                if (!dentry) {
+                if (!dentry)
-                        if (nameidata_drop_rcu(nd))
+                        goto unlazy;
-                                return -ECHILD;
-                        goto need_lookup;
-                }
                /* Memory barrier in read_seqcount_begin of child is enough */
                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                        return -ECHILD;
                nd->seq = seq;
                if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-                        dentry = do_revalidate_rcu(dentry, nd);
+                        status = d_revalidate(dentry, nd);
-                        if (!dentry)
+                        if (unlikely(status <= 0)) {
-                                goto need_lookup;
+                                if (status != -ECHILD)
-                        if (IS_ERR(dentry))
+                                        need_reval = 0;
-                                goto fail;
+                                goto unlazy;
-                        if (!(nd->flags & LOOKUP_RCU))
+                        }
-                                goto done;
                }
                path->mnt = mnt;
                path->dentry = dentry;
                if (likely(__follow_mount_rcu(nd, path, inode, false)))
                        return 0;
-                if (nameidata_drop_rcu(nd))
+unlazy:
-                        return -ECHILD;
+                if (dentry) {
-                /* fallthru */
+                        if (nameidata_dentry_drop_rcu(nd, dentry))
+                                return -ECHILD;
+                } else {
+                        if (nameidata_drop_rcu(nd))
+                                return -ECHILD;
+                }
+        } else {
+                dentry = __d_lookup(parent, name);
        }
-        dentry = __d_lookup(parent, name);
-        if (!dentry)
+retry:
-                goto need_lookup;
+        if (unlikely(!dentry)) {
-found:
+                struct inode *dir = parent->d_inode;
-        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+                BUG_ON(nd->inode != dir);
-                dentry = do_revalidate(dentry, nd);
-                if (!dentry)
+                mutex_lock(&dir->i_mutex);
-                        goto need_lookup;
+                dentry = d_lookup(parent, name);
-                if (IS_ERR(dentry))
+                if (likely(!dentry)) {
-                        goto fail;
+                        dentry = d_alloc_and_lookup(parent, name, nd);
+                        if (IS_ERR(dentry)) {
+                                mutex_unlock(&dir->i_mutex);
+                                return PTR_ERR(dentry);
+                        }
+                        /* known good */
+                        need_reval = 0;
+                        status = 1;
+                }
+                mutex_unlock(&dir->i_mutex);
        }
-done:
+        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+                status = d_revalidate(dentry, nd);
+        if (unlikely(status <= 0)) {
+                if (status < 0) {
+                        dput(dentry);
+                        return status;
+                }
+                if (!d_invalidate(dentry)) {
+                        dput(dentry);
+                        dentry = NULL;
+                        need_reval = 1;
+                        goto retry;
+                }
+        }
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
@@ -1287,39 +1276,113 @@ done:
        }
        *inode = path->dentry->d_inode;
        return 0;
+}
+static inline int may_lookup(struct nameidata *nd)
+{
+        if (nd->flags & LOOKUP_RCU) {
+                int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+                if (err != -ECHILD)
+                        return err;
+                if (nameidata_drop_rcu(nd))
+                        return -ECHILD;
+        }
+        return exec_permission(nd->inode, 0);
+}
-need_lookup:
+static inline int handle_dots(struct nameidata *nd, int type)
-        dir = parent->d_inode;
+{
-        BUG_ON(nd->inode != dir);
+        if (type == LAST_DOTDOT) {
+                if (nd->flags & LOOKUP_RCU) {
+                        if (follow_dotdot_rcu(nd))
+                                return -ECHILD;
+                } else
+                        follow_dotdot(nd);
+        }
+        return 0;
+}
-        mutex_lock(&dir->i_mutex);
+static void terminate_walk(struct nameidata *nd)
-        /*
+{
-         * First re-do the cached lookup just in case it was created
+        if (!(nd->flags & LOOKUP_RCU)) {
-         * while we waited for the directory semaphore, or the first
+                path_put(&nd->path);
-         * lookup failed due to an unrelated rename.
+        } else {
-         *
+                nd->flags &= ~LOOKUP_RCU;
-         * This could use version numbering or similar to avoid unnecessary
+                if (!(nd->flags & LOOKUP_ROOT))
-         * cache lookups, but then we'd have to do the first lookup in the
+                        nd->root.mnt = NULL;
-         * non-racy way. However in the common case here, everything should
+                rcu_read_unlock();
-         * be hot in cache, so would it be a big win?
+                br_read_unlock(vfsmount_lock);
-         */
-        dentry = d_lookup(parent, name);
-        if (likely(!dentry)) {
-                dentry = d_alloc_and_lookup(parent, name, nd);
-                mutex_unlock(&dir->i_mutex);
-                if (IS_ERR(dentry))
-                        goto fail;
-                goto done;
        }
+}
+static inline int walk_component(struct nameidata *nd, struct path *path,
+                struct qstr *name, int type, int follow)
+{
+        struct inode *inode;
+        int err;
        /*
-         * Uhhuh! Nasty case: the cache was re-populated while
+         * "." and ".." are special - ".." especially so because it has
-         * we waited on the semaphore. Need to revalidate.
+         * to be able to know about the current root directory and
+         * parent relationships.
         */
-        mutex_unlock(&dir->i_mutex);
+        if (unlikely(type != LAST_NORM))
-        goto found;
+                return handle_dots(nd, type);
+        err = do_lookup(nd, name, path, &inode);
+        if (unlikely(err)) {
+                terminate_walk(nd);
+                return err;
+        }
+        if (!inode) {
+                path_to_nameidata(path, nd);
+                terminate_walk(nd);
+                return -ENOENT;
+        }
+        if (unlikely(inode->i_op->follow_link) && follow) {
+                if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+                        return -ECHILD;
+                BUG_ON(inode != path->dentry->d_inode);
+                return 1;
+        }
+        path_to_nameidata(path, nd);
+        nd->inode = inode;
+        return 0;
+}
-fail:
+/*
-        return PTR_ERR(dentry);
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+        int res;
+        BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+        if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+                path_put_conditional(path, nd);
+                path_put(&nd->path);
+                return -ELOOP;
+        }
+        nd->depth++;
+        current->link_count++;
+        do {
+                struct path link = *path;
+                void *cookie;
+                res = follow_link(&link, nd, &cookie);
+                if (!res)
+                        res = walk_component(nd, path, &nd->last,
+                                             nd->last_type, LOOKUP_FOLLOW);
+                put_link(nd, &link, cookie);
+        } while (res > 0);
+        current->link_count--;
+        nd->depth--;
+        return res;
 }
 /*
@@ -1339,30 +1402,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
        while (*name=='/')
                name++;
        if (!*name)
-                goto return_reval;
+                return 0;
-        if (nd->depth)
-                lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
        /* At this point we know we have a real path component. */
        for(;;) {
-                struct inode *inode;
                unsigned long hash;
                struct qstr this;
                unsigned int c;
+                int type;
                nd->flags |= LOOKUP_CONTINUE;
-                if (nd->flags & LOOKUP_RCU) {
-                        err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+                err = may_lookup(nd);
-                        if (err == -ECHILD) {
-                                if (nameidata_drop_rcu(nd))
-                                        return -ECHILD;
-                                goto exec_again;
-                        }
-                } else {
-exec_again:
-                        err = exec_permission(nd->inode, 0);
-                }
                if (err)
                        break;
@@ -1378,52 +1429,43 @@ exec_again:
                this.len = name - (const char *) this.name;
                this.hash = end_name_hash(hash);
+                type = LAST_NORM;
+                if (this.name[0] == '.') switch (this.len) {
+                        case 2:
+                                if (this.name[1] == '.') {
+                                        type = LAST_DOTDOT;
+                                        nd->flags |= LOOKUP_JUMPED;
+                                }
+                                break;
+                        case 1:
+                                type = LAST_DOT;
+                }
+                if (likely(type == LAST_NORM)) {
+                        struct dentry *parent = nd->path.dentry;
+                        nd->flags &= ~LOOKUP_JUMPED;
+                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+                                err = parent->d_op->d_hash(parent, nd->inode,
+                                                           &this);
+                                if (err < 0)
+                                        break;
+                        }
+                }
                /* remove trailing slashes? */
                if (!c)
                        goto last_component;
                while (*++name == '/');
                if (!*name)
-                        goto last_with_slashes;
+                        goto last_component;
-                /*
+                err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
-                 * "." and ".." are special - ".." especially so because it has
+                if (err < 0)
-                 * to be able to know about the current root directory and
+                        return err;
-                 * parent relationships.
-                 */
-                if (this.name[0] == '.') switch (this.len) {
-                        default:
-                                break;
-                        case 2:
-                                if (this.name[1] != '.')
-                                        break;
-                                if (nd->flags & LOOKUP_RCU) {
-                                        if (follow_dotdot_rcu(nd))
-                                                return -ECHILD;
-                                } else
-                                        follow_dotdot(nd);
-                                /* fallthrough */
-                        case 1:
-                                continue;
-                }
-                /* This does the actual lookups.. */
-                err = do_lookup(nd, &this, &next, &inode);
-                if (err)
-                        break;
-                err = -ENOENT;
-                if (!inode)
-                        goto out_dput;
-                if (inode->i_op->follow_link) {
+                if (err) {
-                        err = do_follow_link(inode, &next, nd);
+                        err = nested_symlink(&next, nd);
                        if (err)
-                                goto return_err;
+                                return err;
-                        nd->inode = nd->path.dentry->d_inode;
-                        err = -ENOENT;
-                        if (!nd->inode)
-                                break;
-                } else {
-                        path_to_nameidata(&next, nd);
-                        nd->inode = inode;
                }
                err = -ENOTDIR; 
                if (!nd->inode->i_op->lookup)
@@ -1431,210 +1473,109 @@ exec_again:
                continue;
                /* here ends the main loop */
-last_with_slashes:
-                lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
                /* Clear LOOKUP_CONTINUE iff it was previously unset */
                nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-                if (lookup_flags & LOOKUP_PARENT)
-                        goto lookup_parent;
-                if (this.name[0] == '.') switch (this.len) {
-                        default:
-                                break;
-                        case 2:
-                                if (this.name[1] != '.')
-                                        break;
-                                if (nd->flags & LOOKUP_RCU) {
-                                        if (follow_dotdot_rcu(nd))
-                                                return -ECHILD;
-                                } else
-                                        follow_dotdot(nd);
-                                /* fallthrough */
-                        case 1:
-                                goto return_reval;
-                }
-                err = do_lookup(nd, &this, &next, &inode);
-                if (err)
-                        break;
-                if (inode && unlikely(inode->i_op->follow_link) &&
-                    (lookup_flags & LOOKUP_FOLLOW)) {
-                        err = do_follow_link(inode, &next, nd);
-                        if (err)
-                                goto return_err;
-                        nd->inode = nd->path.dentry->d_inode;
-                } else {
-                        path_to_nameidata(&next, nd);
-                        nd->inode = inode;
-                }
-                err = -ENOENT;
-                if (!nd->inode)
-                        break;
-                if (lookup_flags & LOOKUP_DIRECTORY) {
-                        err = -ENOTDIR; 
-                        if (!nd->inode->i_op->lookup)
-                                break;
-                }
-                goto return_base;
-lookup_parent:
                nd->last = this;
-                nd->last_type = LAST_NORM;
+                nd->last_type = type;
-                if (this.name[0] != '.')
-                        goto return_base;
-                if (this.len == 1)
-                        nd->last_type = LAST_DOT;
-                else if (this.len == 2 && this.name[1] == '.')
-                        nd->last_type = LAST_DOTDOT;
-                else
-                        goto return_base;
-return_reval:
-                /*
-                 * We bypassed the ordinary revalidation routines.
-                 * We may need to check the cached dentry for staleness.
-                 */
-                if (need_reval_dot(nd->path.dentry)) {
-                        if (nameidata_drop_rcu_last_maybe(nd))
-                                return -ECHILD;
-                        /* Note: we do not d_invalidate() */
-                        err = d_revalidate(nd->path.dentry, nd);
-                        if (!err)
-                                err = -ESTALE;
-                        if (err < 0)
-                                break;
-                        return 0;
-                }
-return_base:
-                if (nameidata_drop_rcu_last_maybe(nd))
-                        return -ECHILD;
                return 0;
-out_dput:
-                if (!(nd->flags & LOOKUP_RCU))
-                        path_put_conditional(&next, nd);
-                break;
        }
-        if (!(nd->flags & LOOKUP_RCU))
+        terminate_walk(nd);
-                path_put(&nd->path);
-return_err:
        return err;
 }
-static inline int path_walk_rcu(const char *name, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags,
-{
+                     struct nameidata *nd, struct file **fp)
-        current->total_link_count = 0;
-        return link_path_walk(name, nd);
-}
-static inline int path_walk_simple(const char *name, struct nameidata *nd)
-{
-        current->total_link_count = 0;
-        return link_path_walk(name, nd);
-}
-static int path_walk(const char *name, struct nameidata *nd)
-{
-        struct path save = nd->path;
-        int result;
-        current->total_link_count = 0;
-        /* make sure the stuff we saved doesn't go away */
-        path_get(&save);
-        result = link_path_walk(name, nd);
-        if (result == -ESTALE) {
-                /* nd->path had been dropped */
-                current->total_link_count = 0;
-                nd->path = save;
-                nd->inode = save.dentry->d_inode;
-                path_get(&nd->path);
-                nd->flags |= LOOKUP_REVAL;
-                result = link_path_walk(name, nd);
-        }
-        path_put(&save);
-        return result;
-}
-static void path_finish_rcu(struct nameidata *nd)
-{
-        if (nd->flags & LOOKUP_RCU) {
-                /* RCU dangling. Cancel it. */
-                nd->flags &= ~LOOKUP_RCU;
-                nd->root.mnt = NULL;
-                rcu_read_unlock();
-                br_read_unlock(vfsmount_lock);
-        }
-        if (nd->file)
-                fput(nd->file);
-}
-static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
        int retval = 0;
        int fput_needed;
        struct file *file;
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
-        nd->flags = flags | LOOKUP_RCU;
+        nd->flags = flags | LOOKUP_JUMPED;
        nd->depth = 0;
+        if (flags & LOOKUP_ROOT) {
+                struct inode *inode = nd->root.dentry->d_inode;
+                if (*name) {
+                        if (!inode->i_op->lookup)
+                                return -ENOTDIR;
+                        retval = inode_permission(inode, MAY_EXEC);
+                        if (retval)
+                                return retval;
+                }
+                nd->path = nd->root;
+                nd->inode = inode;
+                if (flags & LOOKUP_RCU) {
+                        br_read_lock(vfsmount_lock);
+                        rcu_read_lock();
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                } else {
+                        path_get(&nd->path);
+                }
+                return 0;
+        }
        nd->root.mnt = NULL;
-        nd->file = NULL;
        if (*name=='/') {
-                struct fs_struct *fs = current->fs;
+                if (flags & LOOKUP_RCU) {
-                unsigned seq;
+                        br_read_lock(vfsmount_lock);
+                        rcu_read_lock();
-                br_read_lock(vfsmount_lock);
+                        set_root_rcu(nd);
-                rcu_read_lock();
+                } else {
+                        set_root(nd);
-                do {
+                        path_get(&nd->root);
-                        seq = read_seqcount_begin(&fs->seq);
+                }
-                        nd->root = fs->root;
+                nd->path = nd->root;
-                        nd->path = nd->root;
-                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                } while (read_seqcount_retry(&fs->seq, seq));
        } else if (dfd == AT_FDCWD) {
-                struct fs_struct *fs = current->fs;
+                if (flags & LOOKUP_RCU) {
-                unsigned seq;
+                        struct fs_struct *fs = current->fs;
+                        unsigned seq;
-                br_read_lock(vfsmount_lock);
-                rcu_read_lock();
-                do {
+                        br_read_lock(vfsmount_lock);
-                        seq = read_seqcount_begin(&fs->seq);
+                        rcu_read_lock();
-                        nd->path = fs->pwd;
-                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                } while (read_seqcount_retry(&fs->seq, seq));
+                        do {
+                                seq = read_seqcount_begin(&fs->seq);
+                                nd->path = fs->pwd;
+                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                        } while (read_seqcount_retry(&fs->seq, seq));
+                } else {
+                        get_fs_pwd(current->fs, &nd->path);
+                }
        } else {
                struct dentry *dentry;
-                file = fget_light(dfd, &fput_needed);
+                file = fget_raw_light(dfd, &fput_needed);
                retval = -EBADF;
                if (!file)
                        goto out_fail;
                dentry = file->f_path.dentry;
-                retval = -ENOTDIR;
+                if (*name) {
-                if (!S_ISDIR(dentry->d_inode->i_mode))
+                        retval = -ENOTDIR;
-                        goto fput_fail;
+                        if (!S_ISDIR(dentry->d_inode->i_mode))
+                                goto fput_fail;
-                retval = file_permission(file, MAY_EXEC);
+                        retval = file_permission(file, MAY_EXEC);
-                if (retval)
+                        if (retval)
-                        goto fput_fail;
+                                goto fput_fail;
+                }
                nd->path = file->f_path;
-                if (fput_needed)
+                if (flags & LOOKUP_RCU) {
-                        nd->file = file;
+                        if (fput_needed)
+                                *fp = file;
-                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                br_read_lock(vfsmount_lock);
+                        br_read_lock(vfsmount_lock);
-                rcu_read_lock();
+                        rcu_read_lock();
+                } else {
+                        path_get(&file->f_path);
+                        fput_light(file, fput_needed);
+                }
        }
        nd->inode = nd->path.dentry->d_inode;
        return 0;
@@ -1644,60 +1585,23 @@ out_fail:
        return retval;
 }
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static inline int lookup_last(struct nameidata *nd, struct path *path)
 {
-        int retval = 0;
+        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
-        int fput_needed;
+                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-        struct file *file;
-        nd->last_type = LAST_ROOT; /* if there are only slashes... */
-        nd->flags = flags;
-        nd->depth = 0;
-        nd->root.mnt = NULL;
-        if (*name=='/') {
-                set_root(nd);
-                nd->path = nd->root;
-                path_get(&nd->root);
-        } else if (dfd == AT_FDCWD) {
-                get_fs_pwd(current->fs, &nd->path);
-        } else {
-                struct dentry *dentry;
-                file = fget_light(dfd, &fput_needed);
-                retval = -EBADF;
-                if (!file)
-                        goto out_fail;
-                dentry = file->f_path.dentry;
-                retval = -ENOTDIR;
-                if (!S_ISDIR(dentry->d_inode->i_mode))
-                        goto fput_fail;
-                retval = file_permission(file, MAY_EXEC);
-                if (retval)
-                        goto fput_fail;
-                nd->path = file->f_path;
+        nd->flags &= ~LOOKUP_PARENT;
-                path_get(&file->f_path);
+        return walk_component(nd, path, &nd->last, nd->last_type,
+                                        nd->flags & LOOKUP_FOLLOW);
-                fput_light(file, fput_needed);
-        }
-        nd->inode = nd->path.dentry->d_inode;
-        return 0;
-fput_fail:
-        fput_light(file, fput_needed);
-out_fail:
-        return retval;
 }
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
+static int path_lookupat(int dfd, const char *name,
                                unsigned int flags, struct nameidata *nd)
 {
-        int retval;
+        struct file *base = NULL;
+        struct path path;
+        int err;
        /*
         * Path walking is largely split up into 2 different synchronisation
@@ -1713,44 +1617,75 @@ static int do_path_lookup(int dfd, const char *name,
         * be handled by restarting a traditional ref-walk (which will always
         * be able to complete).
         */
-        retval = path_init_rcu(dfd, name, flags, nd);
+        err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
-        if (unlikely(retval))
-                return retval;
+        if (unlikely(err))
-        retval = path_walk_rcu(name, nd);
+                return err;
-        path_finish_rcu(nd);
-        if (nd->root.mnt) {
+        current->total_link_count = 0;
-                path_put(&nd->root);
+        err = link_path_walk(name, nd);
-                nd->root.mnt = NULL;
+        if (!err && !(flags & LOOKUP_PARENT)) {
+                err = lookup_last(nd, &path);
+                while (err > 0) {
+                        void *cookie;
+                        struct path link = path;
+                        nd->flags |= LOOKUP_PARENT;
+                        err = follow_link(&link, nd, &cookie);
+                        if (!err)
+                                err = lookup_last(nd, &path);
+                        put_link(nd, &link, cookie);
+                }
        }
-        if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
+        if (nd->flags & LOOKUP_RCU) {
-                /* slower, locked walk */
+                /* went all the way through without dropping RCU */
-                if (retval == -ESTALE)
+                BUG_ON(err);
-                        flags |= LOOKUP_REVAL;
+                if (nameidata_drop_rcu_last(nd))
-                retval = path_init(dfd, name, flags, nd);
+                        err = -ECHILD;
-                if (unlikely(retval))
+        }
-                        return retval;
-                retval = path_walk(name, nd);
+        if (!err)
-                if (nd->root.mnt) {
+                err = handle_reval_path(nd);
-                        path_put(&nd->root);
-                        nd->root.mnt = NULL;
+        if (!err && nd->flags & LOOKUP_DIRECTORY) {
+                if (!nd->inode->i_op->lookup) {
+                        path_put(&nd->path);
+                        err = -ENOTDIR; /* fall through: release base/root */
                }
        }
+        if (base)
+                fput(base);
+        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+                path_put(&nd->root);
+                nd->root.mnt = NULL;
+        }
+        return err;
+}
+static int do_path_lookup(int dfd, const char *name,
+                                unsigned int flags, struct nameidata *nd)
+{
+        int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+        if (unlikely(retval == -ECHILD))
+                retval = path_lookupat(dfd, name, flags, nd);
+        if (unlikely(retval == -ESTALE))
+                retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
        if (likely(!retval)) {
                if (unlikely(!audit_dummy_context())) {
                        if (nd->path.dentry && nd->inode)
                                audit_inode(name, nd->path.dentry);
                }
        }
        return retval;
 }
-int path_lookup(const char *name, unsigned int flags,
+int kern_path_parent(const char *name, struct nameidata *nd)
-                        struct nameidata *nd)
 {
-        return do_path_lookup(AT_FDCWD, name, flags, nd);
+        return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
 }
 int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1774,29 +1709,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct nameidata *nd)
 {
-        int retval;
+        nd->root.dentry = dentry;
+        nd->root.mnt = mnt;
-        /* same as do_path_lookup */
+        /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
-        nd->last_type = LAST_ROOT;
+        return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
-        nd->flags = flags;
-        nd->depth = 0;
-        nd->path.dentry = dentry;
-        nd->path.mnt = mnt;
-        path_get(&nd->path);
-        nd->root = nd->path;
-        path_get(&nd->root);
-        nd->inode = nd->path.dentry->d_inode;
-        retval = path_walk(name, nd);
-        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
-                                nd->inode))
-                audit_inode(name, nd->path.dentry);
-        path_put(&nd->root);
-        nd->root.mnt = NULL;
-        return retval;
 }
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -1811,17 +1727,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
                return ERR_PTR(err);
        /*
-         * See if the low-level filesystem might want
-         * to use its own hash..
-         */
-        if (base->d_flags & DCACHE_OP_HASH) {
-                err = base->d_op->d_hash(base, inode, name);
-                dentry = ERR_PTR(err);
-                if (err < 0)
-                        goto out;
-        }
-        /*
         * Don't bother with __d_lookup: callers are for creat as
         * well as unlink, so a lot of the time it would cost
         * a double lookup.
@@ -1833,7 +1738,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
        if (!dentry)
                dentry = d_alloc_and_lookup(base, name, nd);
-out:
        return dentry;
 }
@@ -1847,28 +1752,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
-static int __lookup_one_len(const char *name, struct qstr *this,
-                struct dentry *base, int len)
-{
-        unsigned long hash;
-        unsigned int c;
-        this->name = name;
-        this->len = len;
-        if (!len)
-                return -EACCES;
-        hash = init_name_hash();
-        while (len--) {
-                c = *(const unsigned char *)name++;
-                if (c == '/' || c == '\0')
-                        return -EACCES;
-                hash = partial_name_hash(c, hash);
-        }
-        this->hash = end_name_hash(hash);
-        return 0;
-}
 /**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:       pathname component to lookup
@@ -1882,14 +1765,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
 */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
-        int err;
        struct qstr this;
+        unsigned long hash;
+        unsigned int c;
        WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
-        err = __lookup_one_len(name, &this, base, len);
+        this.name = name;
-        if (err)
+        this.len = len;
-                return ERR_PTR(err);
+        if (!len)
+                return ERR_PTR(-EACCES);
+        hash = init_name_hash();
+        while (len--) {
+                c = *(const unsigned char *)name++;
+                if (c == '/' || c == '\0')
+                        return ERR_PTR(-EACCES);
+                hash = partial_name_hash(c, hash);
+        }
+        this.hash = end_name_hash(hash);
+        /*
+         * See if the low-level filesystem might want
+         * to use its own hash..
+         */
+        if (base->d_flags & DCACHE_OP_HASH) {
+                int err = base->d_op->d_hash(base, base->d_inode, &this);
+                if (err < 0)
+                        return ERR_PTR(err);
+        }
        return __lookup_hash(&this, base, NULL);
 }
@@ -1898,7 +1801,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
 {
        struct nameidata nd;
-        char *tmp = getname(name);
+        char *tmp = getname_flags(name, flags);
        int err = PTR_ERR(tmp);
        if (!IS_ERR(tmp)) {
@@ -2078,12 +1981,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        return error;
 }
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;
+        /* O_PATH? */
+        if (!acc_mode)
+                return 0;
        if (!inode)
                return -ENOENT;
@@ -2152,34 +2059,6 @@ static int handle_truncate(struct file *filp)
 }
 /*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-                                int open_flag, int mode)
-{
-        int error;
-        struct dentry *dir = nd->path.dentry;
-        if (!IS_POSIXACL(dir->d_inode))
-                mode &= ~current_umask();
-        error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-        if (error)
-                goto out_unlock;
-        error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-        mutex_unlock(&dir->d_inode->i_mutex);
-        dput(nd->path.dentry);
-        nd->path.dentry = path->dentry;
-        if (error)
-                return error;
-        /* Don't check for write permission, don't truncate */
-        return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
-}
-/*
 * Note that while the flag value (low two bits) for sys_open means:
 *      00 - read-only
 *      01 - write-only
@@ -2203,126 +2082,115 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
-static int open_will_truncate(int flag, struct inode *inode)
-{
-        /*
-         * We'll never write to the fs underlying
-         * a device file.
-         */
-        if (special_file(inode->i_mode))
-                return 0;
-        return (flag & O_TRUNC);
-}
-static struct file *finish_open(struct nameidata *nd,
-                                int open_flag, int acc_mode)
-{
-        struct file *filp;
-        int will_truncate;
-        int error;
-        will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
-        if (will_truncate) {
-                error = mnt_want_write(nd->path.mnt);
-                if (error)
-                        goto exit;
-        }
-        error = may_open(&nd->path, acc_mode, open_flag);
-        if (error) {
-                if (will_truncate)
-                        mnt_drop_write(nd->path.mnt);
-                goto exit;
-        }
-        filp = nameidata_to_filp(nd);
-        if (!IS_ERR(filp)) {
-                error = ima_file_check(filp, acc_mode);
-                if (error) {
-                        fput(filp);
-                        filp = ERR_PTR(error);
-                }
-        }
-        if (!IS_ERR(filp)) {
-                if (will_truncate) {
-                        error = handle_truncate(filp);
-                        if (error) {
-                                fput(filp);
-                                filp = ERR_PTR(error);
-                        }
-                }
-        }
-        /*
-         * It is now safe to drop the mnt write
-         * because the filp has had a write taken
-         * on its behalf.
-         */
-        if (will_truncate)
-                mnt_drop_write(nd->path.mnt);
-        path_put(&nd->path);
-        return filp;
-exit:
-        path_put(&nd->path);
-        return ERR_PTR(error);
-}
 /*
- * Handle O_CREAT case for do_filp_open
+ * Handle the last step of open()
 */
 static struct file *do_last(struct nameidata *nd, struct path *path,
-                            int open_flag, int acc_mode,
+                            const struct open_flags *op, const char *pathname)
-                            int mode, const char *pathname)
 {
        struct dentry *dir = nd->path.dentry;
+        struct dentry *dentry;
+        int open_flag = op->open_flag;
+        int will_truncate = open_flag & O_TRUNC;
+        int want_write = 0;
+        int acc_mode = op->acc_mode;
        struct file *filp;
-        int error = -EISDIR;
+        int error;
+        nd->flags &= ~LOOKUP_PARENT;
+        nd->flags |= op->intent;
        switch (nd->last_type) {
        case LAST_DOTDOT:
-                follow_dotdot(nd);
-                dir = nd->path.dentry;
        case LAST_DOT:
-                if (need_reval_dot(dir)) {
+                error = handle_dots(nd, nd->last_type);
-                        int status = d_revalidate(nd->path.dentry, nd);
+                if (error)
-                        if (!status)
+                        return ERR_PTR(error);
-                                status = -ESTALE;
-                        if (status < 0) {
-                                error = status;
-                                goto exit;
-                        }
-                }
                /* fallthrough */
        case LAST_ROOT:
-                goto exit;
+                if (nd->flags & LOOKUP_RCU) {
+                        if (nameidata_drop_rcu_last(nd))
+                                return ERR_PTR(-ECHILD);
+                }
+                error = handle_reval_path(nd);
+                if (error)
+                        goto exit;
+                audit_inode(pathname, nd->path.dentry);
+                if (open_flag & O_CREAT) {
+                        error = -EISDIR;
+                        goto exit;
+                }
+                goto ok;
        case LAST_BIND:
+                /* can't be RCU mode here */
+                error = handle_reval_path(nd);
+                if (error)
+                        goto exit;
                audit_inode(pathname, dir);
                goto ok;
        }
+        if (!(open_flag & O_CREAT)) {
+                int symlink_ok = 0;
+                if (nd->last.name[nd->last.len])
+                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+                if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+                        symlink_ok = 1;
+                /* we _can_ be in RCU mode here */
+                error = walk_component(nd, path, &nd->last, LAST_NORM,
+                                        !symlink_ok);
+                if (error < 0)
+                        return ERR_PTR(error);
+                if (error) /* symlink */
+                        return NULL;
+                /* sayonara */
+                if (nd->flags & LOOKUP_RCU) {
+                        if (nameidata_drop_rcu_last(nd))
+                                return ERR_PTR(-ECHILD);
+                }
+                error = -ENOTDIR;
+                if (nd->flags & LOOKUP_DIRECTORY) {
+                        if (!nd->inode->i_op->lookup)
+                                goto exit;
+                }
+                audit_inode(pathname, nd->path.dentry);
+                goto ok;
+        }
+        /* create side of things */
+        if (nd->flags & LOOKUP_RCU) {
+                if (nameidata_drop_rcu_last(nd))
+                        return ERR_PTR(-ECHILD);
+        }
+        audit_inode(pathname, dir);
+        error = -EISDIR;
        /* trailing slashes? */
        if (nd->last.name[nd->last.len])
                goto exit;
        mutex_lock(&dir->d_inode->i_mutex);
-        path->dentry = lookup_hash(nd);
+        dentry = lookup_hash(nd);
-        path->mnt = nd->path.mnt;
+        error = PTR_ERR(dentry);
+        if (IS_ERR(dentry)) {
-        error = PTR_ERR(path->dentry);
-        if (IS_ERR(path->dentry)) {
                mutex_unlock(&dir->d_inode->i_mutex);
                goto exit;
        }
-        if (IS_ERR(nd->intent.open.file)) {
+        path->dentry = dentry;
-                error = PTR_ERR(nd->intent.open.file);
+        path->mnt = nd->path.mnt;
-                goto exit_mutex_unlock;
-        }
        /* Negative dentry, just create the file */
-        if (!path->dentry->d_inode) {
+        if (!dentry->d_inode) {
+                int mode = op->mode;
+                if (!IS_POSIXACL(dir->d_inode))
+                        mode &= ~current_umask();
                /*
                 * This write is needed to ensure that a
-                 * ro->rw transition does not occur between
+                 * rw->ro transition does not occur between
                 * the time when the file is created and when
                 * a permanent write count is taken through
                 * the 'struct file' in nameidata_to_filp().
@@ -2330,22 +2198,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto exit_mutex_unlock;
-                error = __open_namei_create(nd, path, open_flag, mode);
+                want_write = 1;
-                if (error) {
+                /* Don't check for write permission, don't truncate */
-                        mnt_drop_write(nd->path.mnt);
+                open_flag &= ~O_TRUNC;
-                        goto exit;
+                will_truncate = 0;
-                }
+                acc_mode = MAY_OPEN;
-                filp = nameidata_to_filp(nd);
+                error = security_path_mknod(&nd->path, dentry, mode, 0);
-                mnt_drop_write(nd->path.mnt);
+                if (error)
-                path_put(&nd->path);
+                        goto exit_mutex_unlock;
-                if (!IS_ERR(filp)) {
+                error = vfs_create(dir->d_inode, dentry, mode, nd);
-                        error = ima_file_check(filp, acc_mode);
+                if (error)
-                        if (error) {
+                        goto exit_mutex_unlock;
-                                fput(filp);
+                mutex_unlock(&dir->d_inode->i_mutex);
-                                filp = ERR_PTR(error);
+                dput(nd->path.dentry);
-                        }
+                nd->path.dentry = dentry;
-                }
+                goto common;
-                return filp;
        }
        /*
@@ -2375,7 +2242,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (S_ISDIR(nd->inode->i_mode))
                goto exit;
 ok:
-        filp = finish_open(nd, open_flag, acc_mode);
+        if (!S_ISREG(nd->inode->i_mode))
+                will_truncate = 0;
+        if (will_truncate) {
+                error = mnt_want_write(nd->path.mnt);
+                if (error)
+                        goto exit;
+                want_write = 1;
+        }
+common:
+        error = may_open(&nd->path, acc_mode, open_flag);
+        if (error)
+                goto exit;
+        filp = nameidata_to_filp(nd);
+        if (!IS_ERR(filp)) {
+                error = ima_file_check(filp, op->acc_mode);
+                if (error) {
+                        fput(filp);
+                        filp = ERR_PTR(error);
+                }
+        }
+        if (!IS_ERR(filp)) {
+                if (will_truncate) {
+                        error = handle_truncate(filp);
+                        if (error) {
+                                fput(filp);
+                                filp = ERR_PTR(error);
+                        }
+                }
+        }
+out:
+        if (want_write)
+                mnt_drop_write(nd->path.mnt);
+        path_put(&nd->path);
        return filp;
 exit_mutex_unlock:
@@ -2383,204 +2283,103 @@ exit_mutex_unlock:
 exit_dput:
        path_put_conditional(path, nd);
 exit:
-        path_put(&nd->path);
+        filp = ERR_PTR(error);
-        return ERR_PTR(error);
+        goto out;
 }
-/*
+static struct file *path_openat(int dfd, const char *pathname,
- * Note that the low bits of the passed in "open_flag"
+                struct nameidata *nd, const struct open_flags *op, int flags)
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
-                int open_flag, int mode, int acc_mode)
 {
+        struct file *base = NULL;
        struct file *filp;
-        struct nameidata nd;
-        int error;
        struct path path;
-        int count = 0;
+        int error;
-        int flag = open_to_namei_flags(open_flag);
-        int flags;
-        if (!(open_flag & O_CREAT))
-                mode = 0;
-        /* Must never be set by userspace */
-        open_flag &= ~FMODE_NONOTIFY;
-        /*
-         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-         * check for O_DSYNC if the need any syncing at all we enforce it's
-         * always set instead of having to deal with possibly weird behaviour
-         * for malicious applications setting only __O_SYNC.
-         */
-        if (open_flag & __O_SYNC)
-                open_flag |= O_DSYNC;
-        if (!acc_mode)
-                acc_mode = MAY_OPEN | ACC_MODE(open_flag);
-        /* O_TRUNC implies we need access checks for write permissions */
-        if (open_flag & O_TRUNC)
-                acc_mode |= MAY_WRITE;
-        /* Allow the LSM permission hook to distinguish append 
-           access from general write access. */
-        if (open_flag & O_APPEND)
-                acc_mode |= MAY_APPEND;
-        flags = LOOKUP_OPEN;
-        if (open_flag & O_CREAT) {
-                flags |= LOOKUP_CREATE;
-                if (open_flag & O_EXCL)
-                        flags |= LOOKUP_EXCL;
-        }
-        if (open_flag & O_DIRECTORY)
-                flags |= LOOKUP_DIRECTORY;
-        if (!(open_flag & O_NOFOLLOW))
-                flags |= LOOKUP_FOLLOW;
        filp = get_empty_filp();
        if (!filp)
                return ERR_PTR(-ENFILE);
-        filp->f_flags = open_flag;
+        filp->f_flags = op->open_flag;
-        nd.intent.open.file = filp;
+        nd->intent.open.file = filp;
-        nd.intent.open.flags = flag;
+        nd->intent.open.flags = open_to_namei_flags(op->open_flag);
-        nd.intent.open.create_mode = mode;
+        nd->intent.open.create_mode = op->mode;
-        if (open_flag & O_CREAT)
-                goto creat;
-        /* !O_CREAT, simple open */
+        error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
-        error = do_path_lookup(dfd, pathname, flags, &nd);
        if (unlikely(error))
-                goto out_filp2;
-        error = -ELOOP;
-        if (!(nd.flags & LOOKUP_FOLLOW)) {
-                if (nd.inode->i_op->follow_link)
-                        goto out_path2;
-        }
-        error = -ENOTDIR;
-        if (nd.flags & LOOKUP_DIRECTORY) {
-                if (!nd.inode->i_op->lookup)
-                        goto out_path2;
-        }
-        audit_inode(pathname, nd.path.dentry);
-        filp = finish_open(&nd, open_flag, acc_mode);
-out2:
-        release_open_intent(&nd);
-        return filp;
-out_path2:
-        path_put(&nd.path);
-out_filp2:
-        filp = ERR_PTR(error);
-        goto out2;
-creat:
-        /* OK, have to create the file. Find the parent. */
-        error = path_init_rcu(dfd, pathname,
-                        LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-        if (error)
                goto out_filp;
-        error = path_walk_rcu(pathname, &nd);
-        path_finish_rcu(&nd);
-        if (unlikely(error == -ECHILD || error == -ESTALE)) {
-                /* slower, locked walk */
-                if (error == -ESTALE) {
-reval:
-                        flags |= LOOKUP_REVAL;
-                }
-                error = path_init(dfd, pathname,
-                                LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-                if (error)
-                        goto out_filp;
-                error = path_walk_simple(pathname, &nd);
+        current->total_link_count = 0;
-        }
+        error = link_path_walk(pathname, nd);
        if (unlikely(error))
                goto out_filp;
-        if (unlikely(!audit_dummy_context()))
-                audit_inode(pathname, nd.path.dentry);
-        /*
+        filp = do_last(nd, &path, op, pathname);
-         * We have the parent and last component.
-         */
-        nd.flags = flags;
-        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
        while (unlikely(!filp)) { /* trailing symlink */
                struct path link = path;
-                struct inode *linki = link.dentry->d_inode;
                void *cookie;
-                error = -ELOOP;
+                if (!(nd->flags & LOOKUP_FOLLOW)) {
-                if (!(nd.flags & LOOKUP_FOLLOW))
+                        path_put_conditional(&path, nd);
-                        goto exit_dput;
+                        path_put(&nd->path);
-                if (count++ == 32)
+                        filp = ERR_PTR(-ELOOP);
-                        goto exit_dput;
+                        break;
-                /*
-                 * This is subtle. Instead of calling do_follow_link() we do
-                 * the thing by hands. The reason is that this way we have zero
-                 * link_count and path_walk() (called from ->follow_link)
-                 * honoring LOOKUP_PARENT.  After that we have the parent and
-                 * last component, i.e. we are in the same situation as after
-                 * the first path_walk().  Well, almost - if the last component
-                 * is normal we get its copy stored in nd->last.name and we will
-                 * have to putname() it when we are done. Procfs-like symlinks
-                 * just set LAST_BIND.
-                 */
-                nd.flags |= LOOKUP_PARENT;
-                error = security_inode_follow_link(link.dentry, &nd);
-                if (error)
-                        goto exit_dput;
-                error = __do_follow_link(&link, &nd, &cookie);
-                if (unlikely(error)) {
-                        if (!IS_ERR(cookie) && linki->i_op->put_link)
-                                linki->i_op->put_link(link.dentry, &nd, cookie);
-                        /* nd.path had been dropped */
-                        nd.path = link;
-                        goto out_path;
                }
-                nd.flags &= ~LOOKUP_PARENT;
+                nd->flags |= LOOKUP_PARENT;
-                filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-                if (linki->i_op->put_link)
+                error = follow_link(&link, nd, &cookie);
-                        linki->i_op->put_link(link.dentry, &nd, cookie);
+                if (unlikely(error))
-                path_put(&link);
+                        filp = ERR_PTR(error);
+                else
+                        filp = do_last(nd, &path, op, pathname);
+                put_link(nd, &link, cookie);
        }
 out:
-        if (nd.root.mnt)
+        if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
-                path_put(&nd.root);
+                path_put(&nd->root);
-        if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
+        if (base)
-                goto reval;
+                fput(base);
-        release_open_intent(&nd);
+        release_open_intent(nd);
        return filp;
-exit_dput:
-        path_put_conditional(&path, &nd);
-out_path:
-        path_put(&nd.path);
 out_filp:
        filp = ERR_PTR(error);
        goto out;
 }
-/**
+struct file *do_filp_open(int dfd, const char *pathname,
- * filp_open - open file and return file pointer
+                const struct open_flags *op, int flags)
- *
- * @filename:   path to open
- * @flags:      open flags as per the open(2) second argument
- * @mode:       mode for the new file if O_CREAT is set, else ignored
- *
- * This is the helper to open a file from kernelspace if you really
- * have to.  But in generally you should not do this, so please move
- * along, nothing to see here..
- */
-struct file *filp_open(const char *filename, int flags, int mode)
 {
-        return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
+        struct nameidata nd;
+        struct file *filp;
+        filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+        if (unlikely(filp == ERR_PTR(-ECHILD)))
+                filp = path_openat(dfd, pathname, &nd, op, flags);
+        if (unlikely(filp == ERR_PTR(-ESTALE)))
+                filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+        return filp;
+}
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+                const char *name, const struct open_flags *op, int flags)
+{
+        struct nameidata nd;
+        struct file *file;
+        nd.root.mnt = mnt;
+        nd.root.dentry = dentry;
+        flags |= LOOKUP_ROOT;
+        if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+                return ERR_PTR(-ELOOP);
+        file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+        if (unlikely(file == ERR_PTR(-ECHILD)))
+                file = path_openat(-1, name, &nd, op, flags);
+        if (unlikely(file == ERR_PTR(-ESTALE)))
+                file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+        return file;
 }
-EXPORT_SYMBOL(filp_open);
 /**
 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -3119,7 +2918,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
                return error;
        mutex_lock(&inode->i_mutex);
-        error = dir->i_op->link(old_dentry, dir, new_dentry);
+        /* Make sure we don't allow creating hardlink to an unlinked file */
+        if (inode->i_nlink == 0)
+                error =  -ENOENT;
+        else
+                error = dir->i_op->link(old_dentry, dir, new_dentry);
        mutex_unlock(&inode->i_mutex);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
@@ -3141,15 +2944,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        struct dentry *new_dentry;
        struct nameidata nd;
        struct path old_path;
+        int how = 0;
        int error;
        char *to;
-        if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
+        /*
+         * To use null names we require CAP_DAC_READ_SEARCH.
+         * This ensures that not everyone will be able to create
+         * a hardlink using the passed file descriptor.
+         */
+        if (flags & AT_EMPTY_PATH) {
+                if (!capable(CAP_DAC_READ_SEARCH))
+                        return -ENOENT;
+                how = LOOKUP_EMPTY;
+        }
+        if (flags & AT_SYMLINK_FOLLOW)
+                how |= LOOKUP_FOLLOW;
-        error = user_path_at(olddfd, oldname,
+        error = user_path_at(olddfd, oldname, how, &old_path);
-                             flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-                             &old_path);
        if (error)
                return error;
@@ -3586,7 +3401,7 @@ EXPORT_SYMBOL(page_readlink);
 EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index a66feed7311..e96e03782de 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = {
        .show   = show_vfsmnt
 };
+static int uuid_is_nil(u8 *uuid)
+{
+        int i;
+        u8  *cp = (u8 *)uuid;
+        for (i = 0; i < 16; i++) {
+                if (*cp++)
+                        return 0;
+        }
+        return 1;
+}
 static int show_mountinfo(struct seq_file *m, void *v)
 {
        struct proc_mounts *p = m->private;
@@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
        if (IS_MNT_UNBINDABLE(mnt))
                seq_puts(m, " unbindable");
+        if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+                /* print the uuid */
+                seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
        /* Filesystem specific data */
        seq_puts(m, " - ");
        show_type(m, sb);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2f8e61816d7..01768e5e2c9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1518,7 +1518,7 @@ static int nfsiod_start(void)
 {
        struct workqueue_struct *wq;
        dprintk("RPC:       creating workqueue nfsiod\n");
-        wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
+        wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
        if (wq == NULL)
                return -ENOMEM;
        nfsiod_workqueue = wq;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242dd..124e8fcb0dd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
 static struct file *do_open(char *name, int flags)
 {
-        struct nameidata nd;
        struct vfsmount *mnt;
-        int error;
+        struct file *file;
        mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
        if (IS_ERR(mnt))
                return (struct file *)mnt;
-        error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd);
+        file = file_open_root(mnt->mnt_root, mnt, name, flags);
-        mntput(mnt);    /* drop do_kern_mount reference */
-        if (error)
-                return ERR_PTR(error);
-        if (flags == O_RDWR)
-                error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
-        else
-                error = may_open(&nd.path, MAY_WRITE, flags);
-        if (!error)
+        mntput(mnt);    /* drop do_kern_mount reference */
-                return dentry_open(nd.path.dentry, nd.path.mnt, flags,
+        return file;
-                                   current_cred());
-        path_put(&nd.path);
-        return ERR_PTR(error);
 }
 static struct {
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4f..254652a9b54 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
                   dentry->d_name.len, dentry->d_name.name,
                   fh, len, connectable);
-        if (len < 3 || (connectable && len < 6)) {
+        if (connectable && (len < 6)) {
-                mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+                *max_len = 6;
+                type = 255;
+                goto bail;
+        } else if (len < 3) {
+                *max_len = 3;
                type = 255;
                goto bail;
        }
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95..d5ab56cbe5c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
 extern const struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
-int ocfs2_quota_setup(void);
-void ocfs2_quota_shutdown(void);
 #endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24..a73f6416648 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -63,8 +63,6 @@
 *        write to gf
 */
-static struct workqueue_struct *ocfs2_quota_wq = NULL;
 static void qsync_work_fn(struct work_struct *work);
 static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
                                                OCFS2_QBLK_RESERVED_SPACE;
        oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
        INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
-        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+        schedule_delayed_work(&oinfo->dqi_sync_work,
-                           msecs_to_jiffies(oinfo->dqi_syncms));
+                              msecs_to_jiffies(oinfo->dqi_syncms));
 out_err:
        mlog_exit(status);
@@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work)
        struct super_block *sb = oinfo->dqi_gqinode->i_sb;
        dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
-        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+        schedule_delayed_work(&oinfo->dqi_sync_work,
-                           msecs_to_jiffies(oinfo->dqi_syncms));
+                              msecs_to_jiffies(oinfo->dqi_syncms));
 }
 /*
@@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = {
        .alloc_dquot    = ocfs2_alloc_dquot,
        .destroy_dquot  = ocfs2_destroy_dquot,
 };
-int ocfs2_quota_setup(void)
-{
-        ocfs2_quota_wq = create_workqueue("o2quot");
-        if (!ocfs2_quota_wq)
-                return -ENOMEM;
-        return 0;
-}
-void ocfs2_quota_shutdown(void)
-{
-        if (ocfs2_quota_wq) {
-                flush_workqueue(ocfs2_quota_wq);
-                destroy_workqueue(ocfs2_quota_wq);
-                ocfs2_quota_wq = NULL;
-        }
-}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 35798b88042..c384d634872 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4380,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path,
        if (IS_ERR(s))
                return PTR_ERR(s);
-        error = path_lookup(s, LOOKUP_PARENT, nd);
+        error = kern_path_parent(s, nd);
        if (error)
                putname(s);
        else
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 36c423fb063..236ed1bdca2 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1657,16 +1657,11 @@ static int __init ocfs2_init(void)
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
        }
-        status = ocfs2_quota_setup();
-        if (status)
-                goto leave;
        ocfs2_set_locking_protocol();
        status = register_quota_format(&ocfs2_quota_format);
 leave:
        if (status < 0) {
-                ocfs2_quota_shutdown();
                ocfs2_free_mem_caches();
                exit_ocfs2_uptodate_cache();
        }
@@ -1683,8 +1678,6 @@ static void __exit ocfs2_exit(void)
 {
        mlog_entry_void();
-        ocfs2_quota_shutdown();
        if (ocfs2_wq) {
                flush_workqueue(ocfs2_wq);
                destroy_workqueue(ocfs2_wq);
diff --git a/fs/open.c b/fs/open.c
index 80d430ae3b2..f83ca80cc59 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 {
        struct path path;
        int error = -EINVAL;
-        int follow;
+        int lookup_flags;
-        if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
                goto out;
-        follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+        lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
-        error = user_path_at(dfd, filename, follow, &path);
+        if (flag & AT_EMPTY_PATH)
+                lookup_flags |= LOOKUP_EMPTY;
+        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                goto out;
        error = mnt_want_write(path.mnt);
@@ -669,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                                        int (*open)(struct inode *, struct file *),
                                        const struct cred *cred)
 {
+        static const struct file_operations empty_fops = {};
        struct inode *inode;
        int error;
        f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
                                FMODE_PREAD | FMODE_PWRITE;
+        if (unlikely(f->f_flags & O_PATH))
+                f->f_mode = FMODE_PATH;
        inode = dentry->d_inode;
        if (f->f_mode & FMODE_WRITE) {
                error = __get_file_write_access(inode, mnt);
@@ -687,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
        f->f_path.dentry = dentry;
        f->f_path.mnt = mnt;
        f->f_pos = 0;
-        f->f_op = fops_get(inode->i_fop);
        file_sb_list_add(f, inode->i_sb);
+        if (unlikely(f->f_mode & FMODE_PATH)) {
+                f->f_op = &empty_fops;
+                return f;
+        }
+        f->f_op = fops_get(inode->i_fop);
        error = security_dentry_open(f, cred);
        if (error)
                goto cleanup_all;
@@ -891,15 +904,110 @@ void fd_install(unsigned int fd, struct file *file)
 EXPORT_SYMBOL(fd_install);
+static inline int build_open_flags(int flags, int mode, struct open_flags *op)
+{
+        int lookup_flags = 0;
+        int acc_mode;
+        if (!(flags & O_CREAT))
+                mode = 0;
+        op->mode = mode;
+        /* Must never be set by userspace */
+        flags &= ~FMODE_NONOTIFY;
+        /*
+         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+         * check for O_DSYNC if they need any syncing at all, we enforce it's
+         * always set instead of having to deal with possibly weird behaviour
+         * for malicious applications setting only __O_SYNC.
+         */
+        if (flags & __O_SYNC)
+                flags |= O_DSYNC;
+        /*
+         * If we have O_PATH in the open flags, then we
+         * cannot have anything other than the below set of flags.
+         */
+        if (flags & O_PATH) {
+                flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+                acc_mode = 0;
+        } else {
+                acc_mode = MAY_OPEN | ACC_MODE(flags);
+        }
+        op->open_flag = flags;
+        /* O_TRUNC implies we need access checks for write permissions */
+        if (flags & O_TRUNC)
+                acc_mode |= MAY_WRITE;
+        /* Allow the LSM permission hook to distinguish append
+           access from general write access. */
+        if (flags & O_APPEND)
+                acc_mode |= MAY_APPEND;
+        op->acc_mode = acc_mode;
+        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+        if (flags & O_CREAT) {
+                op->intent |= LOOKUP_CREATE;
+                if (flags & O_EXCL)
+                        op->intent |= LOOKUP_EXCL;
+        }
+        if (flags & O_DIRECTORY)
+                lookup_flags |= LOOKUP_DIRECTORY;
+        if (!(flags & O_NOFOLLOW))
+                lookup_flags |= LOOKUP_FOLLOW;
+        return lookup_flags;
+}
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:   path to open
+ * @flags:      open flags as per the open(2) second argument
+ * @mode:       mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in general you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+        struct open_flags op;
+        int lookup = build_open_flags(flags, mode, &op);
+        return do_filp_open(AT_FDCWD, filename, &op, lookup);
+}
+EXPORT_SYMBOL(filp_open);
+struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+                            const char *filename, int flags)
+{
+        struct open_flags op;
+        int lookup = build_open_flags(flags, 0, &op);
+        if (flags & O_CREAT)
+                return ERR_PTR(-EINVAL);
+        if (!filename && (flags & O_DIRECTORY))
+                if (!dentry->d_inode->i_op->lookup)
+                        return ERR_PTR(-ENOTDIR);
+        return do_file_open_root(dentry, mnt, filename, &op, lookup);
+}
+EXPORT_SYMBOL(file_open_root);
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
+        struct open_flags op;
+        int lookup = build_open_flags(flags, mode, &op);
        char *tmp = getname(filename);
        int fd = PTR_ERR(tmp);
        if (!IS_ERR(tmp)) {
                fd = get_unused_fd_flags(flags);
                if (fd >= 0) {
-                        struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
+                        struct file *f = do_filp_open(dfd, tmp, &op, lookup);
                        if (IS_ERR(f)) {
                                put_unused_fd(fd);
                                fd = PTR_ERR(f);
@@ -969,8 +1077,10 @@ int filp_close(struct file *filp, fl_owner_t id)
        if (filp->f_op && filp->f_op->flush)
                retval = filp->f_op->flush(filp, id);
-        dnotify_flush(filp, id);
+        if (likely(!(filp->f_mode & FMODE_PATH))) {
-        locks_remove_posix(filp, id);
+                dnotify_flush(filp, id);
+                locks_remove_posix(filp, id);
+        }
        fput(filp);
        return retval;
 }
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index be03a0b08b4..764b86a0196 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
 #include "check.h"
 #include "osf.h"
-#define MAX_OSF_PARTITIONS 8
+#define MAX_OSF_PARTITIONS 18
 int osf_partition(struct parsed_partitions *state)
 {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e..1bba24bad82 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
        struct inode *inode = dentry->d_inode;
        int maxlen = *lenp;
-        if (maxlen < 3)
+        if (need_parent && (maxlen < 5)) {
+                *lenp = 5;
                return 255;
+        } else if (maxlen < 3) {
+                *lenp = 3;
+                return 255;
+        }
        data[0] = inode->i_ino;
        data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e699..c77514bd577 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        reiserfs_mounted_fs_count++;
        if (reiserfs_mounted_fs_count <= 1) {
                reiserfs_write_unlock(sb);
-                commit_wq = create_workqueue("reiserfs");
+                commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
                reiserfs_write_lock(sb);
        }
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 82f45542dcf..118662690cd 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1123,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
                reiserfs_write_unlock(dir->i_sb);
                return -EMLINK;
        }
-        if (inode->i_nlink == 0) {
-                reiserfs_write_unlock(dir->i_sb);
-                return -ENOENT;
-        }
        /* inc before scheduling so reiserfs_unlink knows we are here */
        inc_nlink(inode);
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b70..961039121cb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
        int error = -EINVAL;
        int lookup_flags = 0;
-        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
+        if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+                      AT_EMPTY_PATH)) != 0)
                goto out;
        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flag & AT_NO_AUTOMOUNT)
                lookup_flags |= LOOKUP_NO_AUTOMOUNT;
+        if (flag & AT_EMPTY_PATH)
+                lookup_flags |= LOOKUP_EMPTY;
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
        if (bufsiz <= 0)
                return -EINVAL;
-        error = user_path_at(dfd, pathname, 0, &path);
+        error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
        if (!error) {
                struct inode *inode = path.dentry->d_inode;
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996..8244924dec5 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
 }
 EXPORT_SYMBOL(vfs_statfs);
-static int do_statfs_native(struct path *path, struct statfs *buf)
+int user_statfs(const char __user *pathname, struct kstatfs *st)
 {
-        struct kstatfs st;
+        struct path path;
-        int retval;
+        int error = user_path(pathname, &path);
+        if (!error) {
+                error = vfs_statfs(&path, st);
+                path_put(&path);
+        }
+        return error;
+}
-        retval = vfs_statfs(path, &st);
+int fd_statfs(int fd, struct kstatfs *st)
-        if (retval)
+{
-                return retval;
+        struct file *file = fget(fd);
+        int error = -EBADF;
+        if (file) {
+                error = vfs_statfs(&file->f_path, st);
+                fput(file);
+        }
+        return error;
+}
-        if (sizeof(*buf) == sizeof(st))
+static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
-                memcpy(buf, &st, sizeof(st));
+{
+        struct statfs buf;
+        if (sizeof(buf) == sizeof(*st))
+                memcpy(&buf, st, sizeof(*st));
        else {
-                if (sizeof buf->f_blocks == 4) {
+                if (sizeof buf.f_blocks == 4) {
-                        if ((st.f_blocks | st.f_bfree | st.f_bavail |
+                        if ((st->f_blocks | st->f_bfree | st->f_bavail |
-                             st.f_bsize | st.f_frsize) &
+                             st->f_bsize | st->f_frsize) &
                            0xffffffff00000000ULL)
                                return -EOVERFLOW;
                        /*
                         * f_files and f_ffree may be -1; it's okay to stuff
                         * that into 32 bits
                         */
-                        if (st.f_files != -1 &&
+                        if (st->f_files != -1 &&
-                            (st.f_files & 0xffffffff00000000ULL))
+                            (st->f_files & 0xffffffff00000000ULL))
                                return -EOVERFLOW;
-                        if (st.f_ffree != -1 &&
+                        if (st->f_ffree != -1 &&
-                            (st.f_ffree & 0xffffffff00000000ULL))
+                            (st->f_ffree & 0xffffffff00000000ULL))
                                return -EOVERFLOW;
                }
-                buf->f_type = st.f_type;
+                buf.f_type = st->f_type;
-                buf->f_bsize = st.f_bsize;
+                buf.f_bsize = st->f_bsize;
-                buf->f_blocks = st.f_blocks;
+                buf.f_blocks = st->f_blocks;
-                buf->f_bfree = st.f_bfree;
+                buf.f_bfree = st->f_bfree;
-                buf->f_bavail = st.f_bavail;
+                buf.f_bavail = st->f_bavail;
-                buf->f_files = st.f_files;
+                buf.f_files = st->f_files;
-                buf->f_ffree = st.f_ffree;
+                buf.f_ffree = st->f_ffree;
-                buf->f_fsid = st.f_fsid;
+                buf.f_fsid = st->f_fsid;
-                buf->f_namelen = st.f_namelen;
+                buf.f_namelen = st->f_namelen;
-                buf->f_frsize = st.f_frsize;
+                buf.f_frsize = st->f_frsize;
-                buf->f_flags = st.f_flags;
+                buf.f_flags = st->f_flags;
-                memset(buf->f_spare, 0, sizeof(buf->f_spare));
+                memset(buf.f_spare, 0, sizeof(buf.f_spare));
        }
+        if (copy_to_user(p, &buf, sizeof(buf)))
+                return -EFAULT;
        return 0;
 }
-static int do_statfs64(struct path *path, struct statfs64 *buf)
+static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
 {
-        struct kstatfs st;
+        struct statfs64 buf;
-        int retval;
+        if (sizeof(buf) == sizeof(*st))
+                memcpy(&buf, st, sizeof(*st));
-        retval = vfs_statfs(path, &st);
-        if (retval)
-                return retval;
-        if (sizeof(*buf) == sizeof(st))
-                memcpy(buf, &st, sizeof(st));
        else {
-                buf->f_type = st.f_type;
+                buf.f_type = st->f_type;
-                buf->f_bsize = st.f_bsize;
+                buf.f_bsize = st->f_bsize;
-                buf->f_blocks = st.f_blocks;
+                buf.f_blocks = st->f_blocks;
-                buf->f_bfree = st.f_bfree;
+                buf.f_bfree = st->f_bfree;
-                buf->f_bavail = st.f_bavail;
+                buf.f_bavail = st->f_bavail;
-                buf->f_files = st.f_files;
+                buf.f_files = st->f_files;
-                buf->f_ffree = st.f_ffree;
+                buf.f_ffree = st->f_ffree;
-                buf->f_fsid = st.f_fsid;
+                buf.f_fsid = st->f_fsid;
-                buf->f_namelen = st.f_namelen;
+                buf.f_namelen = st->f_namelen;
-                buf->f_frsize = st.f_frsize;
+                buf.f_frsize = st->f_frsize;
-                buf->f_flags = st.f_flags;
+                buf.f_flags = st->f_flags;
-                memset(buf->f_spare, 0, sizeof(buf->f_spare));
+                memset(buf.f_spare, 0, sizeof(buf.f_spare));
        }
+        if (copy_to_user(p, &buf, sizeof(buf)))
+                return -EFAULT;
        return 0;
 }
 SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
-        struct path path;
+        struct kstatfs st;
-        int error;
+        int error = user_statfs(pathname, &st);
+        if (!error)
-        error = user_path(pathname, &path);
+                error = do_statfs_native(&st, buf);
-        if (!error) {
-                struct statfs tmp;
-                error = do_statfs_native(&path, &tmp);
-                if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-                        error = -EFAULT;
-                path_put(&path);
-        }
        return error;
 }
 SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
-        struct path path;
+        struct kstatfs st;
-        long error;
+        int error;
        if (sz != sizeof(*buf))
                return -EINVAL;
-        error = user_path(pathname, &path);
+        error = user_statfs(pathname, &st);
-        if (!error) {
+        if (!error)
-                struct statfs64 tmp;
+                error = do_statfs64(&st, buf);
-                error = do_statfs64(&path, &tmp);
-                if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-                        error = -EFAULT;
-                path_put(&path);
-        }
        return error;
 }
 SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
-        struct file *file;
+        struct kstatfs st;
-        struct statfs tmp;
+        int error = fd_statfs(fd, &st);
-        int error;
+        if (!error)
+                error = do_statfs_native(&st, buf);
-        error = -EBADF;
-        file = fget(fd);
-        if (!file)
-                goto out;
-        error = do_statfs_native(&file->f_path, &tmp);
-        if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-                error = -EFAULT;
-        fput(file);
-out:
        return error;
 }
 SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
-        struct file *file;
+        struct kstatfs st;
-        struct statfs64 tmp;
        int error;
        if (sz != sizeof(*buf))
                return -EINVAL;
-        error = -EBADF;
+        error = fd_statfs(fd, &st);
-        file = fget(fd);
+        if (!error)
-        if (!file)
+                error = do_statfs64(&st, buf);
-                goto out;
-        error = do_statfs64(&file->f_path, &tmp);
-        if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-                error = -EFAULT;
-        fput(file);
-out:
        return error;
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7..7217d67a80a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        ubifs_assert(mutex_is_locked(&dir->i_mutex));
        ubifs_assert(mutex_is_locked(&inode->i_mutex));
-        /*
-         * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-         * otherwise has the potential to corrupt the orphan inode list.
-         *
-         * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
-         * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
-         * lock 'dirA->i_mutex', so this is possible. Both of the functions
-         * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
-         * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
-         * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
-         * to the list of orphans. After this, 'vfs_link()' will link
-         * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
-         * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
-         * to the list of orphans.
-         */
-         if (inode->i_nlink == 0)
-                 return -ENOENT;
        err = dbg_check_synced_i_size(inode);
        if (err)
                return err;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b7c338d5e9d..f1dce848ef9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
        struct fid *fid = (struct fid *)fh;
        int type = FILEID_UDF_WITHOUT_PARENT;
-        if (len < 3 || (connectable && len < 5))
+        if (connectable && (len < 5)) {
+                *lenp = 5;
+                return 255;
+        } else if (len < 3) {
+                *lenp = 3;
                return 255;
+        }
        *lenp = 3;
        fid->udf.block = location.logicalBlockNum;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378d..f83a4c830a6 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -2022,11 +2022,12 @@ xfs_buf_init(void)
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
-        xfsdatad_workqueue = create_workqueue("xfsdatad");
+        xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
        if (!xfsdatad_workqueue)
                goto out_destroy_xfslogd_workqueue;
-        xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+        xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
+                                                WQ_MEM_RECLAIM, 1);
        if (!xfsconvertd_workqueue)
                goto out_destroy_xfsdatad_workqueue;
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fd..f4f878fc008 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
         * seven combinations work.  The real answer is "don't use v2".
         */
        len = xfs_fileid_length(fileid_type);
-        if (*max_len < len)
+        if (*max_len < len) {
+                *max_len = len;
                return 255;
+        }
        *max_len = len;
        switch (fileid_type) {
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb..4aff5639573 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
        if (!xfs_mru_elem_zone)
                goto out;
-        xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
+        xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
        if (!xfs_mru_reap_wq)
                goto out_destroy_mru_elem_zone;