aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c28
-rw-r--r--fs/9p/cache.c204
-rw-r--r--fs/9p/cache.h64
-rw-r--r--fs/9p/fid.c114
-rw-r--r--fs/9p/fid.h5
-rw-r--r--fs/9p/v9fs.c108
-rw-r--r--fs/9p/v9fs.h53
-rw-r--r--fs/9p/v9fs_vfs.h26
-rw-r--r--fs/9p/vfs_addr.c194
-rw-r--r--fs/9p/vfs_dentry.c47
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c316
-rw-r--r--fs/9p/vfs_inode.c307
-rw-r--r--fs/9p/vfs_inode_dotl.c198
-rw-r--r--fs/9p/vfs_super.c65
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/inode.c6
-rw-r--r--fs/adfs/super.c13
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c56
-rw-r--r--fs/block_dev.c19
-rw-r--r--fs/btrfs/ctree.h12
-rw-r--r--fs/btrfs/export.c8
-rw-r--r--fs/btrfs/extent-tree.c44
-rw-r--r--fs/btrfs/extent_io.c165
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c114
-rw-r--r--fs/btrfs/inode.c148
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/lzo.c21
-rw-r--r--fs/btrfs/relocation.c13
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/btrfs/volumes.c13
-rw-r--r--fs/btrfs/xattr.c6
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/cachefiles/namei.c52
-rw-r--r--fs/ceph/dir.c30
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/compat.c69
-rw-r--r--fs/dcache.c121
-rw-r--r--fs/debugfs/inode.c26
-rw-r--r--fs/eventpoll.c95
-rw-r--r--fs/exec.c18
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/namei.c17
-rw-r--r--fs/ext2/xattr.h6
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext3/balloc.c21
-rw-r--r--fs/ext3/ialloc.c5
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/ext3/xattr.h4
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/super.c9
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c37
-rw-r--r--fs/fhandle.c265
-rw-r--r--fs/file_table.c64
-rw-r--r--fs/fuse/dir.c9
-rw-r--r--fs/fuse/file.c52
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c1
-rw-r--r--fs/gfs2/bmap.c20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c77
-rw-r--r--fs/gfs2/glock.c410
-rw-r--r--fs/gfs2/glock.h39
-rw-r--r--fs/gfs2/glops.c33
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c32
-rw-r--r--fs/gfs2/lops.c10
-rw-r--r--fs/gfs2/main.c15
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/ops_fstype.c11
-rw-r--r--fs/gfs2/ops_inode.c10
-rw-r--r--fs/gfs2/quota.c14
-rw-r--r--fs/gfs2/rgrp.c34
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/hfs/dir.c50
-rw-r--r--fs/hpfs/Kconfig2
-rw-r--r--fs/hpfs/dir.c23
-rw-r--r--fs/hpfs/file.c9
-rw-r--r--fs/hpfs/hpfs_fn.h22
-rw-r--r--fs/hpfs/inode.c9
-rw-r--r--fs/hpfs/namei.c49
-rw-r--r--fs/hpfs/super.c23
-rw-r--r--fs/inode.c63
-rw-r--r--fs/internal.h15
-rw-r--r--fs/isofs/export.c8
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/security.c5
-rw-r--r--fs/jffs2/write.c18
-rw-r--r--fs/jffs2/xattr.h5
-rw-r--r--fs/jfs/jfs_xattr.h5
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/xattr.c6
-rw-r--r--fs/locks.c1
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/namei.c1493
-rw-r--r--fs/namespace.c61
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/client.c131
-rw-r--r--fs/nfs/dir.c13
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/getroot.c42
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/inode.c9
-rw-r--r--fs/nfs/internal.h43
-rw-r--r--fs/nfs/namespace.c66
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4_fs.h38
-rw-r--r--fs/nfs/nfs4filelayout.c361
-rw-r--r--fs/nfs/nfs4filelayout.h19
-rw-r--r--fs/nfs/nfs4filelayoutdev.c256
-rw-r--r--fs/nfs/nfs4namespace.c41
-rw-r--r--fs/nfs/nfs4proc.c254
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c35
-rw-r--r--fs/nfs/nfs4xdr.c42
-rw-r--r--fs/nfs/nfsroot.c29
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c330
-rw-r--r--fs/nfs/pnfs.h118
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c478
-rw-r--r--fs/nfs/unlink.c22
-rw-r--r--fs/nfs/write.c155
-rw-r--r--fs/nfsctl.c21
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4state.c13
-rw-r--r--fs/nfsd/nfs4xdr.c4
-rw-r--r--fs/nilfs2/btnode.c5
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/mdt.c4
-rw-r--r--fs/nilfs2/namei.c8
-rw-r--r--fs/nilfs2/page.c13
-rw-r--r--fs/nilfs2/page.h1
-rw-r--r--fs/nilfs2/segment.c3
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/ocfs2/dcache.c2
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c27
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/super.c35
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/open.c137
-rw-r--r--fs/partitions/ldm.c5
-rw-r--r--fs/partitions/osf.c12
-rw-r--r--fs/proc/base.c30
-rw-r--r--fs/proc/inode.c8
-rw-r--r--fs/proc/proc_devtree.c2
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/pstore/Kconfig13
-rw-r--r--fs/pstore/Makefile7
-rw-r--r--fs/pstore/inode.c285
-rw-r--r--fs/pstore/internal.h7
-rw-r--r--fs/pstore/platform.c202
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/reiserfs/inode.c7
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/namei.c15
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/stat.c7
-rw-r--r--fs/statfs.c176
-rw-r--r--fs/super.c67
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/udf/balloc.c4
-rw-r--r--fs/udf/file.c7
-rw-r--r--fs/udf/inode.c239
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/inode.c78
-rw-r--r--fs/ufs/namei.c44
-rw-r--r--fs/ufs/super.c64
-rw-r--r--fs/ufs/truncate.c5
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/util.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c9
-rw-r--r--fs/xfs/xfs_fsops.c3
-rw-r--r--fs/xfs/xfs_mru_cache.c2
214 files changed, 6752 insertions, 3923 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf616318..515455296378 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
21#include <linux/posix_acl_xattr.h> 21#include <linux/posix_acl_xattr.h>
22#include "xattr.h" 22#include "xattr.h"
23#include "acl.h" 23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h" 24#include "v9fs.h"
25#include "v9fs_vfs.h"
26 26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) 27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
59 struct v9fs_session_info *v9ses; 59 struct v9fs_session_info *v9ses;
60 60
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 62 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
63 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); 64 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 65 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0; 66 return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) { 72 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); 73 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); 74 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else 75 } else
77 retval = -EIO; 76 retval = -EIO;
78 77
78 if (!IS_ERR(dacl))
79 posix_acl_release(dacl);
80
81 if (!IS_ERR(pacl))
82 posix_acl_release(pacl);
83
79 return retval; 84 return retval;
80} 85}
81 86
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
100 return -ECHILD; 105 return -ECHILD;
101 106
102 v9ses = v9fs_inode2v9ses(inode); 107 v9ses = v9fs_inode2v9ses(inode);
103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 108 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
109 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
104 /* 110 /*
105 * On access = client mode get the acl 111 * On access = client and acl = on mode get the acl
106 * values from the server 112 * values from the server
107 */ 113 */
108 return 0; 114 return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
128 struct inode *inode = dentry->d_inode; 134 struct inode *inode = dentry->d_inode;
129 135
130 set_cached_acl(inode, type, acl); 136 set_cached_acl(inode, type, acl);
137
138 if (!acl)
139 return 0;
140
131 /* Set a setxattr request to server */ 141 /* Set a setxattr request to server */
132 size = posix_acl_xattr_size(acl->a_count); 142 size = posix_acl_xattr_size(acl->a_count);
133 buffer = kmalloc(size, GFP_KERNEL); 143 buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
177int v9fs_set_create_acl(struct dentry *dentry, 187int v9fs_set_create_acl(struct dentry *dentry,
178 struct posix_acl *dpacl, struct posix_acl *pacl) 188 struct posix_acl *dpacl, struct posix_acl *pacl)
179{ 189{
180 if (dpacl) 190 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
181 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); 191 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
182 if (pacl)
183 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
184 posix_acl_release(dpacl); 192 posix_acl_release(dpacl);
185 posix_acl_release(pacl); 193 posix_acl_release(pacl);
186 return 0; 194 return 0;
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac2..5b335c5086a1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
33 33
34#define CACHETAG_LEN 11 34#define CACHETAG_LEN 11
35 35
36struct kmem_cache *vcookie_cache;
37
38struct fscache_netfs v9fs_cache_netfs = { 36struct fscache_netfs v9fs_cache_netfs = {
39 .name = "9p", 37 .name = "9p",
40 .version = 0, 38 .version = 0,
41}; 39};
42 40
43static void init_once(void *foo)
44{
45 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
46 vcookie->fscache = NULL;
47 vcookie->qid = NULL;
48 inode_init_once(&vcookie->inode);
49}
50
51/**
52 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
53 * vcookie to inode mapping
54 *
55 * Returns 0 on success.
56 */
57
58static int v9fs_init_vcookiecache(void)
59{
60 vcookie_cache = kmem_cache_create("vcookie_cache",
61 sizeof(struct v9fs_cookie),
62 0, (SLAB_RECLAIM_ACCOUNT|
63 SLAB_MEM_SPREAD),
64 init_once);
65 if (!vcookie_cache)
66 return -ENOMEM;
67
68 return 0;
69}
70
71/**
72 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
73 *
74 */
75
76static void v9fs_destroy_vcookiecache(void)
77{
78 kmem_cache_destroy(vcookie_cache);
79}
80
81int __v9fs_cache_register(void)
82{
83 int ret;
84 ret = v9fs_init_vcookiecache();
85 if (ret < 0)
86 return ret;
87
88 return fscache_register_netfs(&v9fs_cache_netfs);
89}
90
91void __v9fs_cache_unregister(void)
92{
93 v9fs_destroy_vcookiecache();
94 fscache_unregister_netfs(&v9fs_cache_netfs);
95}
96
97/** 41/**
98 * v9fs_random_cachetag - Generate a random tag to be associated 42 * v9fs_random_cachetag - Generate a random tag to be associated
99 * with a new cache session. 43 * with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
133} 77}
134 78
135const struct fscache_cookie_def v9fs_cache_session_index_def = { 79const struct fscache_cookie_def v9fs_cache_session_index_def = {
136 .name = "9P.session", 80 .name = "9P.session",
137 .type = FSCACHE_COOKIE_TYPE_INDEX, 81 .type = FSCACHE_COOKIE_TYPE_INDEX,
138 .get_key = v9fs_cache_session_get_key, 82 .get_key = v9fs_cache_session_get_key,
139}; 83};
140 84
141void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) 85void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
163static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, 107static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
164 void *buffer, uint16_t bufmax) 108 void *buffer, uint16_t bufmax)
165{ 109{
166 const struct v9fs_cookie *vcookie = cookie_netfs_data; 110 const struct v9fs_inode *v9inode = cookie_netfs_data;
167 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); 111 memcpy(buffer, &v9inode->fscache_key->path,
168 112 sizeof(v9inode->fscache_key->path));
169 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, 113 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
170 vcookie->qid->path); 114 v9inode->fscache_key->path);
171 return sizeof(vcookie->qid->path); 115 return sizeof(v9inode->fscache_key->path);
172} 116}
173 117
174static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, 118static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
175 uint64_t *size) 119 uint64_t *size)
176{ 120{
177 const struct v9fs_cookie *vcookie = cookie_netfs_data; 121 const struct v9fs_inode *v9inode = cookie_netfs_data;
178 *size = i_size_read(&vcookie->inode); 122 *size = i_size_read(&v9inode->vfs_inode);
179 123
180 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, 124 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
181 *size); 125 *size);
182} 126}
183 127
184static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, 128static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
185 void *buffer, uint16_t buflen) 129 void *buffer, uint16_t buflen)
186{ 130{
187 const struct v9fs_cookie *vcookie = cookie_netfs_data; 131 const struct v9fs_inode *v9inode = cookie_netfs_data;
188 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); 132 memcpy(buffer, &v9inode->fscache_key->version,
189 133 sizeof(v9inode->fscache_key->version));
190 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, 134 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
191 vcookie->qid->version); 135 v9inode->fscache_key->version);
192 return sizeof(vcookie->qid->version); 136 return sizeof(v9inode->fscache_key->version);
193} 137}
194 138
195static enum 139static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
197 const void *buffer, 141 const void *buffer,
198 uint16_t buflen) 142 uint16_t buflen)
199{ 143{
200 const struct v9fs_cookie *vcookie = cookie_netfs_data; 144 const struct v9fs_inode *v9inode = cookie_netfs_data;
201 145
202 if (buflen != sizeof(vcookie->qid->version)) 146 if (buflen != sizeof(v9inode->fscache_key->version))
203 return FSCACHE_CHECKAUX_OBSOLETE; 147 return FSCACHE_CHECKAUX_OBSOLETE;
204 148
205 if (memcmp(buffer, &vcookie->qid->version, 149 if (memcmp(buffer, &v9inode->fscache_key->version,
206 sizeof(vcookie->qid->version))) 150 sizeof(v9inode->fscache_key->version)))
207 return FSCACHE_CHECKAUX_OBSOLETE; 151 return FSCACHE_CHECKAUX_OBSOLETE;
208 152
209 return FSCACHE_CHECKAUX_OKAY; 153 return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
211 155
212static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) 156static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
213{ 157{
214 struct v9fs_cookie *vcookie = cookie_netfs_data; 158 struct v9fs_inode *v9inode = cookie_netfs_data;
215 struct pagevec pvec; 159 struct pagevec pvec;
216 pgoff_t first; 160 pgoff_t first;
217 int loop, nr_pages; 161 int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
220 first = 0; 164 first = 0;
221 165
222 for (;;) { 166 for (;;) {
223 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, 167 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
224 first, 168 first,
225 PAGEVEC_SIZE - pagevec_count(&pvec)); 169 PAGEVEC_SIZE - pagevec_count(&pvec));
226 if (!nr_pages) 170 if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
249 193
250void v9fs_cache_inode_get_cookie(struct inode *inode) 194void v9fs_cache_inode_get_cookie(struct inode *inode)
251{ 195{
252 struct v9fs_cookie *vcookie; 196 struct v9fs_inode *v9inode;
253 struct v9fs_session_info *v9ses; 197 struct v9fs_session_info *v9ses;
254 198
255 if (!S_ISREG(inode->i_mode)) 199 if (!S_ISREG(inode->i_mode))
256 return; 200 return;
257 201
258 vcookie = v9fs_inode2cookie(inode); 202 v9inode = V9FS_I(inode);
259 if (vcookie->fscache) 203 if (v9inode->fscache)
260 return; 204 return;
261 205
262 v9ses = v9fs_inode2v9ses(inode); 206 v9ses = v9fs_inode2v9ses(inode);
263 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 207 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
264 &v9fs_cache_inode_index_def, 208 &v9fs_cache_inode_index_def,
265 vcookie); 209 v9inode);
266 210
267 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, 211 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
268 vcookie->fscache); 212 v9inode->fscache);
269} 213}
270 214
271void v9fs_cache_inode_put_cookie(struct inode *inode) 215void v9fs_cache_inode_put_cookie(struct inode *inode)
272{ 216{
273 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 217 struct v9fs_inode *v9inode = V9FS_I(inode);
274 218
275 if (!vcookie->fscache) 219 if (!v9inode->fscache)
276 return; 220 return;
277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, 221 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
278 vcookie->fscache); 222 v9inode->fscache);
279 223
280 fscache_relinquish_cookie(vcookie->fscache, 0); 224 fscache_relinquish_cookie(v9inode->fscache, 0);
281 vcookie->fscache = NULL; 225 v9inode->fscache = NULL;
282} 226}
283 227
284void v9fs_cache_inode_flush_cookie(struct inode *inode) 228void v9fs_cache_inode_flush_cookie(struct inode *inode)
285{ 229{
286 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 230 struct v9fs_inode *v9inode = V9FS_I(inode);
287 231
288 if (!vcookie->fscache) 232 if (!v9inode->fscache)
289 return; 233 return;
290 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, 234 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
291 vcookie->fscache); 235 v9inode->fscache);
292 236
293 fscache_relinquish_cookie(vcookie->fscache, 1); 237 fscache_relinquish_cookie(v9inode->fscache, 1);
294 vcookie->fscache = NULL; 238 v9inode->fscache = NULL;
295} 239}
296 240
297void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) 241void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
298{ 242{
299 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 243 struct v9fs_inode *v9inode = V9FS_I(inode);
300 struct p9_fid *fid; 244 struct p9_fid *fid;
301 245
302 if (!vcookie->fscache) 246 if (!v9inode->fscache)
303 return; 247 return;
304 248
305 spin_lock(&vcookie->lock); 249 spin_lock(&v9inode->fscache_lock);
306 fid = filp->private_data; 250 fid = filp->private_data;
307 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 251 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
308 v9fs_cache_inode_flush_cookie(inode); 252 v9fs_cache_inode_flush_cookie(inode);
309 else 253 else
310 v9fs_cache_inode_get_cookie(inode); 254 v9fs_cache_inode_get_cookie(inode);
311 255
312 spin_unlock(&vcookie->lock); 256 spin_unlock(&v9inode->fscache_lock);
313} 257}
314 258
315void v9fs_cache_inode_reset_cookie(struct inode *inode) 259void v9fs_cache_inode_reset_cookie(struct inode *inode)
316{ 260{
317 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 261 struct v9fs_inode *v9inode = V9FS_I(inode);
318 struct v9fs_session_info *v9ses; 262 struct v9fs_session_info *v9ses;
319 struct fscache_cookie *old; 263 struct fscache_cookie *old;
320 264
321 if (!vcookie->fscache) 265 if (!v9inode->fscache)
322 return; 266 return;
323 267
324 old = vcookie->fscache; 268 old = v9inode->fscache;
325 269
326 spin_lock(&vcookie->lock); 270 spin_lock(&v9inode->fscache_lock);
327 fscache_relinquish_cookie(vcookie->fscache, 1); 271 fscache_relinquish_cookie(v9inode->fscache, 1);
328 272
329 v9ses = v9fs_inode2v9ses(inode); 273 v9ses = v9fs_inode2v9ses(inode);
330 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 274 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
331 &v9fs_cache_inode_index_def, 275 &v9fs_cache_inode_index_def,
332 vcookie); 276 v9inode);
333
334 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", 277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
335 inode, old, vcookie->fscache); 278 inode, old, v9inode->fscache);
336 279
337 spin_unlock(&vcookie->lock); 280 spin_unlock(&v9inode->fscache_lock);
338} 281}
339 282
340int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) 283int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
341{ 284{
342 struct inode *inode = page->mapping->host; 285 struct inode *inode = page->mapping->host;
343 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 286 struct v9fs_inode *v9inode = V9FS_I(inode);
344 287
345 BUG_ON(!vcookie->fscache); 288 BUG_ON(!v9inode->fscache);
346 289
347 return fscache_maybe_release_page(vcookie->fscache, page, gfp); 290 return fscache_maybe_release_page(v9inode->fscache, page, gfp);
348} 291}
349 292
350void __v9fs_fscache_invalidate_page(struct page *page) 293void __v9fs_fscache_invalidate_page(struct page *page)
351{ 294{
352 struct inode *inode = page->mapping->host; 295 struct inode *inode = page->mapping->host;
353 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 296 struct v9fs_inode *v9inode = V9FS_I(inode);
354 297
355 BUG_ON(!vcookie->fscache); 298 BUG_ON(!v9inode->fscache);
356 299
357 if (PageFsCache(page)) { 300 if (PageFsCache(page)) {
358 fscache_wait_on_page_write(vcookie->fscache, page); 301 fscache_wait_on_page_write(v9inode->fscache, page);
359 BUG_ON(!PageLocked(page)); 302 BUG_ON(!PageLocked(page));
360 fscache_uncache_page(vcookie->fscache, page); 303 fscache_uncache_page(v9inode->fscache, page);
361 } 304 }
362} 305}
363 306
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
380int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) 323int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
381{ 324{
382 int ret; 325 int ret;
383 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 326 const struct v9fs_inode *v9inode = V9FS_I(inode);
384 327
385 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 328 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
386 if (!vcookie->fscache) 329 if (!v9inode->fscache)
387 return -ENOBUFS; 330 return -ENOBUFS;
388 331
389 ret = fscache_read_or_alloc_page(vcookie->fscache, 332 ret = fscache_read_or_alloc_page(v9inode->fscache,
390 page, 333 page,
391 v9fs_vfs_readpage_complete, 334 v9fs_vfs_readpage_complete,
392 NULL, 335 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
418 unsigned *nr_pages) 361 unsigned *nr_pages)
419{ 362{
420 int ret; 363 int ret;
421 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 364 const struct v9fs_inode *v9inode = V9FS_I(inode);
422 365
423 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); 366 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
424 if (!vcookie->fscache) 367 if (!v9inode->fscache)
425 return -ENOBUFS; 368 return -ENOBUFS;
426 369
427 ret = fscache_read_or_alloc_pages(vcookie->fscache, 370 ret = fscache_read_or_alloc_pages(v9inode->fscache,
428 mapping, pages, nr_pages, 371 mapping, pages, nr_pages,
429 v9fs_vfs_readpage_complete, 372 v9fs_vfs_readpage_complete,
430 NULL, 373 NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
453void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) 396void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
454{ 397{
455 int ret; 398 int ret;
456 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 399 const struct v9fs_inode *v9inode = V9FS_I(inode);
457 400
458 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 401 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
459 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); 402 ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
460 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); 403 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
461 if (ret != 0) 404 if (ret != 0)
462 v9fs_uncache_page(inode, page); 405 v9fs_uncache_page(inode, page);
463} 406}
407
408/*
409 * wait for a page to complete writing to the cache
410 */
411void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
412{
413 const struct v9fs_inode *v9inode = V9FS_I(inode);
414 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
415 if (PageFsCache(page))
416 fscache_wait_on_page_write(v9inode->fscache, page);
417}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee8..049507a5b01c 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
25#include <linux/fscache.h> 25#include <linux/fscache.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27 27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs; 28extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def; 29extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def; 30extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
64 struct list_head *pages, 50 struct list_head *pages,
65 unsigned *nr_pages); 51 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); 52extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67 53extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
68 54 struct page *page);
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84 55
85static inline int v9fs_fscache_release_page(struct page *page, 56static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp) 57 gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
117 88
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 89static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{ 90{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 91 struct v9fs_inode *v9inode = V9FS_I(inode);
121 fscache_uncache_page(vcookie->fscache, page); 92 fscache_uncache_page(v9inode->fscache, page);
122 BUG_ON(PageFsCache(page)); 93 BUG_ON(PageFsCache(page));
123} 94}
124 95
125static inline void v9fs_vcookie_set_qid(struct inode *inode, 96static inline void v9fs_fscache_set_key(struct inode *inode,
126 struct p9_qid *qid) 97 struct p9_qid *qid)
127{ 98{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 99 struct v9fs_inode *v9inode = V9FS_I(inode);
129 spin_lock(&vcookie->lock); 100 spin_lock(&v9inode->fscache_lock);
130 vcookie->qid = qid; 101 v9inode->fscache_key = qid;
131 spin_unlock(&vcookie->lock); 102 spin_unlock(&v9inode->fscache_lock);
132} 103}
133 104
134#else /* CONFIG_9P_FSCACHE */ 105static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
135 106 struct page *page)
136static inline int v9fs_cache_register(void)
137{ 107{
138 return 1; 108 return __v9fs_fscache_wait_on_page_write(inode, page);
139} 109}
140 110
141static inline void v9fs_cache_unregister(void) {} 111#else /* CONFIG_9P_FSCACHE */
142 112
143static inline int v9fs_fscache_release_page(struct page *page, 113static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) { 114 gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 138static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{} 139{}
170 140
171static inline void v9fs_vcookie_set_qid(struct inode *inode, 141static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
172 struct p9_qid *qid) 142 struct page *page)
173{} 143{
144 return;
145}
174 146
175#endif /* CONFIG_9P_FSCACHE */ 147#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */ 148#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d70..cd63e002d826 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
125 return -ENOMEM; 125 return -ENOMEM;
126} 126}
127 127
128/** 128static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
129 * v9fs_fid_lookup - lookup for a fid, try to walk if not found 129 uid_t uid, int any)
130 * @dentry: dentry to look for fid in
131 *
132 * Look for a fid in the specified dentry for the current user.
133 * If no fid is found, try to create one walking from a fid from the parent
134 * dentry (if it has one), or the root dentry. If the user haven't accessed
135 * the fs yet, attach now and walk from the root.
136 */
137
138struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
139{ 130{
140 int i, n, l, clone, any, access;
141 u32 uid;
142 struct p9_fid *fid, *old_fid = NULL;
143 struct dentry *ds; 131 struct dentry *ds;
144 struct v9fs_session_info *v9ses;
145 char **wnames, *uname; 132 char **wnames, *uname;
133 int i, n, l, clone, access;
134 struct v9fs_session_info *v9ses;
135 struct p9_fid *fid, *old_fid = NULL;
146 136
147 v9ses = v9fs_inode2v9ses(dentry->d_inode); 137 v9ses = v9fs_inode2v9ses(dentry->d_inode);
148 access = v9ses->flags & V9FS_ACCESS_MASK; 138 access = v9ses->flags & V9FS_ACCESS_MASK;
149 switch (access) {
150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
153 uid = current_fsuid();
154 any = 0;
155 break;
156
157 case V9FS_ACCESS_ANY:
158 uid = v9ses->uid;
159 any = 1;
160 break;
161
162 default:
163 uid = ~0;
164 any = 0;
165 break;
166 }
167
168 fid = v9fs_fid_find(dentry, uid, any); 139 fid = v9fs_fid_find(dentry, uid, any);
169 if (fid) 140 if (fid)
170 return fid; 141 return fid;
@@ -250,6 +221,45 @@ err_out:
250 return fid; 221 return fid;
251} 222}
252 223
224/**
225 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
226 * @dentry: dentry to look for fid in
227 *
228 * Look for a fid in the specified dentry for the current user.
229 * If no fid is found, try to create one walking from a fid from the parent
230 * dentry (if it has one), or the root dentry. If the user haven't accessed
231 * the fs yet, attach now and walk from the root.
232 */
233
234struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
235{
236 uid_t uid;
237 int any, access;
238 struct v9fs_session_info *v9ses;
239
240 v9ses = v9fs_inode2v9ses(dentry->d_inode);
241 access = v9ses->flags & V9FS_ACCESS_MASK;
242 switch (access) {
243 case V9FS_ACCESS_SINGLE:
244 case V9FS_ACCESS_USER:
245 case V9FS_ACCESS_CLIENT:
246 uid = current_fsuid();
247 any = 0;
248 break;
249
250 case V9FS_ACCESS_ANY:
251 uid = v9ses->uid;
252 any = 1;
253 break;
254
255 default:
256 uid = ~0;
257 any = 0;
258 break;
259 }
260 return v9fs_fid_lookup_with_uid(dentry, uid, any);
261}
262
253struct p9_fid *v9fs_fid_clone(struct dentry *dentry) 263struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
254{ 264{
255 struct p9_fid *fid, *ret; 265 struct p9_fid *fid, *ret;
@@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
261 ret = p9_client_walk(fid, 0, NULL, 1); 271 ret = p9_client_walk(fid, 0, NULL, 1);
262 return ret; 272 return ret;
263} 273}
274
275static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
276{
277 struct p9_fid *fid, *ret;
278
279 fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
280 if (IS_ERR(fid))
281 return fid;
282
283 ret = p9_client_walk(fid, 0, NULL, 1);
284 return ret;
285}
286
287struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
288{
289 int err;
290 struct p9_fid *fid;
291
292 fid = v9fs_fid_clone_with_uid(dentry, 0);
293 if (IS_ERR(fid))
294 goto error_out;
295 /*
296 * writeback fid will only be used to write back the
297 * dirty pages. We always request for the open fid in read-write
298 * mode so that a partial page write which result in page
299 * read can work.
300 */
301 err = p9_client_open(fid, O_RDWR);
302 if (err < 0) {
303 p9_client_clunk(fid);
304 fid = ERR_PTR(err);
305 goto error_out;
306 }
307error_out:
308 return fid;
309}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996d..bb0b6e7f58fc 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
19 * Boston, MA 02111-1301 USA 19 * Boston, MA 02111-1301 USA
20 * 20 *
21 */ 21 */
22 22#ifndef FS_9P_FID_H
23#define FS_9P_FID_H
23#include <linux/list.h> 24#include <linux/list.h>
24 25
25/** 26/**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
45struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); 46struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
46struct p9_fid *v9fs_fid_clone(struct dentry *dentry); 47struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
47int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); 48int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
49struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
50#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba83..c82b017f51f3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
39 39
40static DEFINE_SPINLOCK(v9fs_sessionlist_lock); 40static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
41static LIST_HEAD(v9fs_sessionlist); 41static LIST_HEAD(v9fs_sessionlist);
42struct kmem_cache *v9fs_inode_cache;
42 43
43/* 44/*
44 * Option Parsing (code inspired by NFS code) 45 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
55 /* Cache options */ 56 /* Cache options */
56 Opt_cache_loose, Opt_fscache, 57 Opt_cache_loose, Opt_fscache,
57 /* Access options */ 58 /* Access options */
58 Opt_access, 59 Opt_access, Opt_posixacl,
59 /* Error token */ 60 /* Error token */
60 Opt_err 61 Opt_err
61}; 62};
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
73 {Opt_fscache, "fscache"}, 74 {Opt_fscache, "fscache"},
74 {Opt_cachetag, "cachetag=%s"}, 75 {Opt_cachetag, "cachetag=%s"},
75 {Opt_access, "access=%s"}, 76 {Opt_access, "access=%s"},
77 {Opt_posixacl, "posixacl"},
76 {Opt_err, NULL} 78 {Opt_err, NULL}
77}; 79};
78 80
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 else if (strcmp(s, "any") == 0) 196 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 197 v9ses->flags |= V9FS_ACCESS_ANY;
196 else if (strcmp(s, "client") == 0) { 198 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT; 199 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else { 200 } else {
207 v9ses->flags |= V9FS_ACCESS_SINGLE; 201 v9ses->flags |= V9FS_ACCESS_SINGLE;
208 v9ses->uid = simple_strtoul(s, &e, 10); 202 v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
212 kfree(s); 206 kfree(s);
213 break; 207 break;
214 208
209 case Opt_posixacl:
210#ifdef CONFIG_9P_FS_POSIX_ACL
211 v9ses->flags |= V9FS_POSIX_ACL;
212#else
213 P9_DPRINTK(P9_DEBUG_ERROR,
214 "Not defined CONFIG_9P_FS_POSIX_ACL. "
215 "Ignoring posixacl option\n");
216#endif
217 break;
218
215 default: 219 default:
216 continue; 220 continue;
217 } 221 }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
260 list_add(&v9ses->slist, &v9fs_sessionlist); 264 list_add(&v9ses->slist, &v9fs_sessionlist);
261 spin_unlock(&v9fs_sessionlist_lock); 265 spin_unlock(&v9fs_sessionlist_lock);
262 266
263 v9ses->flags = V9FS_ACCESS_USER;
264 strcpy(v9ses->uname, V9FS_DEFUSER); 267 strcpy(v9ses->uname, V9FS_DEFUSER);
265 strcpy(v9ses->aname, V9FS_DEFANAME); 268 strcpy(v9ses->aname, V9FS_DEFANAME);
266 v9ses->uid = ~0; 269 v9ses->uid = ~0;
267 v9ses->dfltuid = V9FS_DEFUID; 270 v9ses->dfltuid = V9FS_DEFUID;
268 v9ses->dfltgid = V9FS_DEFGID; 271 v9ses->dfltgid = V9FS_DEFGID;
269 272
270 rc = v9fs_parse_options(v9ses, data);
271 if (rc < 0) {
272 retval = rc;
273 goto error;
274 }
275
276 v9ses->clnt = p9_client_create(dev_name, data); 273 v9ses->clnt = p9_client_create(dev_name, data);
277 if (IS_ERR(v9ses->clnt)) { 274 if (IS_ERR(v9ses->clnt)) {
278 retval = PTR_ERR(v9ses->clnt); 275 retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
281 goto error; 278 goto error;
282 } 279 }
283 280
284 if (p9_is_proto_dotl(v9ses->clnt)) 281 v9ses->flags = V9FS_ACCESS_USER;
282
283 if (p9_is_proto_dotl(v9ses->clnt)) {
284 v9ses->flags = V9FS_ACCESS_CLIENT;
285 v9ses->flags |= V9FS_PROTO_2000L; 285 v9ses->flags |= V9FS_PROTO_2000L;
286 else if (p9_is_proto_dotu(v9ses->clnt)) 286 } else if (p9_is_proto_dotu(v9ses->clnt)) {
287 v9ses->flags |= V9FS_PROTO_2000U; 287 v9ses->flags |= V9FS_PROTO_2000U;
288 }
289
290 rc = v9fs_parse_options(v9ses, data);
291 if (rc < 0) {
292 retval = rc;
293 goto error;
294 }
288 295
289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 296 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
290 297
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
306 v9ses->flags |= V9FS_ACCESS_ANY; 313 v9ses->flags |= V9FS_ACCESS_ANY;
307 v9ses->uid = ~0; 314 v9ses->uid = ~0;
308 } 315 }
316 if (!v9fs_proto_dotl(v9ses) ||
317 !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
318 /*
319 * We support ACL checks on clinet only if the protocol is
320 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
321 */
322 v9ses->flags &= ~V9FS_ACL_MASK;
323 }
309 324
310 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, 325 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
311 v9ses->aname); 326 v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
467 kobject_put(v9fs_kobj); 482 kobject_put(v9fs_kobj);
468} 483}
469 484
485static void v9fs_inode_init_once(void *foo)
486{
487 struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
488#ifdef CONFIG_9P_FSCACHE
489 v9inode->fscache = NULL;
490 v9inode->fscache_key = NULL;
491#endif
492 inode_init_once(&v9inode->vfs_inode);
493}
494
495/**
496 * v9fs_init_inode_cache - initialize a cache for 9P
497 * Returns 0 on success.
498 */
499static int v9fs_init_inode_cache(void)
500{
501 v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
502 sizeof(struct v9fs_inode),
503 0, (SLAB_RECLAIM_ACCOUNT|
504 SLAB_MEM_SPREAD),
505 v9fs_inode_init_once);
506 if (!v9fs_inode_cache)
507 return -ENOMEM;
508
509 return 0;
510}
511
512/**
513 * v9fs_destroy_inode_cache - destroy the cache of 9P inode
514 *
515 */
516static void v9fs_destroy_inode_cache(void)
517{
518 kmem_cache_destroy(v9fs_inode_cache);
519}
520
521static int v9fs_cache_register(void)
522{
523 int ret;
524 ret = v9fs_init_inode_cache();
525 if (ret < 0)
526 return ret;
527#ifdef CONFIG_9P_FSCACHE
528 return fscache_register_netfs(&v9fs_cache_netfs);
529#else
530 return ret;
531#endif
532}
533
534static void v9fs_cache_unregister(void)
535{
536 v9fs_destroy_inode_cache();
537#ifdef CONFIG_9P_FSCACHE
538 fscache_unregister_netfs(&v9fs_cache_netfs);
539#endif
540}
541
470/** 542/**
471 * init_v9fs - Initialize module 543 * init_v9fs - Initialize module
472 * 544 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0d..bd8496db135b 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_H
24#define FS_9P_V9FS_H
25
23#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
24 27
25/** 28/**
@@ -28,8 +31,10 @@
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions 31 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 32 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 33 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
34 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
31 * @V9FS_ACCESS_ANY: use a single attach for all users 35 * @V9FS_ACCESS_ANY: use a single attach for all users
32 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options 36 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
37 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
33 * 38 *
34 * Session flags reflect options selected by users at mount time 39 * Session flags reflect options selected by users at mount time
35 */ 40 */
@@ -37,13 +42,15 @@
37 V9FS_ACCESS_USER | \ 42 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT) 43 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY 44#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
45#define V9FS_ACL_MASK V9FS_POSIX_ACL
40 46
41enum p9_session_flags { 47enum p9_session_flags {
42 V9FS_PROTO_2000U = 0x01, 48 V9FS_PROTO_2000U = 0x01,
43 V9FS_PROTO_2000L = 0x02, 49 V9FS_PROTO_2000L = 0x02,
44 V9FS_ACCESS_SINGLE = 0x04, 50 V9FS_ACCESS_SINGLE = 0x04,
45 V9FS_ACCESS_USER = 0x08, 51 V9FS_ACCESS_USER = 0x08,
46 V9FS_ACCESS_CLIENT = 0x10 52 V9FS_ACCESS_CLIENT = 0x10,
53 V9FS_POSIX_ACL = 0x20
47}; 54};
48 55
49/* possible values of ->cache */ 56/* possible values of ->cache */
@@ -109,8 +116,28 @@ struct v9fs_session_info {
109 struct list_head slist; /* list of sessions registered with v9fs */ 116 struct list_head slist; /* list of sessions registered with v9fs */
110 struct backing_dev_info bdi; 117 struct backing_dev_info bdi;
111 struct rw_semaphore rename_sem; 118 struct rw_semaphore rename_sem;
119 struct p9_fid *root_fid; /* Used for file system sync */
120};
121
122/* cache_validity flags */
123#define V9FS_INO_INVALID_ATTR 0x01
124
125struct v9fs_inode {
126#ifdef CONFIG_9P_FSCACHE
127 spinlock_t fscache_lock;
128 struct fscache_cookie *fscache;
129 struct p9_qid *fscache_key;
130#endif
131 unsigned int cache_validity;
132 struct p9_fid *writeback_fid;
133 struct inode vfs_inode;
112}; 134};
113 135
136static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
137{
138 return container_of(inode, struct v9fs_inode, vfs_inode);
139}
140
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 141struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 142 char *);
116extern void v9fs_session_close(struct v9fs_session_info *v9ses); 143extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry); 151 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, 152extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p); 153 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, 154extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid, 155 struct p9_fid *fid,
129 struct super_block *sb); 156 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl; 157extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl; 158extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl; 159extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, 160extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid, 161 struct p9_fid *fid,
136 struct super_block *sb); 162 struct super_block *sb);
137 163
138/* other default globals */ 164/* other default globals */
139#define V9FS_PORT 564 165#define V9FS_PORT 564
@@ -158,7 +184,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
158} 184}
159 185
160/** 186/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by 187 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request 188 * issuing a attribute request
163 * @v9ses: session information 189 * @v9ses: session information
164 * @fid: fid to issue attribute request for 190 * @fid: fid to issue attribute request for
@@ -166,11 +192,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
166 * 192 *
167 */ 193 */
168static inline struct inode * 194static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, 195v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb) 196 struct super_block *sb)
171{ 197{
172 if (v9fs_proto_dotl(v9ses)) 198 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb); 199 return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
174 else 200 else
175 return v9fs_inode(v9ses, fid, sb); 201 return v9fs_inode_from_fid(v9ses, fid, sb);
176} 202}
203#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597ec..4014160903a9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_VFS_H
24#define FS_9P_V9FS_VFS_H
23 25
24/* plan9 semantics are that created files are implicitly opened. 26/* plan9 semantics are that created files are implicitly opened.
25 * But linux semantics are that you call create, then open. 27 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
36 * unlink calls remove, which is an implicit clunk. So we have to track 38 * unlink calls remove, which is an implicit clunk. So we have to track
37 * that kind of thing so that we don't try to clunk a dead fid. 39 * that kind of thing so that we don't try to clunk a dead fid.
38 */ 40 */
41#define P9_LOCK_TIMEOUT (30*HZ)
39 42
40extern struct file_system_type v9fs_fs_type; 43extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 44extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl; 48extern const struct file_operations v9fs_dir_operations_dotl;
46extern const struct dentry_operations v9fs_dentry_operations; 49extern const struct dentry_operations v9fs_dentry_operations;
47extern const struct dentry_operations v9fs_cached_dentry_operations; 50extern const struct dentry_operations v9fs_cached_dentry_operations;
51extern const struct file_operations v9fs_cached_file_operations;
52extern const struct file_operations v9fs_cached_file_operations_dotl;
53extern struct kmem_cache *v9fs_inode_cache;
48 54
49#ifdef CONFIG_9P_FSCACHE
50struct inode *v9fs_alloc_inode(struct super_block *sb); 55struct inode *v9fs_alloc_inode(struct super_block *sb);
51void v9fs_destroy_inode(struct inode *inode); 56void v9fs_destroy_inode(struct inode *inode);
52#endif
53
54struct inode *v9fs_get_inode(struct super_block *sb, int mode); 57struct inode *v9fs_get_inode(struct super_block *sb, int mode);
58int v9fs_init_inode(struct v9fs_session_info *v9ses,
59 struct inode *inode, int mode);
55void v9fs_evict_inode(struct inode *inode); 60void v9fs_evict_inode(struct inode *inode);
56ino_t v9fs_qid2ino(struct p9_qid *qid); 61ino_t v9fs_qid2ino(struct p9_qid *qid);
57void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 62void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62int v9fs_uflags2omode(int uflags, int extended); 67int v9fs_uflags2omode(int uflags, int extended);
63 68
64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 69ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
70ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
65void v9fs_blank_wstat(struct p9_wstat *wstat); 71void v9fs_blank_wstat(struct p9_wstat *wstat);
66int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); 72int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
67int v9fs_file_fsync_dotl(struct file *filp, int datasync); 73int v9fs_file_fsync_dotl(struct file *filp, int datasync);
68 74ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
69#define P9_LOCK_TIMEOUT (30*HZ) 75 const char __user *, size_t, loff_t *, int);
76int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
77int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
78static inline void v9fs_invalidate_inode_attr(struct inode *inode)
79{
80 struct v9fs_inode *v9inode;
81 v9inode = V9FS_I(inode);
82 v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
83 return;
84}
85#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863e..2524e4cbb8ea 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h" 41#include "cache.h"
42#include "fid.h"
42 43
43/** 44/**
44 * v9fs_vfs_readpage - read an entire page in from 9P 45 * v9fs_fid_readpage - read an entire page in from 9P
45 * 46 *
46 * @filp: file being read 47 * @fid: fid being read
47 * @page: structure to page 48 * @page: structure to page
48 * 49 *
49 */ 50 */
50 51static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 int retval; 53 int retval;
54 loff_t offset; 54 loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
67 buffer = kmap(page); 67 buffer = kmap(page);
68 offset = page_offset(page); 68 offset = page_offset(page);
69 69
70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
71 if (retval < 0) { 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page); 72 v9fs_uncache_page(inode, page);
73 goto done; 73 goto done;
@@ -87,6 +87,19 @@ done:
87} 87}
88 88
89/** 89/**
90 * v9fs_vfs_readpage - read an entire page in from 9P
91 *
92 * @filp: file being read
93 * @page: structure to page
94 *
95 */
96
97static int v9fs_vfs_readpage(struct file *filp, struct page *page)
98{
99 return v9fs_fid_readpage(filp->private_data, page);
100}
101
102/**
90 * v9fs_vfs_readpages - read a set of pages from 9P 103 * v9fs_vfs_readpages - read a set of pages from 9P
91 * 104 *
92 * @filp: file being read 105 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
124{ 137{
125 if (PagePrivate(page)) 138 if (PagePrivate(page))
126 return 0; 139 return 0;
127
128 return v9fs_fscache_release_page(page, gfp); 140 return v9fs_fscache_release_page(page, gfp);
129} 141}
130 142
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
137 149
138static void v9fs_invalidate_page(struct page *page, unsigned long offset) 150static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{ 151{
152 /*
153 * If called with zero offset, we should release
154 * the private state assocated with the page
155 */
140 if (offset == 0) 156 if (offset == 0)
141 v9fs_fscache_invalidate_page(page); 157 v9fs_fscache_invalidate_page(page);
142} 158}
143 159
160static int v9fs_vfs_writepage_locked(struct page *page)
161{
162 char *buffer;
163 int retval, len;
164 loff_t offset, size;
165 mm_segment_t old_fs;
166 struct v9fs_inode *v9inode;
167 struct inode *inode = page->mapping->host;
168
169 v9inode = V9FS_I(inode);
170 size = i_size_read(inode);
171 if (page->index == size >> PAGE_CACHE_SHIFT)
172 len = size & ~PAGE_CACHE_MASK;
173 else
174 len = PAGE_CACHE_SIZE;
175
176 set_page_writeback(page);
177
178 buffer = kmap(page);
179 offset = page_offset(page);
180
181 old_fs = get_fs();
182 set_fs(get_ds());
183 /* We should have writeback_fid always set */
184 BUG_ON(!v9inode->writeback_fid);
185
186 retval = v9fs_file_write_internal(inode,
187 v9inode->writeback_fid,
188 (__force const char __user *)buffer,
189 len, &offset, 0);
190 if (retval > 0)
191 retval = 0;
192
193 set_fs(old_fs);
194 kunmap(page);
195 end_page_writeback(page);
196 return retval;
197}
198
199static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
200{
201 int retval;
202
203 retval = v9fs_vfs_writepage_locked(page);
204 if (retval < 0) {
205 if (retval == -EAGAIN) {
206 redirty_page_for_writepage(wbc, page);
207 retval = 0;
208 } else {
209 SetPageError(page);
210 mapping_set_error(page->mapping, retval);
211 }
212 } else
213 retval = 0;
214
215 unlock_page(page);
216 return retval;
217}
218
144/** 219/**
145 * v9fs_launder_page - Writeback a dirty page 220 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success. 221 * Returns 0 on success.
150 */ 222 */
151 223
152static int v9fs_launder_page(struct page *page) 224static int v9fs_launder_page(struct page *page)
153{ 225{
226 int retval;
227 struct inode *inode = page->mapping->host;
228
229 v9fs_fscache_wait_on_page_write(inode, page);
230 if (clear_page_dirty_for_io(page)) {
231 retval = v9fs_vfs_writepage_locked(page);
232 if (retval)
233 return retval;
234 }
154 return 0; 235 return 0;
155} 236}
156 237
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
173 * with an error. 254 * with an error.
174 * 255 *
175 */ 256 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 257static ssize_t
177 loff_t pos, unsigned long nr_segs) 258v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t pos, unsigned long nr_segs)
178{ 260{
261 /*
262 * FIXME
263 * Now that we do caching with cache mode enabled, We need
264 * to support direct IO
265 */
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " 266 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n", 267 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name, 268 iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
183 270
184 return -EINVAL; 271 return -EINVAL;
185} 272}
273
274static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
275 loff_t pos, unsigned len, unsigned flags,
276 struct page **pagep, void **fsdata)
277{
278 int retval = 0;
279 struct page *page;
280 struct v9fs_inode *v9inode;
281 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
282 struct inode *inode = mapping->host;
283
284 v9inode = V9FS_I(inode);
285start:
286 page = grab_cache_page_write_begin(mapping, index, flags);
287 if (!page) {
288 retval = -ENOMEM;
289 goto out;
290 }
291 BUG_ON(!v9inode->writeback_fid);
292 if (PageUptodate(page))
293 goto out;
294
295 if (len == PAGE_CACHE_SIZE)
296 goto out;
297
298 retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
299 page_cache_release(page);
300 if (!retval)
301 goto start;
302out:
303 *pagep = page;
304 return retval;
305}
306
307static int v9fs_write_end(struct file *filp, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned copied,
309 struct page *page, void *fsdata)
310{
311 loff_t last_pos = pos + copied;
312 struct inode *inode = page->mapping->host;
313
314 if (unlikely(copied < len)) {
315 /*
316 * zero out the rest of the area
317 */
318 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
319
320 zero_user(page, from + copied, len - copied);
321 flush_dcache_page(page);
322 }
323
324 if (!PageUptodate(page))
325 SetPageUptodate(page);
326 /*
327 * No need to use i_size_read() here, the i_size
328 * cannot change under us because we hold the i_mutex.
329 */
330 if (last_pos > inode->i_size) {
331 inode_add_bytes(inode, last_pos - inode->i_size);
332 i_size_write(inode, last_pos);
333 }
334 set_page_dirty(page);
335 unlock_page(page);
336 page_cache_release(page);
337
338 return copied;
339}
340
341
186const struct address_space_operations v9fs_addr_operations = { 342const struct address_space_operations v9fs_addr_operations = {
187 .readpage = v9fs_vfs_readpage, 343 .readpage = v9fs_vfs_readpage,
188 .readpages = v9fs_vfs_readpages, 344 .readpages = v9fs_vfs_readpages,
189 .releasepage = v9fs_release_page, 345 .set_page_dirty = __set_page_dirty_nobuffers,
190 .invalidatepage = v9fs_invalidate_page, 346 .writepage = v9fs_vfs_writepage,
191 .launder_page = v9fs_launder_page, 347 .write_begin = v9fs_write_begin,
192 .direct_IO = v9fs_direct_IO, 348 .write_end = v9fs_write_end,
349 .releasepage = v9fs_release_page,
350 .invalidatepage = v9fs_invalidate_page,
351 .launder_page = v9fs_launder_page,
352 .direct_IO = v9fs_direct_IO,
193}; 353};
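
The write_begin/write_end pair added above plugs 9p into the generic buffered-write path. As a rough illustration of what v9fs_write_end computes (zero the uncopied tail of the region requested in write_begin, then grow the recorded size only when the write ends beyond it), here is a minimal userspace sketch, not part of the patch; the byte array standing in for a page, the plain integer standing in for i_size and the model_write_end() name are all made up for this example.

#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096UL

/*
 * Model of the v9fs_write_end arithmetic: 'page' stands in for one
 * page cache page, 'isize' for inode->i_size. 'pos' is the file
 * position of the write, 'len' the length asked for in write_begin,
 * 'copied' how much the caller actually copied in.
 */
static size_t model_write_end(unsigned char *page, unsigned long long *isize,
			      unsigned long long pos, size_t len, size_t copied)
{
	unsigned long long last_pos = pos + copied;
	size_t from = pos & (DEMO_PAGE_SIZE - 1);	/* offset of the write inside the page */

	if (copied < len)
		memset(page + from + copied, 0, len - copied);	/* zero the uncopied tail */

	if (last_pos > *isize)
		*isize = last_pos;	/* i_size_write() equivalent */
	return copied;
}

int main(void)
{
	unsigned char page[DEMO_PAGE_SIZE];
	unsigned long long isize = 100;

	memset(page, 0xff, sizeof(page));
	/* write_begin asked for 32 bytes at offset 100, but only 20 arrived */
	model_write_end(page, &isize, 100, 32, 20);
	printf("i_size=%llu, first zeroed byte=%d\n", isize, page[100 + 20]);
	return 0;
}

The real function also marks the page uptodate and dirty, unlocks it and drops its reference; only the zeroing and size arithmetic is modeled here.
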
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5e..b6a3b9f7fe4d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 64 * @dentry: dentry in question
65 * 65 *
66 * Only return 1 if our inode is invalid. Only non-synthetic files
67 * (ones without mtime == 0) should be calling this function.
68 *
69 */ 66 */
70
71static int v9fs_cached_dentry_delete(const struct dentry *dentry) 67static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 68{
73 struct inode *inode = dentry->d_inode; 69 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 70 dentry->d_name.name, dentry);
75 dentry);
76 71
77 if(!inode) 72 /* Don't cache negative dentries */
73 if (!dentry->d_inode)
78 return 1; 74 return 1;
79
80 return 0; 75 return 0;
81} 76}
82 77
@@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry)
105 } 100 }
106} 101}
107 102
103static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
104{
105 struct p9_fid *fid;
106 struct inode *inode;
107 struct v9fs_inode *v9inode;
108
109 if (nd->flags & LOOKUP_RCU)
110 return -ECHILD;
111
112 inode = dentry->d_inode;
113 if (!inode)
114 goto out_valid;
115
116 v9inode = V9FS_I(inode);
117 if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
118 int retval;
119 struct v9fs_session_info *v9ses;
120 fid = v9fs_fid_lookup(dentry);
121 if (IS_ERR(fid))
122 return PTR_ERR(fid);
123
124 v9ses = v9fs_inode2v9ses(inode);
125 if (v9fs_proto_dotl(v9ses))
126 retval = v9fs_refresh_inode_dotl(fid, inode);
127 else
128 retval = v9fs_refresh_inode(fid, inode);
129 if (retval <= 0)
130 return retval;
131 }
132out_valid:
133 return 1;
134}
135
108const struct dentry_operations v9fs_cached_dentry_operations = { 136const struct dentry_operations v9fs_cached_dentry_operations = {
137 .d_revalidate = v9fs_lookup_revalidate,
109 .d_delete = v9fs_cached_dentry_delete, 138 .d_delete = v9fs_cached_dentry_delete,
110 .d_release = v9fs_dentry_release, 139 .d_release = v9fs_dentry_release,
111}; 140};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefed..9c2bdda5cd9d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
295 P9_DPRINTK(P9_DEBUG_VFS, 295 P9_DPRINTK(P9_DEBUG_VFS,
296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n", 296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
297 inode, filp, fid ? fid->fid : -1); 297 inode, filp, fid ? fid->fid : -1);
298 filemap_write_and_wait(inode->i_mapping);
299 if (fid) 298 if (fid)
300 p9_client_clunk(fid); 299 p9_client_clunk(fid);
301 return 0; 300 return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c30674396..78bcb97c3425 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
44#include "fid.h" 44#include "fid.h"
45#include "cache.h" 45#include "cache.h"
46 46
47static const struct file_operations v9fs_cached_file_operations; 47static const struct vm_operations_struct v9fs_file_vm_ops;
48static const struct file_operations v9fs_cached_file_operations_dotl;
49 48
50/** 49/**
51 * v9fs_file_open - open a file (or directory) 50 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
57int v9fs_file_open(struct inode *inode, struct file *file) 56int v9fs_file_open(struct inode *inode, struct file *file)
58{ 57{
59 int err; 58 int err;
59 struct v9fs_inode *v9inode;
60 struct v9fs_session_info *v9ses; 60 struct v9fs_session_info *v9ses;
61 struct p9_fid *fid; 61 struct p9_fid *fid;
62 int omode; 62 int omode;
63 63
64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); 64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
65 v9inode = V9FS_I(inode);
65 v9ses = v9fs_inode2v9ses(inode); 66 v9ses = v9fs_inode2v9ses(inode);
66 if (v9fs_proto_dotl(v9ses)) 67 if (v9fs_proto_dotl(v9ses))
67 omode = file->f_flags; 68 omode = file->f_flags;
@@ -89,20 +90,30 @@ int v9fs_file_open(struct inode *inode, struct file *file)
89 } 90 }
90 91
91 file->private_data = fid; 92 file->private_data = fid;
92 if ((fid->qid.version) && (v9ses->cache)) { 93 if (v9ses->cache && !v9inode->writeback_fid) {
93 P9_DPRINTK(P9_DEBUG_VFS, "cached"); 94 /*
94 /* enable cached file options */ 95 * clone a fid and add it to writeback_fid
95 if(file->f_op == &v9fs_file_operations) 96 * we do it during open time instead of
96 file->f_op = &v9fs_cached_file_operations; 97 * page dirty time via write_begin/page_mkwrite
97 else if (file->f_op == &v9fs_file_operations_dotl) 98 * because we want write after unlink usecase
98 file->f_op = &v9fs_cached_file_operations_dotl; 99 * to work.
99 100 */
101 fid = v9fs_writeback_fid(file->f_path.dentry);
102 if (IS_ERR(fid)) {
103 err = PTR_ERR(fid);
104 goto out_error;
105 }
106 v9inode->writeback_fid = (void *) fid;
107 }
100#ifdef CONFIG_9P_FSCACHE 108#ifdef CONFIG_9P_FSCACHE
109 if (v9ses->cache)
101 v9fs_cache_inode_set_cookie(inode, file); 110 v9fs_cache_inode_set_cookie(inode, file);
102#endif 111#endif
103 }
104
105 return 0; 112 return 0;
113out_error:
114 p9_client_clunk(file->private_data);
115 file->private_data = NULL;
116 return err;
106} 117}
107 118
108/** 119/**
@@ -335,25 +346,22 @@ out_err:
335} 346}
336 347
337/** 348/**
338 * v9fs_file_readn - read from a file 349 * v9fs_fid_readn - read from a fid
339 * @filp: file pointer to read 350 * @fid: fid to read
340 * @data: data buffer to read data into 351 * @data: data buffer to read data into
341 * @udata: user data buffer to read data into 352 * @udata: user data buffer to read data into
342 * @count: size of buffer 353 * @count: size of buffer
343 * @offset: offset at which to read data 354 * @offset: offset at which to read data
344 * 355 *
345 */ 356 */
346
347ssize_t 357ssize_t
348v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, 358v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
349 u64 offset) 359 u64 offset)
350{ 360{
351 int n, total, size; 361 int n, total, size;
352 struct p9_fid *fid = filp->private_data;
353 362
354 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, 363 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
355 (long long unsigned) offset, count); 364 (long long unsigned) offset, count);
356
357 n = 0; 365 n = 0;
358 total = 0; 366 total = 0;
359 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; 367 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +387,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
379} 387}
380 388
381/** 389/**
390 * v9fs_file_readn - read from a file
391 * @filp: file pointer to read
392 * @data: data buffer to read data into
393 * @udata: user data buffer to read data into
394 * @count: size of buffer
395 * @offset: offset at which to read data
396 *
397 */
398ssize_t
399v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
400 u64 offset)
401{
402 return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
403}
404
405/**
382 * v9fs_file_read - read from a file 406 * v9fs_file_read - read from a file
383 * @filp: file pointer to read 407 * @filp: file pointer to read
384 * @udata: user data buffer to read data into 408 * @udata: user data buffer to read data into
@@ -410,45 +434,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
410 return ret; 434 return ret;
411} 435}
412 436
413/** 437ssize_t
414 * v9fs_file_write - write to a file 438v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
415 * @filp: file pointer to write 439 const char __user *data, size_t count,
416 * @data: data buffer to write data from 440 loff_t *offset, int invalidate)
417 * @count: size of buffer
418 * @offset: offset at which to write data
419 *
420 */
421
422static ssize_t
423v9fs_file_write(struct file *filp, const char __user * data,
424 size_t count, loff_t * offset)
425{ 441{
426 ssize_t retval;
427 size_t total = 0;
428 int n; 442 int n;
429 struct p9_fid *fid; 443 loff_t i_size;
444 size_t total = 0;
430 struct p9_client *clnt; 445 struct p9_client *clnt;
431 struct inode *inode = filp->f_path.dentry->d_inode;
432 loff_t origin = *offset; 446 loff_t origin = *offset;
433 unsigned long pg_start, pg_end; 447 unsigned long pg_start, pg_end;
434 448
435 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 449 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
436 (int)count, (int)*offset); 450 (int)count, (int)*offset);
437 451
438 fid = filp->private_data;
439 clnt = fid->clnt; 452 clnt = fid->clnt;
440
441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
444
445 retval = -EINVAL;
446 if ((ssize_t) count < 0)
447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
451
452 do { 453 do {
453 n = p9_client_write(fid, NULL, data+total, origin+total, count); 454 n = p9_client_write(fid, NULL, data+total, origin+total, count);
454 if (n <= 0) 455 if (n <= 0)
@@ -457,25 +458,60 @@ v9fs_file_write(struct file *filp, const char __user * data,
457 total += n; 458 total += n;
458 } while (count > 0); 459 } while (count > 0);
459 460
460 if (total > 0) { 461 if (invalidate && (total > 0)) {
461 pg_start = origin >> PAGE_CACHE_SHIFT; 462 pg_start = origin >> PAGE_CACHE_SHIFT;
462 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; 463 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
463 if (inode->i_mapping && inode->i_mapping->nrpages) 464 if (inode->i_mapping && inode->i_mapping->nrpages)
464 invalidate_inode_pages2_range(inode->i_mapping, 465 invalidate_inode_pages2_range(inode->i_mapping,
465 pg_start, pg_end); 466 pg_start, pg_end);
466 *offset += total; 467 *offset += total;
467 i_size_write(inode, i_size_read(inode) + total); 468 i_size = i_size_read(inode);
468 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 469 if (*offset > i_size) {
470 inode_add_bytes(inode, *offset - i_size);
471 i_size_write(inode, *offset);
472 }
469 } 473 }
470
471 if (n < 0) 474 if (n < 0)
472 retval = n; 475 return n;
473 else 476
474 retval = total; 477 return total;
478}
479
480/**
481 * v9fs_file_write - write to a file
482 * @filp: file pointer to write
483 * @data: data buffer to write data from
484 * @count: size of buffer
485 * @offset: offset at which to write data
486 *
487 */
488static ssize_t
489v9fs_file_write(struct file *filp, const char __user * data,
490 size_t count, loff_t *offset)
491{
492 ssize_t retval = 0;
493 loff_t origin = *offset;
494
495
496 retval = generic_write_checks(filp, &origin, &count, 0);
497 if (retval)
498 goto out;
499
500 retval = -EINVAL;
501 if ((ssize_t) count < 0)
502 goto out;
503 retval = 0;
504 if (!count)
505 goto out;
506
507 return v9fs_file_write_internal(filp->f_path.dentry->d_inode,
508 filp->private_data,
509 data, count, offset, 1);
475out: 510out:
476 return retval; 511 return retval;
477} 512}
478 513
514
479static int v9fs_file_fsync(struct file *filp, int datasync) 515static int v9fs_file_fsync(struct file *filp, int datasync)
480{ 516{
481 struct p9_fid *fid; 517 struct p9_fid *fid;
@@ -505,28 +541,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
505 return retval; 541 return retval;
506} 542}
507 543
508static const struct file_operations v9fs_cached_file_operations = { 544static int
545v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
546{
547 int retval;
548
549 retval = generic_file_mmap(file, vma);
550 if (!retval)
551 vma->vm_ops = &v9fs_file_vm_ops;
552
553 return retval;
554}
555
556static int
557v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
558{
559 struct v9fs_inode *v9inode;
560 struct page *page = vmf->page;
561 struct file *filp = vma->vm_file;
562 struct inode *inode = filp->f_path.dentry->d_inode;
563
564
565 P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
566 page, (unsigned long)filp->private_data);
567
568 v9inode = V9FS_I(inode);
569 /* make sure the cache has finished storing the page */
570 v9fs_fscache_wait_on_page_write(inode, page);
571 BUG_ON(!v9inode->writeback_fid);
572 lock_page(page);
573 if (page->mapping != inode->i_mapping)
574 goto out_unlock;
575
576 return VM_FAULT_LOCKED;
577out_unlock:
578 unlock_page(page);
579 return VM_FAULT_NOPAGE;
580}
581
582static ssize_t
583v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
584 loff_t *offsetp)
585{
586 loff_t size, offset;
587 struct inode *inode;
588 struct address_space *mapping;
589
590 offset = *offsetp;
591 mapping = filp->f_mapping;
592 inode = mapping->host;
593 if (!count)
594 return 0;
595 size = i_size_read(inode);
596 if (offset < size)
597 filemap_write_and_wait_range(mapping, offset,
598 offset + count - 1);
599
600 return v9fs_file_read(filp, udata, count, offsetp);
601}
602
603/**
604 * v9fs_cached_file_read - read from a file
605 * @filp: file pointer to read
606 * @udata: user data buffer to read data into
607 * @count: size of buffer
608 * @offset: offset at which to read data
609 *
610 */
611static ssize_t
612v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
613 loff_t *offset)
614{
615 if (filp->f_flags & O_DIRECT)
616 return v9fs_direct_read(filp, data, count, offset);
617 return do_sync_read(filp, data, count, offset);
618}
619
620static ssize_t
621v9fs_direct_write(struct file *filp, const char __user * data,
622 size_t count, loff_t *offsetp)
623{
624 loff_t offset;
625 ssize_t retval;
626 struct inode *inode;
627 struct address_space *mapping;
628
629 offset = *offsetp;
630 mapping = filp->f_mapping;
631 inode = mapping->host;
632 if (!count)
633 return 0;
634
635 mutex_lock(&inode->i_mutex);
636 retval = filemap_write_and_wait_range(mapping, offset,
637 offset + count - 1);
638 if (retval)
639 goto err_out;
640 /*
641 * After a write we want buffered reads to be sure to go to disk to get
642 * the new data. We invalidate clean cached page from the region we're
643 * about to write. We do this *before* the write so that if we fail
644 * here we fall back to buffered write
645 */
646 if (mapping->nrpages) {
647 pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
648 pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
649
650 retval = invalidate_inode_pages2_range(mapping,
651 pg_start, pg_end);
652 /*
653 * If a page can not be invalidated, fall back
654 * to buffered write.
655 */
656 if (retval) {
657 if (retval == -EBUSY)
658 goto buff_write;
659 goto err_out;
660 }
661 }
662 retval = v9fs_file_write(filp, data, count, offsetp);
663err_out:
664 mutex_unlock(&inode->i_mutex);
665 return retval;
666
667buff_write:
668 mutex_unlock(&inode->i_mutex);
669 return do_sync_write(filp, data, count, offsetp);
670}
671
672/**
673 * v9fs_cached_file_write - write to a file
674 * @filp: file pointer to write
675 * @data: data buffer to write data from
676 * @count: size of buffer
677 * @offset: offset at which to write data
678 *
679 */
680static ssize_t
681v9fs_cached_file_write(struct file *filp, const char __user * data,
682 size_t count, loff_t *offset)
683{
684
685 if (filp->f_flags & O_DIRECT)
686 return v9fs_direct_write(filp, data, count, offset);
687 return do_sync_write(filp, data, count, offset);
688}
689
690static const struct vm_operations_struct v9fs_file_vm_ops = {
691 .fault = filemap_fault,
692 .page_mkwrite = v9fs_vm_page_mkwrite,
693};
694
695
696const struct file_operations v9fs_cached_file_operations = {
509 .llseek = generic_file_llseek, 697 .llseek = generic_file_llseek,
510 .read = do_sync_read, 698 .read = v9fs_cached_file_read,
699 .write = v9fs_cached_file_write,
511 .aio_read = generic_file_aio_read, 700 .aio_read = generic_file_aio_read,
512 .write = v9fs_file_write, 701 .aio_write = generic_file_aio_write,
513 .open = v9fs_file_open, 702 .open = v9fs_file_open,
514 .release = v9fs_dir_release, 703 .release = v9fs_dir_release,
515 .lock = v9fs_file_lock, 704 .lock = v9fs_file_lock,
516 .mmap = generic_file_readonly_mmap, 705 .mmap = v9fs_file_mmap,
517 .fsync = v9fs_file_fsync, 706 .fsync = v9fs_file_fsync,
518}; 707};
519 708
520static const struct file_operations v9fs_cached_file_operations_dotl = { 709const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek, 710 .llseek = generic_file_llseek,
522 .read = do_sync_read, 711 .read = v9fs_cached_file_read,
712 .write = v9fs_cached_file_write,
523 .aio_read = generic_file_aio_read, 713 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write, 714 .aio_write = generic_file_aio_write,
525 .open = v9fs_file_open, 715 .open = v9fs_file_open,
526 .release = v9fs_dir_release, 716 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl, 717 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl, 718 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap, 719 .mmap = v9fs_file_mmap,
530 .fsync = v9fs_file_fsync_dotl, 720 .fsync = v9fs_file_fsync_dotl,
531}; 721};
532 722
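
v9fs_file_write_internal, factored out of v9fs_file_write above, loops because a single p9_client_write call may transfer fewer bytes than requested (9P bounds each message's payload). Below is a minimal userspace sketch of the same short-write loop, with write(2) standing in for the 9P call; write_all() is an invented name and this is an illustration of the pattern, not code from the patch.

#include <stddef.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Keep writing until 'count' bytes have been pushed or the sink
 * stops making progress, mirroring the loop structure of
 * v9fs_file_write_internal (which calls p9_client_write instead).
 */
ssize_t write_all(int fd, const char *buf, size_t count)
{
	size_t total = 0;
	ssize_t n = 0;

	while (count > 0) {
		n = write(fd, buf + total, count);
		if (n <= 0)
			break;		/* error or no progress, stop */
		total += n;
		count -= n;
	}
	if (n < 0)
		return -1;		/* report the failure, as the helper does */
	return total;
}

int main(void)
{
	const char msg[] = "short-write demo\n";

	return write_all(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0 ? 1 : 0;
}

The kernel helper additionally invalidates the written range of the page cache and bumps i_size when asked to invalidate; that part depends on struct inode and is not modeled here.
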
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c2..8a2c232f708a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,25 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
203 wstat->extension = NULL; 203 wstat->extension = NULL;
204} 204}
205 205
206#ifdef CONFIG_9P_FSCACHE
207/** 206/**
208 * v9fs_alloc_inode - helper function to allocate an inode 207 * v9fs_alloc_inode - helper function to allocate an inode
209 * This callback is executed before setting up the inode so that we
210 * can associate a vcookie with each inode.
211 * 208 *
212 */ 209 */
213
214struct inode *v9fs_alloc_inode(struct super_block *sb) 210struct inode *v9fs_alloc_inode(struct super_block *sb)
215{ 211{
216 struct v9fs_cookie *vcookie; 212 struct v9fs_inode *v9inode;
217 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, 213 v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
218 GFP_KERNEL); 214 GFP_KERNEL);
219 if (!vcookie) 215 if (!v9inode)
220 return NULL; 216 return NULL;
221 217#ifdef CONFIG_9P_FSCACHE
222 vcookie->fscache = NULL; 218 v9inode->fscache = NULL;
223 vcookie->qid = NULL; 219 v9inode->fscache_key = NULL;
224 spin_lock_init(&vcookie->lock); 220 spin_lock_init(&v9inode->fscache_lock);
225 return &vcookie->inode; 221#endif
222 v9inode->writeback_fid = NULL;
223 v9inode->cache_validity = 0;
224 return &v9inode->vfs_inode;
226} 225}
227 226
228/** 227/**
@@ -234,35 +233,18 @@ static void v9fs_i_callback(struct rcu_head *head)
234{ 233{
235 struct inode *inode = container_of(head, struct inode, i_rcu); 234 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry); 235 INIT_LIST_HEAD(&inode->i_dentry);
237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 236 kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
238} 237}
239 238
240void v9fs_destroy_inode(struct inode *inode) 239void v9fs_destroy_inode(struct inode *inode)
241{ 240{
242 call_rcu(&inode->i_rcu, v9fs_i_callback); 241 call_rcu(&inode->i_rcu, v9fs_i_callback);
243} 242}
244#endif
245 243
246/** 244int v9fs_init_inode(struct v9fs_session_info *v9ses,
247 * v9fs_get_inode - helper function to setup an inode 245 struct inode *inode, int mode)
248 * @sb: superblock
249 * @mode: mode to setup inode with
250 *
251 */
252
253struct inode *v9fs_get_inode(struct super_block *sb, int mode)
254{ 246{
255 int err; 247 int err = 0;
256 struct inode *inode;
257 struct v9fs_session_info *v9ses = sb->s_fs_info;
258
259 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
260
261 inode = new_inode(sb);
262 if (!inode) {
263 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
264 return ERR_PTR(-ENOMEM);
265 }
266 248
267 inode_init_owner(inode, NULL, mode); 249 inode_init_owner(inode, NULL, mode);
268 inode->i_blocks = 0; 250 inode->i_blocks = 0;
@@ -292,14 +274,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
292 case S_IFREG: 274 case S_IFREG:
293 if (v9fs_proto_dotl(v9ses)) { 275 if (v9fs_proto_dotl(v9ses)) {
294 inode->i_op = &v9fs_file_inode_operations_dotl; 276 inode->i_op = &v9fs_file_inode_operations_dotl;
295 inode->i_fop = &v9fs_file_operations_dotl; 277 if (v9ses->cache)
278 inode->i_fop =
279 &v9fs_cached_file_operations_dotl;
280 else
281 inode->i_fop = &v9fs_file_operations_dotl;
296 } else { 282 } else {
297 inode->i_op = &v9fs_file_inode_operations; 283 inode->i_op = &v9fs_file_inode_operations;
298 inode->i_fop = &v9fs_file_operations; 284 if (v9ses->cache)
285 inode->i_fop = &v9fs_cached_file_operations;
286 else
287 inode->i_fop = &v9fs_file_operations;
299 } 288 }
300 289
301 break; 290 break;
302
303 case S_IFLNK: 291 case S_IFLNK:
304 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { 292 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
305 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " 293 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +323,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
335 err = -EINVAL; 323 err = -EINVAL;
336 goto error; 324 goto error;
337 } 325 }
326error:
327 return err;
338 328
339 return inode; 329}
340 330
341error: 331/**
342 iput(inode); 332 * v9fs_get_inode - helper function to setup an inode
343 return ERR_PTR(err); 333 * @sb: superblock
334 * @mode: mode to setup inode with
335 *
336 */
337
338struct inode *v9fs_get_inode(struct super_block *sb, int mode)
339{
340 int err;
341 struct inode *inode;
342 struct v9fs_session_info *v9ses = sb->s_fs_info;
343
344 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
345
346 inode = new_inode(sb);
347 if (!inode) {
348 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
349 return ERR_PTR(-ENOMEM);
350 }
351 err = v9fs_init_inode(v9ses, inode, mode);
352 if (err) {
353 iput(inode);
354 return ERR_PTR(err);
355 }
356 return inode;
344} 357}
345 358
346/* 359/*
@@ -403,6 +416,8 @@ error:
403 */ 416 */
404void v9fs_evict_inode(struct inode *inode) 417void v9fs_evict_inode(struct inode *inode)
405{ 418{
419 struct v9fs_inode *v9inode = V9FS_I(inode);
420
406 truncate_inode_pages(inode->i_mapping, 0); 421 truncate_inode_pages(inode->i_mapping, 0);
407 end_writeback(inode); 422 end_writeback(inode);
408 filemap_fdatawrite(inode->i_mapping); 423 filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +425,67 @@ void v9fs_evict_inode(struct inode *inode)
410#ifdef CONFIG_9P_FSCACHE 425#ifdef CONFIG_9P_FSCACHE
411 v9fs_cache_inode_put_cookie(inode); 426 v9fs_cache_inode_put_cookie(inode);
412#endif 427#endif
428 /* clunk the fid stashed in writeback_fid */
429 if (v9inode->writeback_fid) {
430 p9_client_clunk(v9inode->writeback_fid);
431 v9inode->writeback_fid = NULL;
432 }
413} 433}
414 434
415struct inode * 435static struct inode *v9fs_qid_iget(struct super_block *sb,
416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 436 struct p9_qid *qid,
417 struct super_block *sb) 437 struct p9_wstat *st)
418{ 438{
419 int err, umode; 439 int retval, umode;
420 struct inode *ret = NULL; 440 unsigned long i_ino;
421 struct p9_wstat *st; 441 struct inode *inode;
422 442 struct v9fs_session_info *v9ses = sb->s_fs_info;
423 st = p9_client_stat(fid);
424 if (IS_ERR(st))
425 return ERR_CAST(st);
426 443
444 i_ino = v9fs_qid2ino(qid);
445 inode = iget_locked(sb, i_ino);
446 if (!inode)
447 return ERR_PTR(-ENOMEM);
448 if (!(inode->i_state & I_NEW))
449 return inode;
450 /*
451 * initialize the inode with the stat info
452 * FIXME!! we may need support for stale inodes
453 * later.
454 */
427 umode = p9mode2unixmode(v9ses, st->mode); 455 umode = p9mode2unixmode(v9ses, st->mode);
428 ret = v9fs_get_inode(sb, umode); 456 retval = v9fs_init_inode(v9ses, inode, umode);
429 if (IS_ERR(ret)) { 457 if (retval)
430 err = PTR_ERR(ret);
431 goto error; 458 goto error;
432 }
433
434 v9fs_stat2inode(st, ret, sb);
435 ret->i_ino = v9fs_qid2ino(&st->qid);
436 459
460 v9fs_stat2inode(st, inode, sb);
437#ifdef CONFIG_9P_FSCACHE 461#ifdef CONFIG_9P_FSCACHE
438 v9fs_vcookie_set_qid(ret, &st->qid); 462 v9fs_fscache_set_key(inode, &st->qid);
439 v9fs_cache_inode_get_cookie(ret); 463 v9fs_cache_inode_get_cookie(inode);
440#endif 464#endif
441 p9stat_free(st); 465 unlock_new_inode(inode);
442 kfree(st); 466 return inode;
443 return ret;
444error: 467error:
468 unlock_new_inode(inode);
469 iput(inode);
470 return ERR_PTR(retval);
471
472}
473
474struct inode *
475v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
476 struct super_block *sb)
477{
478 struct p9_wstat *st;
479 struct inode *inode = NULL;
480
481 st = p9_client_stat(fid);
482 if (IS_ERR(st))
483 return ERR_CAST(st);
484
485 inode = v9fs_qid_iget(sb, &st->qid, st);
445 p9stat_free(st); 486 p9stat_free(st);
446 kfree(st); 487 kfree(st);
447 return ERR_PTR(err); 488 return inode;
448} 489}
449 490
450/** 491/**
@@ -458,8 +499,8 @@ error:
458static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 499static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
459{ 500{
460 int retval; 501 int retval;
461 struct inode *file_inode;
462 struct p9_fid *v9fid; 502 struct p9_fid *v9fid;
503 struct inode *file_inode;
463 504
464 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 505 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
465 rmdir); 506 rmdir);
@@ -470,8 +511,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
470 return PTR_ERR(v9fid); 511 return PTR_ERR(v9fid);
471 512
472 retval = p9_client_remove(v9fid); 513 retval = p9_client_remove(v9fid);
473 if (!retval) 514 if (!retval) {
474 drop_nlink(file_inode); 515 /*
516 * directories on unlink should have zero
517 * link count
518 */
519 if (rmdir) {
520 clear_nlink(file_inode);
521 drop_nlink(dir);
522 } else
523 drop_nlink(file_inode);
524
525 v9fs_invalidate_inode_attr(file_inode);
526 v9fs_invalidate_inode_attr(dir);
527 }
475 return retval; 528 return retval;
476} 529}
477 530
@@ -531,7 +584,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
531 } 584 }
532 585
533 /* instantiate inode and assign the unopened fid to the dentry */ 586 /* instantiate inode and assign the unopened fid to the dentry */
534 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 587 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
535 if (IS_ERR(inode)) { 588 if (IS_ERR(inode)) {
536 err = PTR_ERR(inode); 589 err = PTR_ERR(inode);
537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 590 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +623,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
570 int err; 623 int err;
571 u32 perm; 624 u32 perm;
572 int flags; 625 int flags;
573 struct v9fs_session_info *v9ses;
574 struct p9_fid *fid;
575 struct file *filp; 626 struct file *filp;
627 struct v9fs_inode *v9inode;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid, *inode_fid;
576 630
577 err = 0; 631 err = 0;
578 fid = NULL; 632 fid = NULL;
@@ -592,8 +646,25 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
592 goto error; 646 goto error;
593 } 647 }
594 648
649 v9fs_invalidate_inode_attr(dir);
595 /* if we are opening a file, assign the open fid to the file */ 650 /* if we are opening a file, assign the open fid to the file */
596 if (nd && nd->flags & LOOKUP_OPEN) { 651 if (nd && nd->flags & LOOKUP_OPEN) {
652 v9inode = V9FS_I(dentry->d_inode);
653 if (v9ses->cache && !v9inode->writeback_fid) {
654 /*
655 * clone a fid and add it to writeback_fid
656 * we do it during open time instead of
657 * page dirty time via write_begin/page_mkwrite
658 * because we want write after unlink usecase
659 * to work.
660 */
661 inode_fid = v9fs_writeback_fid(dentry);
662 if (IS_ERR(inode_fid)) {
663 err = PTR_ERR(inode_fid);
664 goto error;
665 }
666 v9inode->writeback_fid = (void *) inode_fid;
667 }
597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 668 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
598 if (IS_ERR(filp)) { 669 if (IS_ERR(filp)) {
599 err = PTR_ERR(filp); 670 err = PTR_ERR(filp);
@@ -601,6 +672,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 } 672 }
602 673
603 filp->private_data = fid; 674 filp->private_data = fid;
675#ifdef CONFIG_9P_FSCACHE
676 if (v9ses->cache)
677 v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
678#endif
604 } else 679 } else
605 p9_client_clunk(fid); 680 p9_client_clunk(fid);
606 681
@@ -625,8 +700,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
625{ 700{
626 int err; 701 int err;
627 u32 perm; 702 u32 perm;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid; 703 struct p9_fid *fid;
704 struct v9fs_session_info *v9ses;
630 705
631 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 706 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
632 err = 0; 707 err = 0;
@@ -636,6 +711,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
636 if (IS_ERR(fid)) { 711 if (IS_ERR(fid)) {
637 err = PTR_ERR(fid); 712 err = PTR_ERR(fid);
638 fid = NULL; 713 fid = NULL;
714 } else {
715 inc_nlink(dir);
716 v9fs_invalidate_inode_attr(dir);
639 } 717 }
640 718
641 if (fid) 719 if (fid)
@@ -687,7 +765,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
687 return ERR_PTR(result); 765 return ERR_PTR(result);
688 } 766 }
689 767
690 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 768 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
691 if (IS_ERR(inode)) { 769 if (IS_ERR(inode)) {
692 result = PTR_ERR(inode); 770 result = PTR_ERR(inode);
693 inode = NULL; 771 inode = NULL;
@@ -747,17 +825,19 @@ int
747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 825v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
748 struct inode *new_dir, struct dentry *new_dentry) 826 struct inode *new_dir, struct dentry *new_dentry)
749{ 827{
828 int retval;
750 struct inode *old_inode; 829 struct inode *old_inode;
830 struct inode *new_inode;
751 struct v9fs_session_info *v9ses; 831 struct v9fs_session_info *v9ses;
752 struct p9_fid *oldfid; 832 struct p9_fid *oldfid;
753 struct p9_fid *olddirfid; 833 struct p9_fid *olddirfid;
754 struct p9_fid *newdirfid; 834 struct p9_fid *newdirfid;
755 struct p9_wstat wstat; 835 struct p9_wstat wstat;
756 int retval;
757 836
758 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 837 P9_DPRINTK(P9_DEBUG_VFS, "\n");
759 retval = 0; 838 retval = 0;
760 old_inode = old_dentry->d_inode; 839 old_inode = old_dentry->d_inode;
840 new_inode = new_dentry->d_inode;
761 v9ses = v9fs_inode2v9ses(old_inode); 841 v9ses = v9fs_inode2v9ses(old_inode);
762 oldfid = v9fs_fid_lookup(old_dentry); 842 oldfid = v9fs_fid_lookup(old_dentry);
763 if (IS_ERR(oldfid)) 843 if (IS_ERR(oldfid))
@@ -798,9 +878,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
798 retval = p9_client_wstat(oldfid, &wstat); 878 retval = p9_client_wstat(oldfid, &wstat);
799 879
800clunk_newdir: 880clunk_newdir:
801 if (!retval) 881 if (!retval) {
882 if (new_inode) {
883 if (S_ISDIR(new_inode->i_mode))
884 clear_nlink(new_inode);
885 else
886 drop_nlink(new_inode);
887 /*
888 * Work around vfs rename rehash bug with
889 * FS_RENAME_DOES_D_MOVE
890 */
891 v9fs_invalidate_inode_attr(new_inode);
892 }
893 if (S_ISDIR(old_inode->i_mode)) {
894 if (!new_inode)
895 inc_nlink(new_dir);
896 drop_nlink(old_dir);
897 }
898 v9fs_invalidate_inode_attr(old_inode);
899 v9fs_invalidate_inode_attr(old_dir);
900 v9fs_invalidate_inode_attr(new_dir);
901
802 /* successful rename */ 902 /* successful rename */
803 d_move(old_dentry, new_dentry); 903 d_move(old_dentry, new_dentry);
904 }
804 up_write(&v9ses->rename_sem); 905 up_write(&v9ses->rename_sem);
805 p9_client_clunk(newdirfid); 906 p9_client_clunk(newdirfid);
806 907
@@ -831,9 +932,10 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
831 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 932 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
832 err = -EPERM; 933 err = -EPERM;
833 v9ses = v9fs_inode2v9ses(dentry->d_inode); 934 v9ses = v9fs_inode2v9ses(dentry->d_inode);
834 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 935 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
835 return simple_getattr(mnt, dentry, stat); 936 generic_fillattr(dentry->d_inode, stat);
836 937 return 0;
938 }
837 fid = v9fs_fid_lookup(dentry); 939 fid = v9fs_fid_lookup(dentry);
838 if (IS_ERR(fid)) 940 if (IS_ERR(fid))
839 return PTR_ERR(fid); 941 return PTR_ERR(fid);
@@ -891,17 +993,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
891 if (iattr->ia_valid & ATTR_GID) 993 if (iattr->ia_valid & ATTR_GID)
892 wstat.n_gid = iattr->ia_gid; 994 wstat.n_gid = iattr->ia_gid;
893 } 995 }
894
895 retval = p9_client_wstat(fid, &wstat);
896 if (retval < 0)
897 return retval;
898
899 if ((iattr->ia_valid & ATTR_SIZE) && 996 if ((iattr->ia_valid & ATTR_SIZE) &&
900 iattr->ia_size != i_size_read(dentry->d_inode)) { 997 iattr->ia_size != i_size_read(dentry->d_inode)) {
901 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 998 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
902 if (retval) 999 if (retval)
903 return retval; 1000 return retval;
904 } 1001 }
1002 /* Write all dirty data */
1003 if (S_ISREG(dentry->d_inode->i_mode))
1004 filemap_write_and_wait(dentry->d_inode->i_mapping);
1005
1006 retval = p9_client_wstat(fid, &wstat);
1007 if (retval < 0)
1008 return retval;
1009 v9fs_invalidate_inode_attr(dentry->d_inode);
905 1010
906 setattr_copy(dentry->d_inode, iattr); 1011 setattr_copy(dentry->d_inode, iattr);
907 mark_inode_dirty(dentry->d_inode); 1012 mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1029,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
924 char tag_name[14]; 1029 char tag_name[14];
925 unsigned int i_nlink; 1030 unsigned int i_nlink;
926 struct v9fs_session_info *v9ses = sb->s_fs_info; 1031 struct v9fs_session_info *v9ses = sb->s_fs_info;
1032 struct v9fs_inode *v9inode = V9FS_I(inode);
927 1033
928 inode->i_nlink = 1; 1034 inode->i_nlink = 1;
929 1035
@@ -983,6 +1089,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
983 1089
984 /* not real number of blocks, but 512 byte ones ... */ 1090 /* not real number of blocks, but 512 byte ones ... */
985 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 1091 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
1092 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
986} 1093}
987 1094
988/** 1095/**
@@ -1115,8 +1222,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1115 int mode, const char *extension) 1222 int mode, const char *extension)
1116{ 1223{
1117 u32 perm; 1224 u32 perm;
1118 struct v9fs_session_info *v9ses;
1119 struct p9_fid *fid; 1225 struct p9_fid *fid;
1226 struct v9fs_session_info *v9ses;
1120 1227
1121 v9ses = v9fs_inode2v9ses(dir); 1228 v9ses = v9fs_inode2v9ses(dir);
1122 if (!v9fs_proto_dotu(v9ses)) { 1229 if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1237,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1130 if (IS_ERR(fid)) 1237 if (IS_ERR(fid))
1131 return PTR_ERR(fid); 1238 return PTR_ERR(fid);
1132 1239
1240 v9fs_invalidate_inode_attr(dir);
1133 p9_client_clunk(fid); 1241 p9_client_clunk(fid);
1134 return 0; 1242 return 0;
1135} 1243}
@@ -1166,8 +1274,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1166 struct dentry *dentry) 1274 struct dentry *dentry)
1167{ 1275{
1168 int retval; 1276 int retval;
1169 struct p9_fid *oldfid;
1170 char *name; 1277 char *name;
1278 struct p9_fid *oldfid;
1171 1279
1172 P9_DPRINTK(P9_DEBUG_VFS, 1280 P9_DPRINTK(P9_DEBUG_VFS,
1173 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1281 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1294,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1186 sprintf(name, "%d\n", oldfid->fid); 1294 sprintf(name, "%d\n", oldfid->fid);
1187 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); 1295 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1188 __putname(name); 1296 __putname(name);
1189 1297 if (!retval) {
1298 v9fs_refresh_inode(oldfid, old_dentry->d_inode);
1299 v9fs_invalidate_inode_attr(dir);
1300 }
1190clunk_fid: 1301clunk_fid:
1191 p9_client_clunk(oldfid); 1302 p9_client_clunk(oldfid);
1192 return retval; 1303 return retval;
@@ -1237,6 +1348,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1237 return retval; 1348 return retval;
1238} 1349}
1239 1350
1351int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
1352{
1353 loff_t i_size;
1354 struct p9_wstat *st;
1355 struct v9fs_session_info *v9ses;
1356
1357 v9ses = v9fs_inode2v9ses(inode);
1358 st = p9_client_stat(fid);
1359 if (IS_ERR(st))
1360 return PTR_ERR(st);
1361
1362 spin_lock(&inode->i_lock);
1363 /*
1364 * We don't want to refresh inode->i_size,
1365 * because we may have cached data
1366 */
1367 i_size = inode->i_size;
1368 v9fs_stat2inode(st, inode, inode->i_sb);
1369 if (v9ses->cache)
1370 inode->i_size = i_size;
1371 spin_unlock(&inode->i_lock);
1372 p9stat_free(st);
1373 kfree(st);
1374 return 0;
1375}
1376
1240static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1377static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1241 .create = v9fs_vfs_create, 1378 .create = v9fs_vfs_create,
1242 .lookup = v9fs_vfs_lookup, 1379 .lookup = v9fs_vfs_lookup,
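
v9fs_qid_iget above replaces the unconditional inode allocation with iget_locked keyed on the qid-derived inode number, so repeated lookups of the same server object share one in-core inode and only a freshly inserted (I_NEW) inode is initialized from the stat. The following toy userspace model shows just that lookup-or-create shape; toy_iget(), the linked-list cache and the field names are inventions for illustration, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy stand-in for the inode cache keyed by the qid-derived number.
 * toy_iget() either finds an existing entry or inserts a new one and
 * reports it as new, mimicking iget_locked()'s I_NEW contract: only
 * the caller that sees is_new == 1 fills the entry in from the stat.
 */
struct toy_inode {
	unsigned long ino;
	long long size;
	struct toy_inode *next;
};

static struct toy_inode *cache;

static struct toy_inode *toy_iget(unsigned long ino, int *is_new)
{
	struct toy_inode *i;

	for (i = cache; i; i = i->next) {
		if (i->ino == ino) {
			*is_new = 0;
			return i;	/* cache hit: reuse the same object */
		}
	}
	i = calloc(1, sizeof(*i));
	if (!i)
		return NULL;
	i->ino = ino;
	i->next = cache;
	cache = i;
	*is_new = 1;			/* caller must initialize it */
	return i;
}

int main(void)
{
	int is_new;
	struct toy_inode *a, *b;

	a = toy_iget(42, &is_new);
	if (!a)
		return 1;
	if (is_new)
		a->size = 4096;		/* "v9fs_init_inode + v9fs_stat2inode" step */

	b = toy_iget(42, &is_new);	/* second lookup of the same qid */
	printf("same object: %d, size: %lld\n", a == b, b->size);
	return 0;
}

The error path in the patch (unlock_new_inode plus iput) has no userspace analogue and is left out.
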
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace4..67c138e94feb 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
86 return dentry; 86 return dentry;
87} 87}
88 88
89static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
90 struct p9_qid *qid,
91 struct p9_fid *fid,
92 struct p9_stat_dotl *st)
93{
94 int retval;
95 unsigned long i_ino;
96 struct inode *inode;
97 struct v9fs_session_info *v9ses = sb->s_fs_info;
98
99 i_ino = v9fs_qid2ino(qid);
100 inode = iget_locked(sb, i_ino);
101 if (!inode)
102 return ERR_PTR(-ENOMEM);
103 if (!(inode->i_state & I_NEW))
104 return inode;
105 /*
106 * initialize the inode with the stat info
107 * FIXME!! we may need support for stale inodes
108 * later.
109 */
110 retval = v9fs_init_inode(v9ses, inode, st->st_mode);
111 if (retval)
112 goto error;
113
114 v9fs_stat2inode_dotl(st, inode);
115#ifdef CONFIG_9P_FSCACHE
116 v9fs_fscache_set_key(inode, &st->qid);
117 v9fs_cache_inode_get_cookie(inode);
118#endif
119 retval = v9fs_get_acl(inode, fid);
120 if (retval)
121 goto error;
122
123 unlock_new_inode(inode);
124 return inode;
125error:
126 unlock_new_inode(inode);
127 iput(inode);
128 return ERR_PTR(retval);
129
130}
131
89struct inode * 132struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, 133v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb) 134 struct super_block *sb)
92{ 135{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st; 136 struct p9_stat_dotl *st;
137 struct inode *inode = NULL;
96 138
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 139 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st)) 140 if (IS_ERR(st))
99 return ERR_CAST(st); 141 return ERR_CAST(st);
100 142
101 ret = v9fs_get_inode(sb, st->st_mode); 143 inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st);
119 return ret;
120error:
121 kfree(st); 144 kfree(st);
122 return ERR_PTR(err); 145 return inode;
123} 146}
124 147
125/** 148/**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd) 159 struct nameidata *nd)
137{ 160{
138 int err = 0; 161 int err = 0;
139 char *name = NULL;
140 gid_t gid; 162 gid_t gid;
141 int flags; 163 int flags;
142 mode_t mode; 164 mode_t mode;
143 struct v9fs_session_info *v9ses; 165 char *name = NULL;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp; 166 struct file *filp;
147 struct p9_qid qid; 167 struct p9_qid qid;
148 struct inode *inode; 168 struct inode *inode;
169 struct p9_fid *fid = NULL;
170 struct v9fs_inode *v9inode;
171 struct p9_fid *dfid, *ofid, *inode_fid;
172 struct v9fs_session_info *v9ses;
149 struct posix_acl *pacl = NULL, *dacl = NULL; 173 struct posix_acl *pacl = NULL, *dacl = NULL;
150 174
151 v9ses = v9fs_inode2v9ses(dir); 175 v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
196 err); 220 err);
197 goto error; 221 goto error;
198 } 222 }
223 v9fs_invalidate_inode_attr(dir);
199 224
200 /* instantiate inode and assign the unopened fid to the dentry */ 225 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1); 226 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
205 fid = NULL; 230 fid = NULL;
206 goto error; 231 goto error;
207 } 232 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 233 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) { 234 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode); 235 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 236 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,22 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
219 /* Now set the ACL based on the default value */ 244 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl); 245 v9fs_set_create_acl(dentry, dacl, pacl);
221 246
247 v9inode = V9FS_I(inode);
248 if (v9ses->cache && !v9inode->writeback_fid) {
249 /*
250 * clone a fid and add it to writeback_fid
251 * we do it during open time instead of
252 * page dirty time via write_begin/page_mkwrite
253 * because we want write after unlink usecase
254 * to work.
255 */
256 inode_fid = v9fs_writeback_fid(dentry);
257 if (IS_ERR(inode_fid)) {
258 err = PTR_ERR(inode_fid);
259 goto error;
260 }
261 v9inode->writeback_fid = (void *) inode_fid;
262 }
222 /* Since we are opening a file, assign the open fid to the file */ 263 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 264 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) { 265 if (IS_ERR(filp)) {
@@ -226,6 +267,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
226 return PTR_ERR(filp); 267 return PTR_ERR(filp);
227 } 268 }
228 filp->private_data = ofid; 269 filp->private_data = ofid;
270#ifdef CONFIG_9P_FSCACHE
271 if (v9ses->cache)
272 v9fs_cache_inode_set_cookie(inode, filp);
273#endif
229 return 0; 274 return 0;
230 275
231error: 276error:
@@ -300,7 +345,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
300 goto error; 345 goto error;
301 } 346 }
302 347
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 348 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) { 349 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode); 350 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 351 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +372,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
327 } 372 }
328 /* Now set the ACL based on the default value */ 373 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl); 374 v9fs_set_create_acl(dentry, dacl, pacl);
330 375 inc_nlink(dir);
376 v9fs_invalidate_inode_attr(dir);
331error: 377error:
332 if (fid) 378 if (fid)
333 p9_client_clunk(fid); 379 p9_client_clunk(fid);
@@ -346,9 +392,10 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 392 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM; 393 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode); 394 v9ses = v9fs_inode2v9ses(dentry->d_inode);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 395 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
350 return simple_getattr(mnt, dentry, stat); 396 generic_fillattr(dentry->d_inode, stat);
351 397 return 0;
398 }
352 fid = v9fs_fid_lookup(dentry); 399 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid)) 400 if (IS_ERR(fid))
354 return PTR_ERR(fid); 401 return PTR_ERR(fid);
@@ -406,16 +453,20 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
406 if (IS_ERR(fid)) 453 if (IS_ERR(fid))
407 return PTR_ERR(fid); 454 return PTR_ERR(fid);
408 455
409 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0)
411 return retval;
412
413 if ((iattr->ia_valid & ATTR_SIZE) && 456 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) { 457 iattr->ia_size != i_size_read(dentry->d_inode)) {
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 458 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
416 if (retval) 459 if (retval)
417 return retval; 460 return retval;
418 } 461 }
462 /* Write all dirty data */
463 if (S_ISREG(dentry->d_inode->i_mode))
464 filemap_write_and_wait(dentry->d_inode->i_mapping);
465
466 retval = p9_client_setattr(fid, &p9attr);
467 if (retval < 0)
468 return retval;
469 v9fs_invalidate_inode_attr(dentry->d_inode);
419 470
420 setattr_copy(dentry->d_inode, iattr); 471 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode); 472 mark_inode_dirty(dentry->d_inode);
@@ -439,6 +490,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
439void 490void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) 491v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{ 492{
493 struct v9fs_inode *v9inode = V9FS_I(inode);
442 494
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { 495 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec; 496 inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +549,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION 549 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them. 550 * because the inode structure does not have fields for them.
499 */ 551 */
552 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
500} 553}
501 554
502static int 555static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, 556v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname) 557 const char *symname)
505{ 558{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err; 559 int err;
513 gid_t gid; 560 gid_t gid;
561 char *name;
562 struct p9_qid qid;
563 struct inode *inode;
564 struct p9_fid *dfid;
565 struct p9_fid *fid = NULL;
566 struct v9fs_session_info *v9ses;
514 567
515 name = (char *) dentry->d_name.name; 568 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 569 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +587,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
534 goto error; 587 goto error;
535 } 588 }
536 589
590 v9fs_invalidate_inode_attr(dir);
537 if (v9ses->cache) { 591 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */ 592 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1); 593 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +600,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
546 } 600 }
547 601
548 /* instantiate inode and assign the unopened fid to dentry */ 602 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 603 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) { 604 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode); 605 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 606 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +642,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry) 642 struct dentry *dentry)
589{ 643{
590 int err; 644 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name; 645 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry; 646 struct dentry *dir_dentry;
647 struct p9_fid *dfid, *oldfid;
648 struct v9fs_session_info *v9ses;
595 649
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 650 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name, 651 dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +670,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
616 return err; 670 return err;
617 } 671 }
618 672
673 v9fs_invalidate_inode_attr(dir);
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 674 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */ 675 /* Get the latest stat info from server. */
621 struct p9_fid *fid; 676 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry); 677 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid)) 678 if (IS_ERR(fid))
626 return PTR_ERR(fid); 679 return PTR_ERR(fid);
627 680
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 681 v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 } 682 }
683 ihold(old_dentry->d_inode);
642 d_instantiate(dentry, old_dentry->d_inode); 684 d_instantiate(dentry, old_dentry->d_inode);
643 685
644 return err; 686 return err;
@@ -657,12 +699,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev) 699 dev_t rdev)
658{ 700{
659 int err; 701 int err;
702 gid_t gid;
660 char *name; 703 char *name;
661 mode_t mode; 704 mode_t mode;
662 struct v9fs_session_info *v9ses; 705 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL; 706 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode; 707 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid; 708 struct p9_qid qid;
667 struct dentry *dir_dentry; 709 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL; 710 struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +741,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
699 if (err < 0) 741 if (err < 0)
700 goto error; 742 goto error;
701 743
744 v9fs_invalidate_inode_attr(dir);
702 /* instantiate inode and assign the unopened fid to the dentry */ 745 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 746 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1); 747 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +753,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
710 goto error; 753 goto error;
711 } 754 }
712 755
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 756 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) { 757 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode); 758 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 759 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -782,6 +825,31 @@ ndset:
782 return NULL; 825 return NULL;
783} 826}
784 827
828int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
829{
830 loff_t i_size;
831 struct p9_stat_dotl *st;
832 struct v9fs_session_info *v9ses;
833
834 v9ses = v9fs_inode2v9ses(inode);
835 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
836 if (IS_ERR(st))
837 return PTR_ERR(st);
838
839 spin_lock(&inode->i_lock);
840 /*
841 * We don't want to refresh inode->i_size,
842 * because we may have cached data
843 */
844 i_size = inode->i_size;
845 v9fs_stat2inode_dotl(st, inode);
846 if (v9ses->cache)
847 inode->i_size = i_size;
848 spin_unlock(&inode->i_lock);
849 kfree(st);
850 return 0;
851}
852
785const struct inode_operations v9fs_dir_inode_operations_dotl = { 853const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl, 854 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup, 855 .lookup = v9fs_vfs_lookup,
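
v9fs_refresh_inode_dotl above, like v9fs_refresh_inode in vfs_inode.c, re-reads attributes from the server but deliberately keeps the locally cached i_size when caching is enabled, because dirty page cache data can make the client's size newer than the server's. A compact userspace sketch of that save-and-restore step follows; struct demo_inode, refresh_attrs() and the struct copy standing in for v9fs_stat2inode_dotl() are assumptions made for this example.

#include <stdio.h>

struct demo_inode {
	long long i_size;
	long long i_mtime;
	/* ... other attributes ... */
};

/*
 * Model of v9fs_refresh_inode{,_dotl}: copy fresh attributes from the
 * server's stat, but when caching is on put the locally known size
 * back, because not-yet-written-back page cache data may extend past
 * what the server has seen.
 */
static void refresh_attrs(struct demo_inode *inode,
			  const struct demo_inode *server_stat, int caching)
{
	long long cached_size = inode->i_size;

	*inode = *server_stat;		/* stands in for v9fs_stat2inode_dotl() */
	if (caching)
		inode->i_size = cached_size;
}

int main(void)
{
	struct demo_inode ino = { .i_size = 8192, .i_mtime = 100 };
	struct demo_inode st  = { .i_size = 4096, .i_mtime = 200 };

	refresh_attrs(&ino, &st, 1);
	printf("size=%lld mtime=%lld\n", ino.i_size, ino.i_mtime);	/* 8192 200 */
	return 0;
}

The real helpers also take inode->i_lock around the update and free the stat buffer afterwards; those details are omitted.
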
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b8131..09fd08d1606f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
86 } else 86 } else
87 sb->s_op = &v9fs_super_ops; 87 sb->s_op = &v9fs_super_ops;
88 sb->s_bdi = &v9ses->bdi; 88 sb->s_bdi = &v9ses->bdi;
89 if (v9ses->cache)
90 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
89 91
90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
91 MS_NOATIME; 93 if (!v9ses->cache)
94 sb->s_flags |= MS_SYNCHRONOUS;
92 95
93#ifdef CONFIG_9P_FS_POSIX_ACL 96#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) 97 if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
95 sb->s_flags |= MS_POSIXACL; 98 sb->s_flags |= MS_POSIXACL;
96#endif 99#endif
97 100
@@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
151 retval = PTR_ERR(inode); 154 retval = PTR_ERR(inode);
152 goto release_sb; 155 goto release_sb;
153 } 156 }
154
155 root = d_alloc_root(inode); 157 root = d_alloc_root(inode);
156 if (!root) { 158 if (!root) {
157 iput(inode); 159 iput(inode);
@@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
166 retval = PTR_ERR(st); 168 retval = PTR_ERR(st);
167 goto release_sb; 169 goto release_sb;
168 } 170 }
169 171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
170 v9fs_stat2inode_dotl(st, root->d_inode); 172 v9fs_stat2inode_dotl(st, root->d_inode);
171 kfree(st); 173 kfree(st);
172 } else { 174 } else {
@@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
183 p9stat_free(st); 185 p9stat_free(st);
184 kfree(st); 186 kfree(st);
185 } 187 }
188 v9fs_fid_add(root, fid);
186 retval = v9fs_get_acl(inode, fid); 189 retval = v9fs_get_acl(inode, fid);
187 if (retval) 190 if (retval)
188 goto release_sb; 191 goto release_sb;
189 v9fs_fid_add(root, fid); 192 /*
193 * Add the root fid to session info. This is used
194 * for file system sync. We want a cloned fid here
195 * so that we can do a sync_filesystem after a
196 * shrink_dcache_for_umount
197 */
198 v9ses->root_fid = v9fs_fid_clone(root);
199 if (IS_ERR(v9ses->root_fid)) {
200 retval = PTR_ERR(v9ses->root_fid);
201 goto release_sb;
202 }
190 203
191 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 204 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
192 return dget(sb->s_root); 205 return dget(sb->s_root);
@@ -197,15 +210,11 @@ close_session:
197 v9fs_session_close(v9ses); 210 v9fs_session_close(v9ses);
198 kfree(v9ses); 211 kfree(v9ses);
199 return ERR_PTR(retval); 212 return ERR_PTR(retval);
200
201release_sb: 213release_sb:
202 /* 214 /*
203 * we will do the session_close and root dentry release 215 * we will do the session_close and root dentry
204 * in the below call. But we need to clunk fid, because we haven't 216 * release in the below call.
205 * attached the fid to dentry so it won't get clunked
206 * automatically.
207 */ 217 */
208 p9_client_clunk(fid);
209 deactivate_locked_super(sb); 218 deactivate_locked_super(sb);
210 return ERR_PTR(retval); 219 return ERR_PTR(retval);
211} 220}
@@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s)
223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 232 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
224 233
225 kill_anon_super(s); 234 kill_anon_super(s);
226 235 p9_client_clunk(v9ses->root_fid);
227 v9fs_session_cancel(v9ses); 236 v9fs_session_cancel(v9ses);
228 v9fs_session_close(v9ses); 237 v9fs_session_close(v9ses);
229 kfree(v9ses); 238 kfree(v9ses);
@@ -276,11 +285,31 @@ done:
276 return res; 285 return res;
277} 286}
278 287
288static int v9fs_sync_fs(struct super_block *sb, int wait)
289{
290 struct v9fs_session_info *v9ses = sb->s_fs_info;
291
292 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
293 return p9_client_sync_fs(v9ses->root_fid);
294}
295
296static int v9fs_drop_inode(struct inode *inode)
297{
298 struct v9fs_session_info *v9ses;
299 v9ses = v9fs_inode2v9ses(inode);
300 if (v9ses->cache)
301 return generic_drop_inode(inode);
302 /*
 303	 * in case of non-cached mode always drop the
 304	 * inode because we want the inode attributes
305 * to always match that on the server.
306 */
307 return 1;
308}
309
279static const struct super_operations v9fs_super_ops = { 310static const struct super_operations v9fs_super_ops = {
280#ifdef CONFIG_9P_FSCACHE
281 .alloc_inode = v9fs_alloc_inode, 311 .alloc_inode = v9fs_alloc_inode,
282 .destroy_inode = v9fs_destroy_inode, 312 .destroy_inode = v9fs_destroy_inode,
283#endif
284 .statfs = simple_statfs, 313 .statfs = simple_statfs,
285 .evict_inode = v9fs_evict_inode, 314 .evict_inode = v9fs_evict_inode,
286 .show_options = generic_show_options, 315 .show_options = generic_show_options,
@@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = {
288}; 317};
289 318
290static const struct super_operations v9fs_super_ops_dotl = { 319static const struct super_operations v9fs_super_ops_dotl = {
291#ifdef CONFIG_9P_FSCACHE
292 .alloc_inode = v9fs_alloc_inode, 320 .alloc_inode = v9fs_alloc_inode,
293 .destroy_inode = v9fs_destroy_inode, 321 .destroy_inode = v9fs_destroy_inode,
294#endif 322 .sync_fs = v9fs_sync_fs,
295 .statfs = v9fs_statfs, 323 .statfs = v9fs_statfs,
324 .drop_inode = v9fs_drop_inode,
296 .evict_inode = v9fs_evict_inode, 325 .evict_inode = v9fs_evict_inode,
297 .show_options = generic_show_options, 326 .show_options = generic_show_options,
298 .umount_begin = v9fs_umount_begin, 327 .umount_begin = v9fs_umount_begin,
@@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = {
303 .mount = v9fs_mount, 332 .mount = v9fs_mount,
304 .kill_sb = v9fs_kill_super, 333 .kill_sb = v9fs_kill_super,
305 .owner = THIS_MODULE, 334 .owner = THIS_MODULE,
306 .fs_flags = FS_RENAME_DOES_D_MOVE, 335 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
307}; 336};
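The super.c changes above give the dotl super_operations a drop_inode hook: cached mounts keep the generic behaviour, while uncached mounts always drop the inode at the final iput so the next lookup re-reads attributes from the server. A small sketch of that decision follows; the function names and the simplified view of generic_drop_inode() are illustrative.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether an inode should be dropped from the cache when its
 * last reference goes away.  Returning true ("drop") means the next
 * access must re-fetch attributes from the server.
 */
static bool should_drop_inode(bool mount_is_cached, bool inode_unhashed)
{
    if (mount_is_cached)
        return inode_unhashed;   /* roughly what generic_drop_inode() looks at */
    return true;                 /* uncached: always drop, stay coherent */
}

int main(void)
{
    printf("cached mount, hashed inode   -> drop=%d\n",
           should_drop_inode(true, false));
    printf("uncached mount, hashed inode -> drop=%d\n",
           should_drop_inode(false, false));
    return 0;
}
```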
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..f3aa9b08b228 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
@@ -187,6 +187,7 @@ source "fs/omfs/Kconfig"
187source "fs/hpfs/Kconfig" 187source "fs/hpfs/Kconfig"
188source "fs/qnx4/Kconfig" 188source "fs/qnx4/Kconfig"
189source "fs/romfs/Kconfig" 189source "fs/romfs/Kconfig"
190source "fs/pstore/Kconfig"
190source "fs/sysv/Kconfig" 191source "fs/sysv/Kconfig"
191source "fs/ufs/Kconfig" 192source "fs/ufs/Kconfig"
192source "fs/exofs/Kconfig" 193source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..fb68c2b8cf8a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
@@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
121obj-$(CONFIG_GFS2_FS) += gfs2/ 123obj-$(CONFIG_GFS2_FS) += gfs2/
122obj-$(CONFIG_EXOFS_FS) += exofs/ 124obj-$(CONFIG_EXOFS_FS) += exofs/
123obj-$(CONFIG_CEPH_FS) += ceph/ 125obj-$(CONFIG_CEPH_FS) += ceph/
126obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf2..e55182a74605 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
5 help 4 help
6 The Acorn Disc Filing System is the standard file system of the 5 The Acorn Disc Filing System is the standard file system of the
7 RiscOS operating system which runs on Acorn's ARM-based Risc PC 6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed780..3d83075aaa2e 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
9 * 9 *
10 * Common directory handling for ADFS 10 * Common directory handling for ADFS
11 */ 11 */
12#include <linux/smp_lock.h>
13#include "adfs.h" 12#include "adfs.h"
14 13
15/* 14/*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
27 struct adfs_dir dir; 26 struct adfs_dir dir;
28 int ret = 0; 27 int ret = 0;
29 28
30 lock_kernel();
31
32 if (filp->f_pos >> 32) 29 if (filp->f_pos >> 32)
33 goto out; 30 goto out;
34 31
@@ -70,7 +67,6 @@ free_out:
70 ops->free(&dir); 67 ops->free(&dir);
71 68
72out: 69out:
73 unlock_kernel();
74 return ret; 70 return ret;
75} 71}
76 72
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 272 struct object_info obj;
277 int error; 273 int error;
278 274
279 lock_kernel();
280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 275 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
281 if (error == 0) { 276 if (error == 0) {
282 error = -EACCES; 277 error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
288 if (inode) 283 if (inode)
289 error = 0; 284 error = 0;
290 } 285 }
291 unlock_kernel();
292 d_add(dentry, inode); 286 d_add(dentry, inode);
293 return ERR_PTR(error); 287 return ERR_PTR(error);
294} 288}
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..09fe40198d1c 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
12#include <linux/writeback.h> 11#include <linux/writeback.h>
13#include "adfs.h" 12#include "adfs.h"
@@ -316,8 +315,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
316 unsigned int ia_valid = attr->ia_valid; 315 unsigned int ia_valid = attr->ia_valid;
317 int error; 316 int error;
318 317
319 lock_kernel();
320
321 error = inode_change_ok(inode, attr); 318 error = inode_change_ok(inode, attr);
322 319
323 /* 320 /*
@@ -359,7 +356,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
359 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) 356 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
360 mark_inode_dirty(inode); 357 mark_inode_dirty(inode);
361out: 358out:
362 unlock_kernel();
363 return error; 359 return error;
364} 360}
365 361
@@ -374,7 +370,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
374 struct object_info obj; 370 struct object_info obj;
375 int ret; 371 int ret;
376 372
377 lock_kernel();
378 obj.file_id = inode->i_ino; 373 obj.file_id = inode->i_ino;
379 obj.name_len = 0; 374 obj.name_len = 0;
380 obj.parent_id = ADFS_I(inode)->parent_id; 375 obj.parent_id = ADFS_I(inode)->parent_id;
@@ -384,6 +379,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
384 obj.size = inode->i_size; 379 obj.size = inode->i_size;
385 380
386 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); 381 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
387 unlock_kernel();
388 return ret; 382 return ret;
389} 383}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fbe..06d7388b477b 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/statfs.h> 17#include <linux/statfs.h>
19#include "adfs.h" 18#include "adfs.h"
20#include "dir_f.h" 19#include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
120 int i; 119 int i;
121 struct adfs_sb_info *asb = ADFS_SB(sb); 120 struct adfs_sb_info *asb = ADFS_SB(sb);
122 121
123 lock_kernel();
124
125 for (i = 0; i < asb->s_map_size; i++) 122 for (i = 0; i < asb->s_map_size; i++)
126 brelse(asb->s_map[i].dm_bh); 123 brelse(asb->s_map[i].dm_bh);
127 kfree(asb->s_map); 124 kfree(asb->s_map);
128 kfree(asb); 125 kfree(asb);
129 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
130
131 unlock_kernel();
132} 127}
133 128
134static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 129static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -359,15 +354,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
359 struct adfs_sb_info *asb; 354 struct adfs_sb_info *asb;
360 struct inode *root; 355 struct inode *root;
361 356
362 lock_kernel();
363
364 sb->s_flags |= MS_NODIRATIME; 357 sb->s_flags |= MS_NODIRATIME;
365 358
366 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 359 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
367 if (!asb) { 360 if (!asb)
368 unlock_kernel();
369 return -ENOMEM; 361 return -ENOMEM;
370 }
371 sb->s_fs_info = asb; 362 sb->s_fs_info = asb;
372 363
373 /* set default options */ 364 /* set default options */
@@ -485,7 +476,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
485 adfs_error(sb, "get root inode failed\n"); 476 adfs_error(sb, "get root inode failed\n");
486 goto error; 477 goto error;
487 } 478 }
488 unlock_kernel();
489 return 0; 479 return 0;
490 480
491error_free_bh: 481error_free_bh:
@@ -493,7 +483,6 @@ error_free_bh:
493error: 483error:
494 sb->s_fs_info = NULL; 484 sb->s_fs_info = NULL;
495 kfree(asb); 485 kfree(asb);
496 unlock_kernel();
497 return -EINVAL; 486 return -EINVAL;
498} 487}
499 488
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
140 candidate->first = candidate->last = index; 140 candidate->first = candidate->last = index;
141 candidate->offset_first = from; 141 candidate->offset_first = from;
142 candidate->to_last = to; 142 candidate->to_last = to;
143 INIT_LIST_HEAD(&candidate->link);
143 candidate->usage = 1; 144 candidate->usage = 1;
144 candidate->state = AFS_WBACK_PENDING; 145 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 146 init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..7f54f43b8f7c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -85,7 +85,7 @@ static int __init aio_setup(void)
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 87
88 aio_wq = create_workqueue("aio"); 88 aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
90 BUG_ON(!aio_wq || !abe_pool); 90 BUG_ON(!aio_wq || !abe_pool);
91 91
@@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx)
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 239 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240} 240}
241 241
242#define get_ioctx(kioctx) do { \ 242static inline void get_ioctx(struct kioctx *kioctx)
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 243{
244 atomic_inc(&(kioctx)->users); \ 244 BUG_ON(atomic_read(&kioctx->users) <= 0);
245} while (0) 245 atomic_inc(&kioctx->users);
246#define put_ioctx(kioctx) do { \ 246}
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 247
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 248static inline int try_get_ioctx(struct kioctx *kioctx)
249 __put_ioctx(kioctx); \ 249{
250} while (0) 250 return atomic_inc_not_zero(&kioctx->users);
251}
252
253static inline void put_ioctx(struct kioctx *kioctx)
254{
255 BUG_ON(atomic_read(&kioctx->users) <= 0);
256 if (unlikely(atomic_dec_and_test(&kioctx->users)))
257 __put_ioctx(kioctx);
258}
251 259
252/* ioctx_alloc 260/* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 261 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
@@ -569,7 +577,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
569 spin_lock(&fput_lock); 577 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 578 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 579 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 580 schedule_work(&fput_work);
573 } else { 581 } else {
574 req->ki_filp = NULL; 582 req->ki_filp = NULL;
575 really_put_req(ctx, req); 583 really_put_req(ctx, req);
@@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
601 rcu_read_lock(); 609 rcu_read_lock();
602 610
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 611 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 612 /*
605 get_ioctx(ctx); 613 * RCU protects us against accessing freed memory but
614 * we have to be careful not to get a reference when the
615 * reference count already dropped to 0 (ctx->dead test
616 * is unreliable because of races).
617 */
618 if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
606 ret = ctx; 619 ret = ctx;
607 break; 620 break;
608 } 621 }
@@ -1629,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1629 goto out_put_req; 1642 goto out_put_req;
1630 1643
1631 spin_lock_irq(&ctx->ctx_lock); 1644 spin_lock_irq(&ctx->ctx_lock);
1645 /*
1646 * We could have raced with io_destroy() and are currently holding a
1647 * reference to ctx which should be destroyed. We cannot submit IO
1648 * since ctx gets freed as soon as io_submit() puts its reference. The
1649 * check here is reliable: io_destroy() sets ctx->dead before waiting
1650 * for outstanding IO and the barrier between these two is realized by
1651 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
1652 * increment ctx->reqs_active before checking for ctx->dead and the
1653 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
1654 * don't see ctx->dead set here, io_destroy() waits for our IO to
1655 * finish.
1656 */
1657 if (ctx->dead) {
1658 spin_unlock_irq(&ctx->ctx_lock);
1659 ret = -EINVAL;
1660 goto out_put_req;
1661 }
1632 aio_run_iocb(req); 1662 aio_run_iocb(req);
1633 if (!list_empty(&ctx->run_list)) { 1663 if (!list_empty(&ctx->run_list)) {
1634 /* drain the run list */ 1664 /* drain the run list */
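The aio.c hunks above replace the get_ioctx/put_ioctx macros with inline functions and make the RCU lookup take its reference with atomic_inc_not_zero(), so a context whose count already hit zero is never resurrected. Here is a compact userspace sketch of that lookup-side rule using C11 atomics instead of the kernel primitives; the struct and function names are invented for the example.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ctx { atomic_int users; };

/*
 * Only take a reference if the count is still non-zero; a reader that
 * races with the final put must not bring the object back to life.
 */
static bool try_get_ctx(struct ctx *c)
{
    int old = atomic_load(&c->users);
    while (old > 0) {
        if (atomic_compare_exchange_weak(&c->users, &old, old + 1))
            return true;
    }
    return false;   /* count already dropped to zero: lookup fails */
}

static bool put_ctx(struct ctx *c)
{
    return atomic_fetch_sub(&c->users, 1) == 1;  /* true: caller frees */
}

int main(void)
{
    struct ctx c = { .users = 1 };

    printf("first lookup: %s\n", try_get_ctx(&c) ? "got ref" : "missed");
    put_ctx(&c);                    /* drop the lookup's reference */
    put_ctx(&c);                    /* drop the original reference: now 0 */
    printf("late lookup:  %s\n", try_get_ctx(&c) ? "got ref" : "missed");
    return 0;
}
```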
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4fb8a3431531..889287019599 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret) 874 if (ret)
875 goto out_del; 875 goto out_del;
876 /*
877 * bdev could be deleted beneath us which would implicitly destroy
878 * the holder directory. Hold on to it.
879 */
880 kobject_get(bdev->bd_part->holder_dir);
876 881
877 list_add(&holder->list, &bdev->bd_holder_disks); 882 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock; 883 goto out_unlock;
@@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 914 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir, 915 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj); 916 &disk_to_dev(disk)->kobj);
917 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 918 list_del_init(&holder->list);
913 kfree(holder); 919 kfree(holder);
914 } 920 }
@@ -922,14 +928,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
922 * flush_disk - invalidates all buffer-cache entries on a disk 928 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 929 *
924 * @bdev: struct block device to be flushed 930 * @bdev: struct block device to be flushed
931 * @kill_dirty: flag to guide handling of dirty inodes
925 * 932 *
926 * Invalidates all buffer-cache entries on a disk. It should be called 933 * Invalidates all buffer-cache entries on a disk. It should be called
927 * when a disk has been changed -- either by a media change or online 934 * when a disk has been changed -- either by a media change or online
928 * resize. 935 * resize.
929 */ 936 */
930static void flush_disk(struct block_device *bdev) 937static void flush_disk(struct block_device *bdev, bool kill_dirty)
931{ 938{
932 if (__invalidate_device(bdev)) { 939 if (__invalidate_device(bdev, kill_dirty)) {
933 char name[BDEVNAME_SIZE] = ""; 940 char name[BDEVNAME_SIZE] = "";
934 941
935 if (bdev->bd_disk) 942 if (bdev->bd_disk)
@@ -966,7 +973,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
966 "%s: detected capacity change from %lld to %lld\n", 973 "%s: detected capacity change from %lld to %lld\n",
967 name, bdev_size, disk_size); 974 name, bdev_size, disk_size);
968 i_size_write(bdev->bd_inode, disk_size); 975 i_size_write(bdev->bd_inode, disk_size);
969 flush_disk(bdev); 976 flush_disk(bdev, false);
970 } 977 }
971} 978}
972EXPORT_SYMBOL(check_disk_size_change); 979EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1026,7 @@ int check_disk_change(struct block_device *bdev)
1019 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1026 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1020 return 0; 1027 return 0;
1021 1028
1022 flush_disk(bdev); 1029 flush_disk(bdev, true);
1023 if (bdops->revalidate_disk) 1030 if (bdops->revalidate_disk)
1024 bdops->revalidate_disk(bdev->bd_disk); 1031 bdops->revalidate_disk(bdev->bd_disk);
1025 return 1; 1032 return 1;
@@ -1600,7 +1607,7 @@ fail:
1600} 1607}
1601EXPORT_SYMBOL(lookup_bdev); 1608EXPORT_SYMBOL(lookup_bdev);
1602 1609
1603int __invalidate_device(struct block_device *bdev) 1610int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1604{ 1611{
1605 struct super_block *sb = get_super(bdev); 1612 struct super_block *sb = get_super(bdev);
1606 int res = 0; 1613 int res = 0;
@@ -1613,7 +1620,7 @@ int __invalidate_device(struct block_device *bdev)
1613 * hold). 1620 * hold).
1614 */ 1621 */
1615 shrink_dcache_sb(sb); 1622 shrink_dcache_sb(sb);
1616 res = invalidate_inodes(sb); 1623 res = invalidate_inodes(sb, kill_dirty);
1617 drop_super(sb); 1624 drop_super(sb);
1618 } 1625 }
1619 invalidate_bdev(bdev); 1626 invalidate_bdev(bdev);
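flush_disk() now carries a kill_dirty flag down through __invalidate_device() to invalidate_inodes(): a media change discards even dirty inodes (the medium is gone), while a pure capacity change keeps them. The following is a tiny sketch of that flag propagation with invented data structures, not the kernel's inode-invalidation code.

```c
#include <stdbool.h>
#include <stdio.h>

/* pretend inode table for one block device */
static bool inode_dirty[3] = { false, true, false };
static bool inode_valid[3] = { true, true, true };

/* drop cached inodes; only touch dirty ones when kill_dirty is set */
static int invalidate_inodes_sketch(bool kill_dirty)
{
    int busy = 0;
    for (int i = 0; i < 3; i++) {
        if (inode_dirty[i] && !kill_dirty) {
            busy++;              /* leave dirty data alone on a resize */
            continue;
        }
        inode_valid[i] = false;  /* discard the cached inode */
    }
    return busy;
}

int main(void)
{
    /* size change keeps dirty inodes; media change kills them too */
    printf("resize:       busy=%d\n", invalidate_inodes_sketch(false));
    printf("media change: busy=%d\n", invalidate_inodes_sketch(true));
    return 0;
}
```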
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..7f78cc78fdd0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 730 account */
731 731
732 /*
733 * we bump reservation progress every time we decrement
734 * bytes_reserved. This way people waiting for reservations
735 * know something good has happened and they can check
736 * for progress. The number here isn't to be trusted, it
737 * just shows reclaim activity
738 */
739 unsigned long reservation_progress;
740
732 int full; /* indicates that we cannot allocate any more 741 int full; /* indicates that we cannot allocate any more
733 chunks for this space */ 742 chunks for this space */
734 int force_alloc; /* set if we need to force a chunk alloc for 743 int force_alloc; /* set if we need to force a chunk alloc for
@@ -1254,6 +1263,7 @@ struct btrfs_root {
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) 1263#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1264#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1265#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1266#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1257 1267
1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1268#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1269#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end); 2228 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 2229int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes); 2230 u64 num_bytes);
2231int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2232 struct btrfs_root *root, u64 type);
2221 2233
2222/* ctree.c */ 2234/* ctree.c */
2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2235int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b2..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
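The export.c change makes btrfs_encode_fh() report the required handle size back through *max_len instead of silently failing, so callers can retry with a big enough buffer. Below is a generic sketch of that convention; the size constants and return codes are placeholders that only mirror the shape of the exportfs interface.

```c
#include <stdio.h>

#define FID_SIZE_NON_CONNECTABLE 3   /* illustrative sizes, in 32-bit words */
#define FID_SIZE_CONNECTABLE     6

/*
 * Returns a handle type on success.  On a too-small buffer it writes
 * the size that would have been needed into *max_len and returns 255,
 * mirroring the convention the diff adopts.
 */
static int encode_fh_sketch(int *max_len, int connectable)
{
    if (connectable && *max_len < FID_SIZE_CONNECTABLE) {
        *max_len = FID_SIZE_CONNECTABLE;
        return 255;
    } else if (*max_len < FID_SIZE_NON_CONNECTABLE) {
        *max_len = FID_SIZE_NON_CONNECTABLE;
        return 255;
    }
    *max_len = connectable ? FID_SIZE_CONNECTABLE : FID_SIZE_NON_CONNECTABLE;
    return 1;   /* stand-in for a real FILEID_* type */
}

int main(void)
{
    int len = 2;
    int type = encode_fh_sketch(&len, 1);
    printf("first try: type=%d, need %d words\n", type, len);
    type = encode_fh_sketch(&len, 1);
    printf("retry:     type=%d, used %d words\n", type, len);
    return 0;
}
```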
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f3c96fc01439..7b3089b5c2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3342,15 +3342,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3342 u64 max_reclaim; 3342 u64 max_reclaim;
3343 u64 reclaimed = 0; 3343 u64 reclaimed = 0;
3344 long time_left; 3344 long time_left;
3345 int pause = 1;
3346 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3347 int loops = 0; 3346 int loops = 0;
3347 unsigned long progress;
3348 3348
3349 block_rsv = &root->fs_info->delalloc_block_rsv; 3349 block_rsv = &root->fs_info->delalloc_block_rsv;
3350 space_info = block_rsv->space_info; 3350 space_info = block_rsv->space_info;
3351 3351
3352 smp_mb(); 3352 smp_mb();
3353 reserved = space_info->bytes_reserved; 3353 reserved = space_info->bytes_reserved;
3354 progress = space_info->reservation_progress;
3354 3355
3355 if (reserved == 0) 3356 if (reserved == 0)
3356 return 0; 3357 return 0;
@@ -3365,31 +3366,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3365 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3366 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3366 3367
3367 spin_lock(&space_info->lock); 3368 spin_lock(&space_info->lock);
3368 if (reserved > space_info->bytes_reserved) { 3369 if (reserved > space_info->bytes_reserved)
3369 loops = 0;
3370 reclaimed += reserved - space_info->bytes_reserved; 3370 reclaimed += reserved - space_info->bytes_reserved;
3371 } else {
3372 loops++;
3373 }
3374 reserved = space_info->bytes_reserved; 3371 reserved = space_info->bytes_reserved;
3375 spin_unlock(&space_info->lock); 3372 spin_unlock(&space_info->lock);
3376 3373
3374 loops++;
3375
3377 if (reserved == 0 || reclaimed >= max_reclaim) 3376 if (reserved == 0 || reclaimed >= max_reclaim)
3378 break; 3377 break;
3379 3378
3380 if (trans && trans->transaction->blocked) 3379 if (trans && trans->transaction->blocked)
3381 return -EAGAIN; 3380 return -EAGAIN;
3382 3381
3383 __set_current_state(TASK_INTERRUPTIBLE); 3382 time_left = schedule_timeout_interruptible(1);
3384 time_left = schedule_timeout(pause);
3385 3383
3386 /* We were interrupted, exit */ 3384 /* We were interrupted, exit */
3387 if (time_left) 3385 if (time_left)
3388 break; 3386 break;
3389 3387
3390 pause <<= 1; 3388 /* we've kicked the IO a few times, if anything has been freed,
3391 if (pause > HZ / 10) 3389 * exit. There is no sense in looping here for a long time
3392 pause = HZ / 10; 3390 * when we really need to commit the transaction, or there are
3391 * just too many writers without enough free space
3392 */
3393
3394 if (loops > 3) {
3395 smp_mb();
3396 if (progress != space_info->reservation_progress)
3397 break;
3398 }
3393 3399
3394 } 3400 }
3395 return reclaimed >= to_reclaim; 3401 return reclaimed >= to_reclaim;
@@ -3612,6 +3618,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3612 if (num_bytes) { 3618 if (num_bytes) {
3613 spin_lock(&space_info->lock); 3619 spin_lock(&space_info->lock);
3614 space_info->bytes_reserved -= num_bytes; 3620 space_info->bytes_reserved -= num_bytes;
3621 space_info->reservation_progress++;
3615 spin_unlock(&space_info->lock); 3622 spin_unlock(&space_info->lock);
3616 } 3623 }
3617 } 3624 }
@@ -3844,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3844 if (block_rsv->reserved >= block_rsv->size) { 3851 if (block_rsv->reserved >= block_rsv->size) {
3845 num_bytes = block_rsv->reserved - block_rsv->size; 3852 num_bytes = block_rsv->reserved - block_rsv->size;
3846 sinfo->bytes_reserved -= num_bytes; 3853 sinfo->bytes_reserved -= num_bytes;
3854 sinfo->reservation_progress++;
3847 block_rsv->reserved = block_rsv->size; 3855 block_rsv->reserved = block_rsv->size;
3848 block_rsv->full = 1; 3856 block_rsv->full = 1;
3849 } 3857 }
@@ -4005,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4005 to_reserve = 0; 4013 to_reserve = 0;
4006 } 4014 }
4007 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4015 spin_unlock(&BTRFS_I(inode)->accounting_lock);
4008
4009 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4016 to_reserve += calc_csum_metadata_size(inode, num_bytes);
4010 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4011 if (ret) 4018 if (ret)
@@ -4133,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4133 btrfs_set_block_group_used(&cache->item, old_val); 4140 btrfs_set_block_group_used(&cache->item, old_val);
4134 cache->reserved -= num_bytes; 4141 cache->reserved -= num_bytes;
4135 cache->space_info->bytes_reserved -= num_bytes; 4142 cache->space_info->bytes_reserved -= num_bytes;
4143 cache->space_info->reservation_progress++;
4136 cache->space_info->bytes_used += num_bytes; 4144 cache->space_info->bytes_used += num_bytes;
4137 cache->space_info->disk_used += num_bytes * factor; 4145 cache->space_info->disk_used += num_bytes * factor;
4138 spin_unlock(&cache->lock); 4146 spin_unlock(&cache->lock);
@@ -4184,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root,
4184 if (reserved) { 4192 if (reserved) {
4185 cache->reserved -= num_bytes; 4193 cache->reserved -= num_bytes;
4186 cache->space_info->bytes_reserved -= num_bytes; 4194 cache->space_info->bytes_reserved -= num_bytes;
4195 cache->space_info->reservation_progress++;
4187 } 4196 }
4188 spin_unlock(&cache->lock); 4197 spin_unlock(&cache->lock);
4189 spin_unlock(&cache->space_info->lock); 4198 spin_unlock(&cache->space_info->lock);
@@ -4234,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4234 space_info->bytes_readonly += num_bytes; 4243 space_info->bytes_readonly += num_bytes;
4235 cache->reserved -= num_bytes; 4244 cache->reserved -= num_bytes;
4236 space_info->bytes_reserved -= num_bytes; 4245 space_info->bytes_reserved -= num_bytes;
4246 space_info->reservation_progress++;
4237 } 4247 }
4238 spin_unlock(&cache->lock); 4248 spin_unlock(&cache->lock);
4239 spin_unlock(&space_info->lock); 4249 spin_unlock(&space_info->lock);
@@ -4712,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4712 if (ret) { 4722 if (ret) {
4713 spin_lock(&cache->space_info->lock); 4723 spin_lock(&cache->space_info->lock);
4714 cache->space_info->bytes_reserved -= buf->len; 4724 cache->space_info->bytes_reserved -= buf->len;
4725 cache->space_info->reservation_progress++;
4715 spin_unlock(&cache->space_info->lock); 4726 spin_unlock(&cache->space_info->lock);
4716 } 4727 }
4717 goto out; 4728 goto out;
@@ -5376,7 +5387,7 @@ again:
5376 num_bytes, data, 1); 5387 num_bytes, data, 1);
5377 goto again; 5388 goto again;
5378 } 5389 }
5379 if (ret == -ENOSPC) { 5390 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5380 struct btrfs_space_info *sinfo; 5391 struct btrfs_space_info *sinfo;
5381 5392
5382 sinfo = __find_space_info(root->fs_info, data); 5393 sinfo = __find_space_info(root->fs_info, data);
@@ -8065,6 +8076,13 @@ out:
8065 return ret; 8076 return ret;
8066} 8077}
8067 8078
8079int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8080 struct btrfs_root *root, u64 type)
8081{
8082 u64 alloc_flags = get_alloc_profile(root, type);
8083 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8084}
8085
8068/* 8086/*
8069 * helper to account the unused space of all the readonly block group in the 8087 * helper to account the unused space of all the readonly block group in the
8070 * list. takes mirrors into account. 8088 * list. takes mirrors into account.
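The extent-tree.c hunks above change shrink_delalloc() to sample space_info->reservation_progress up front and, after a few passes, stop as soon as the counter has moved, because any movement means some reservation was released elsewhere. A minimal sketch of that "exit once a shared progress counter changes" loop is below; the thread that bumps the counter is simulated and all names are invented.

```c
#include <stdio.h>

static unsigned long reservation_progress;  /* bumped by the release path */

/* simulate another thread releasing reservations now and then */
static void maybe_release(int iteration)
{
    if (iteration == 4)
        reservation_progress++;
}

static int shrink_sketch(void)
{
    unsigned long progress = reservation_progress;  /* sample once */
    int loops = 0;

    for (;;) {
        maybe_release(loops);          /* stands in for kicking writeback */
        loops++;

        /*
         * after a few attempts, stop as soon as anyone has made
         * progress instead of spinning here for a long time
         */
        if (loops > 3 && progress != reservation_progress)
            return loops;
        if (loops > 32)
            return -1;                 /* give up entirely */
    }
}

int main(void)
{
    printf("stopped after %d loops\n", shrink_sketch());
    return 0;
}
```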
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 92ac5192c518..714adc4ac4c2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1433 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1434u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1435 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1436 unsigned long bits, int contig)
1437{ 1437{
1438 struct rb_node *node; 1438 struct rb_node *node;
1439 struct extent_state *state; 1439 struct extent_state *state;
1440 u64 cur_start = *start; 1440 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1441 u64 total_bytes = 0;
1442 u64 last = 0;
1442 int found = 0; 1443 int found = 0;
1443 1444
1444 if (search_end <= cur_start) { 1445 if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1464 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1465 if (state->start > search_end)
1465 break; 1466 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1467 if (contig && found && state->start > last + 1)
1468 break;
1469 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1470 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1471 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1472 if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1475 *start = state->start;
1473 found = 1; 1476 found = 1;
1474 } 1477 }
1478 last = state->end;
1479 } else if (contig && found) {
1480 break;
1475 } 1481 }
1476 node = rb_next(node); 1482 node = rb_next(node);
1477 if (!node) 1483 if (!node)
@@ -2912,6 +2918,46 @@ out:
2912 return sector; 2918 return sector;
2913} 2919}
2914 2920
2921/*
2922 * helper function for fiemap, which doesn't want to see any holes.
2923 * This maps until we find something past 'last'
2924 */
2925static struct extent_map *get_extent_skip_holes(struct inode *inode,
2926 u64 offset,
2927 u64 last,
2928 get_extent_t *get_extent)
2929{
2930 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2931 struct extent_map *em;
2932 u64 len;
2933
2934 if (offset >= last)
2935 return NULL;
2936
2937 while(1) {
2938 len = last - offset;
2939 if (len == 0)
2940 break;
2941 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2942 em = get_extent(inode, NULL, 0, offset, len, 0);
2943 if (!em || IS_ERR(em))
2944 return em;
2945
2946 /* if this isn't a hole return it */
2947 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2948 em->block_start != EXTENT_MAP_HOLE) {
2949 return em;
2950 }
2951
2952 /* this is a hole, advance to the next extent */
2953 offset = extent_map_end(em);
2954 free_extent_map(em);
2955 if (offset >= last)
2956 break;
2957 }
2958 return NULL;
2959}
2960
2915int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2961int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2916 __u64 start, __u64 len, get_extent_t *get_extent) 2962 __u64 start, __u64 len, get_extent_t *get_extent)
2917{ 2963{
@@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2921 u32 flags = 0; 2967 u32 flags = 0;
2922 u32 found_type; 2968 u32 found_type;
2923 u64 last; 2969 u64 last;
2970 u64 last_for_get_extent = 0;
2924 u64 disko = 0; 2971 u64 disko = 0;
2972 u64 isize = i_size_read(inode);
2925 struct btrfs_key found_key; 2973 struct btrfs_key found_key;
2926 struct extent_map *em = NULL; 2974 struct extent_map *em = NULL;
2927 struct extent_state *cached_state = NULL; 2975 struct extent_state *cached_state = NULL;
2928 struct btrfs_path *path; 2976 struct btrfs_path *path;
2929 struct btrfs_file_extent_item *item; 2977 struct btrfs_file_extent_item *item;
2930 int end = 0; 2978 int end = 0;
2931 u64 em_start = 0, em_len = 0; 2979 u64 em_start = 0;
2980 u64 em_len = 0;
2981 u64 em_end = 0;
2932 unsigned long emflags; 2982 unsigned long emflags;
2933 int hole = 0;
2934 2983
2935 if (len == 0) 2984 if (len == 0)
2936 return -EINVAL; 2985 return -EINVAL;
@@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2940 return -ENOMEM; 2989 return -ENOMEM;
2941 path->leave_spinning = 1; 2990 path->leave_spinning = 1;
2942 2991
2992 /*
2993 * lookup the last file extent. We're not using i_size here
2994 * because there might be preallocation past i_size
2995 */
2943 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2996 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2944 path, inode->i_ino, -1, 0); 2997 path, inode->i_ino, -1, 0);
2945 if (ret < 0) { 2998 if (ret < 0) {
@@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3006 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2954 found_type = btrfs_key_type(&found_key); 3007 found_type = btrfs_key_type(&found_key);
2955 3008
2956 /* No extents, just return */ 3009 /* No extents, but there might be delalloc bits */
2957 if (found_key.objectid != inode->i_ino || 3010 if (found_key.objectid != inode->i_ino ||
2958 found_type != BTRFS_EXTENT_DATA_KEY) { 3011 found_type != BTRFS_EXTENT_DATA_KEY) {
2959 btrfs_free_path(path); 3012 /* have to trust i_size as the end */
2960 return 0; 3013 last = (u64)-1;
3014 last_for_get_extent = isize;
3015 } else {
3016 /*
3017 * remember the start of the last extent. There are a
3018 * bunch of different factors that go into the length of the
3019 * extent, so its much less complex to remember where it started
3020 */
3021 last = found_key.offset;
3022 last_for_get_extent = last + 1;
2961 } 3023 }
2962 last = found_key.offset;
2963 btrfs_free_path(path); 3024 btrfs_free_path(path);
2964 3025
3026 /*
3027 * we might have some extents allocated but more delalloc past those
3028 * extents. so, we trust isize unless the start of the last extent is
3029 * beyond isize
3030 */
3031 if (last < isize) {
3032 last = (u64)-1;
3033 last_for_get_extent = isize;
3034 }
3035
2965 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3036 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2966 &cached_state, GFP_NOFS); 3037 &cached_state, GFP_NOFS);
2967 em = get_extent(inode, NULL, 0, off, max - off, 0); 3038
3039 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3040 get_extent);
2968 if (!em) 3041 if (!em)
2969 goto out; 3042 goto out;
2970 if (IS_ERR(em)) { 3043 if (IS_ERR(em)) {
@@ -2973,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2973 } 3046 }
2974 3047
2975 while (!end) { 3048 while (!end) {
2976 hole = 0; 3049 u64 offset_in_extent;
2977 off = em->start + em->len;
2978 if (off >= max)
2979 end = 1;
2980 3050
2981 if (em->block_start == EXTENT_MAP_HOLE) { 3051 /* break if the extent we found is outside the range */
2982 hole = 1; 3052 if (em->start >= max || extent_map_end(em) < off)
2983 goto next; 3053 break;
2984 }
2985 3054
2986 em_start = em->start; 3055 /*
2987 em_len = em->len; 3056 * get_extent may return an extent that starts before our
3057 * requested range. We have to make sure the ranges
3058 * we return to fiemap always move forward and don't
3059 * overlap, so adjust the offsets here
3060 */
3061 em_start = max(em->start, off);
2988 3062
3063 /*
3064 * record the offset from the start of the extent
3065 * for adjusting the disk offset below
3066 */
3067 offset_in_extent = em_start - em->start;
3068 em_end = extent_map_end(em);
3069 em_len = em_end - em_start;
3070 emflags = em->flags;
2989 disko = 0; 3071 disko = 0;
2990 flags = 0; 3072 flags = 0;
2991 3073
3074 /*
3075 * bump off for our next call to get_extent
3076 */
3077 off = extent_map_end(em);
3078 if (off >= max)
3079 end = 1;
3080
2992 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2993 end = 1; 3082 end = 1;
2994 flags |= FIEMAP_EXTENT_LAST; 3083 flags |= FIEMAP_EXTENT_LAST;
@@ -2999,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2999 flags |= (FIEMAP_EXTENT_DELALLOC | 3088 flags |= (FIEMAP_EXTENT_DELALLOC |
3000 FIEMAP_EXTENT_UNKNOWN); 3089 FIEMAP_EXTENT_UNKNOWN);
3001 } else { 3090 } else {
3002 disko = em->block_start; 3091 disko = em->block_start + offset_in_extent;
3003 } 3092 }
3004 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3005 flags |= FIEMAP_EXTENT_ENCODED; 3094 flags |= FIEMAP_EXTENT_ENCODED;
3006 3095
3007next:
3008 emflags = em->flags;
3009 free_extent_map(em); 3096 free_extent_map(em);
3010 em = NULL; 3097 em = NULL;
3011 if (!end) { 3098 if ((em_start >= last) || em_len == (u64)-1 ||
3012 em = get_extent(inode, NULL, 0, off, max - off, 0); 3099 (last == (u64)-1 && isize <= em_end)) {
3013 if (!em)
3014 goto out;
3015 if (IS_ERR(em)) {
3016 ret = PTR_ERR(em);
3017 goto out;
3018 }
3019 emflags = em->flags;
3020 }
3021
3022 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3023 flags |= FIEMAP_EXTENT_LAST; 3100 flags |= FIEMAP_EXTENT_LAST;
3024 end = 1; 3101 end = 1;
3025 } 3102 }
3026 3103
3027 if (em_start == last) { 3104 /* now scan forward to see if this is really the last extent. */
3105 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3106 get_extent);
3107 if (IS_ERR(em)) {
3108 ret = PTR_ERR(em);
3109 goto out;
3110 }
3111 if (!em) {
3028 flags |= FIEMAP_EXTENT_LAST; 3112 flags |= FIEMAP_EXTENT_LAST;
3029 end = 1; 3113 end = 1;
3030 } 3114 }
3031 3115 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3032 if (!hole) { 3116 em_len, flags);
3033 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3117 if (ret)
3034 em_len, flags); 3118 goto out_free;
3035 if (ret)
3036 goto out_free;
3037 }
3038 } 3119 }
3039out_free: 3120out_free:
3040 free_extent_map(em); 3121 free_extent_map(em);
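count_range_bits() gains a contig flag above so the fiemap path can measure only a contiguous run of matching bytes, stopping at the first gap instead of summing every matching range. The sketch below reproduces that stop-at-the-first-hole rule over a flat array of flagged ranges; the types are made up and stand in for the extent-state tree.

```c
#include <stdio.h>

struct range { long long start, end; int flagged; };

/*
 * Sum flagged bytes starting at *start.  With contig set, stop at the
 * first gap or unflagged range, the way the fiemap caller wants it.
 */
static long long count_bits_sketch(const struct range *r, int n,
                                   long long *start, int contig)
{
    long long total = 0, last = -1;
    int found = 0;

    for (int i = 0; i < n; i++) {
        if (r[i].end < *start)
            continue;
        if (contig && found && r[i].start > last + 1)
            break;                       /* hole between ranges */
        if (r[i].flagged) {
            if (!found) { *start = r[i].start; found = 1; }
            total += r[i].end - r[i].start + 1;
            last = r[i].end;
        } else if (contig && found) {
            break;                       /* flags stopped matching */
        }
    }
    return total;
}

int main(void)
{
    struct range r[] = { {0, 9, 1}, {10, 19, 1}, {30, 39, 1} };
    long long start = 0;
    long long n = count_bits_sketch(r, 3, &start, 1);
    printf("contig run starts at %lld, %lld bytes\n", start, n);  /* 0, 20 */
    return 0;
}
```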
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..9318dfefd59c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
191 191
192u64 count_range_bits(struct extent_io_tree *tree, 192u64 count_range_bits(struct extent_io_tree *tree,
193 u64 *start, u64 search_end, 193 u64 *start, u64 search_end,
194 u64 max_bytes, unsigned long bits); 194 u64 max_bytes, unsigned long bits, int contig);
195 195
196void free_extent_state(struct extent_state *state); 196void free_extent_state(struct extent_state *state);
197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7084140d5940..f447b783bb84 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
70 70
71 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
72 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
73 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
74 write_bytes -= copied; 87 write_bytes -= copied;
75 total_copied += copied; 88 total_copied += copied;
@@ -763,6 +776,27 @@ out:
763} 776}
764 777
765/* 778/*
779 * on error we return an unlocked page and the error value
780 * on success we return a locked page and 0
781 */
782static int prepare_uptodate_page(struct page *page, u64 pos)
783{
784 int ret = 0;
785
786 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
787 ret = btrfs_readpage(NULL, page);
788 if (ret)
789 return ret;
790 lock_page(page);
791 if (!PageUptodate(page)) {
792 unlock_page(page);
793 return -EIO;
794 }
795 }
796 return 0;
797}
798
799/*
766 * this gets pages into the page cache and locks them down, it also properly 800 * this gets pages into the page cache and locks them down, it also properly
767 * waits for data=ordered extents to finish before allowing the pages to be 801 * waits for data=ordered extents to finish before allowing the pages to be
768 * modified. 802 * modified.
@@ -777,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
777 unsigned long index = pos >> PAGE_CACHE_SHIFT; 811 unsigned long index = pos >> PAGE_CACHE_SHIFT;
778 struct inode *inode = fdentry(file)->d_inode; 812 struct inode *inode = fdentry(file)->d_inode;
779 int err = 0; 813 int err = 0;
814 int faili = 0;
780 u64 start_pos; 815 u64 start_pos;
781 u64 last_pos; 816 u64 last_pos;
782 817
@@ -794,15 +829,24 @@ again:
794 for (i = 0; i < num_pages; i++) { 829 for (i = 0; i < num_pages; i++) {
795 pages[i] = grab_cache_page(inode->i_mapping, index + i); 830 pages[i] = grab_cache_page(inode->i_mapping, index + i);
796 if (!pages[i]) { 831 if (!pages[i]) {
797 int c; 832 faili = i - 1;
798 for (c = i - 1; c >= 0; c--) { 833 err = -ENOMEM;
799 unlock_page(pages[c]); 834 goto fail;
800 page_cache_release(pages[c]); 835 }
801 } 836
802 return -ENOMEM; 837 if (i == 0)
838 err = prepare_uptodate_page(pages[i], pos);
839 if (i == num_pages - 1)
840 err = prepare_uptodate_page(pages[i],
841 pos + write_bytes);
842 if (err) {
843 page_cache_release(pages[i]);
844 faili = i - 1;
845 goto fail;
803 } 846 }
804 wait_on_page_writeback(pages[i]); 847 wait_on_page_writeback(pages[i]);
805 } 848 }
849 err = 0;
806 if (start_pos < inode->i_size) { 850 if (start_pos < inode->i_size) {
807 struct btrfs_ordered_extent *ordered; 851 struct btrfs_ordered_extent *ordered;
808 lock_extent_bits(&BTRFS_I(inode)->io_tree, 852 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -842,6 +886,14 @@ again:
842 WARN_ON(!PageLocked(pages[i])); 886 WARN_ON(!PageLocked(pages[i]));
843 } 887 }
844 return 0; 888 return 0;
889fail:
890 while (faili >= 0) {
891 unlock_page(pages[faili]);
892 page_cache_release(pages[faili]);
893 faili--;
894 }
895 return err;
896
845} 897}
846 898
847static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 899static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -851,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
851 struct file *file = iocb->ki_filp; 903 struct file *file = iocb->ki_filp;
852 struct inode *inode = fdentry(file)->d_inode; 904 struct inode *inode = fdentry(file)->d_inode;
853 struct btrfs_root *root = BTRFS_I(inode)->root; 905 struct btrfs_root *root = BTRFS_I(inode)->root;
854 struct page *pinned[2];
855 struct page **pages = NULL; 906 struct page **pages = NULL;
856 struct iov_iter i; 907 struct iov_iter i;
857 loff_t *ppos = &iocb->ki_pos; 908 loff_t *ppos = &iocb->ki_pos;
@@ -872,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
872 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
873 (file->f_flags & O_DIRECT)); 924 (file->f_flags & O_DIRECT));
874 925
875 pinned[0] = NULL;
876 pinned[1] = NULL;
877
878 start_pos = pos; 926 start_pos = pos;
879 927
880 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -962,32 +1010,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
962 first_index = pos >> PAGE_CACHE_SHIFT; 1010 first_index = pos >> PAGE_CACHE_SHIFT;
963 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
964 1012
965 /*
966 * there are lots of better ways to do this, but this code
967 * makes sure the first and last page in the file range are
968 * up to date and ready for cow
969 */
970 if ((pos & (PAGE_CACHE_SIZE - 1))) {
971 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
972 if (!PageUptodate(pinned[0])) {
973 ret = btrfs_readpage(NULL, pinned[0]);
974 BUG_ON(ret);
975 wait_on_page_locked(pinned[0]);
976 } else {
977 unlock_page(pinned[0]);
978 }
979 }
980 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
981 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
982 if (!PageUptodate(pinned[1])) {
983 ret = btrfs_readpage(NULL, pinned[1]);
984 BUG_ON(ret);
985 wait_on_page_locked(pinned[1]);
986 } else {
987 unlock_page(pinned[1]);
988 }
989 }
990
991 while (iov_iter_count(&i) > 0) { 1013 while (iov_iter_count(&i) > 0) {
992 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
993 size_t write_bytes = min(iov_iter_count(&i), 1015 size_t write_bytes = min(iov_iter_count(&i),
@@ -1024,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1024 1046
1025 copied = btrfs_copy_from_user(pos, num_pages, 1047 copied = btrfs_copy_from_user(pos, num_pages,
1026 write_bytes, pages, &i); 1048 write_bytes, pages, &i);
1027 dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> 1049
1028 PAGE_CACHE_SHIFT; 1050 /*
1051 * if we have trouble faulting in the pages, fall
1052 * back to one page at a time
1053 */
1054 if (copied < write_bytes)
1055 nrptrs = 1;
1056
1057 if (copied == 0)
1058 dirty_pages = 0;
1059 else
1060 dirty_pages = (copied + offset +
1061 PAGE_CACHE_SIZE - 1) >>
1062 PAGE_CACHE_SHIFT;
1029 1063
1030 if (num_pages > dirty_pages) { 1064 if (num_pages > dirty_pages) {
1031 if (copied > 0) 1065 if (copied > 0)
@@ -1069,10 +1103,6 @@ out:
1069 err = ret; 1103 err = ret;
1070 1104
1071 kfree(pages); 1105 kfree(pages);
1072 if (pinned[0])
1073 page_cache_release(pinned[0]);
1074 if (pinned[1])
1075 page_cache_release(pinned[1]);
1076 *ppos = pos; 1106 *ppos = pos;
1077 1107
1078 /* 1108 /*
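The file.c changes above reject a short copy_from_user() into a page that is not yet up to date (copied is forced back to 0) and shrink the batch to a single page for the next iteration, so the write path never has to deal with partially up-to-date pages. A userspace sketch of that fallback policy follows; the function and parameter names are illustrative only.

```c
#include <stdio.h>

#define PAGE_SIZE 4096

/*
 * Simulate one write iteration: if the user copy came up short while
 * the page was not already up to date, pretend nothing was copied and
 * drop to a single page per batch for the next round.
 */
static long write_iteration(long want, long copied, int page_uptodate,
                            int *batch_pages)
{
    if (!page_uptodate && copied < want)
        copied = 0;              /* retry rather than leave a partial page */
    if (copied < want)
        *batch_pages = 1;        /* fault-in trouble: go page at a time */
    return copied;
}

int main(void)
{
    int batch = 8;
    long got;

    got = write_iteration(3 * PAGE_SIZE, 2 * PAGE_SIZE, 0, &batch);
    printf("short copy into cold page: copied=%ld, next batch=%d pages\n",
           got, batch);

    got = write_iteration(PAGE_SIZE, PAGE_SIZE, 1, &batch);
    printf("full copy: copied=%ld, next batch=%d pages\n", got, batch);
    return 0;
}
```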
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb9bd7832b6d..512c3d1da083 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 90 unsigned long *nr_written, int unlock);
91 91
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 93 struct inode *inode, struct inode *dir,
94 const struct qstr *qstr)
94{ 95{
95 int err; 96 int err;
96 97
97 err = btrfs_init_acl(trans, inode, dir); 98 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 99 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 100 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 101 return err;
101} 102}
102 103
@@ -1913,7 +1914,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1913 1914
1914 private = 0; 1915 private = 0;
1915 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1916 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1916 (u64)-1, 1, EXTENT_DIRTY)) { 1917 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1917 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1918 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1918 start, &private_failure); 1919 start, &private_failure);
1919 if (ret == 0) { 1920 if (ret == 0) {
@@ -4704,7 +4705,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4704 if (IS_ERR(inode)) 4705 if (IS_ERR(inode))
4705 goto out_unlock; 4706 goto out_unlock;
4706 4707
4707 err = btrfs_init_inode_security(trans, inode, dir); 4708 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4708 if (err) { 4709 if (err) {
4709 drop_inode = 1; 4710 drop_inode = 1;
4710 goto out_unlock; 4711 goto out_unlock;
@@ -4765,7 +4766,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4765 if (IS_ERR(inode)) 4766 if (IS_ERR(inode))
4766 goto out_unlock; 4767 goto out_unlock;
4767 4768
4768 err = btrfs_init_inode_security(trans, inode, dir); 4769 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4769 if (err) { 4770 if (err) {
4770 drop_inode = 1; 4771 drop_inode = 1;
4771 goto out_unlock; 4772 goto out_unlock;
@@ -4806,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4806 int err; 4807 int err;
4807 int drop_inode = 0; 4808 int drop_inode = 0;
4808 4809
4809 if (inode->i_nlink == 0)
4810 return -ENOENT;
4811
4812 /* do not allow sys_link's with other subvols of the same device */ 4810 /* do not allow sys_link's with other subvols of the same device */
4813 if (root->objectid != BTRFS_I(inode)->root->objectid) 4811 if (root->objectid != BTRFS_I(inode)->root->objectid)
4814 return -EPERM; 4812 return -EPERM;
@@ -4821,10 +4819,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4821 goto fail; 4819 goto fail;
4822 4820
4823 /* 4821 /*
4824 * 1 item for inode ref 4822 * 2 items for inode and inode ref
4825 * 2 items for dir items 4823 * 2 items for dir items
4824 * 1 item for parent inode
4826 */ 4825 */
4827 trans = btrfs_start_transaction(root, 3); 4826 trans = btrfs_start_transaction(root, 5);
4828 if (IS_ERR(trans)) { 4827 if (IS_ERR(trans)) {
4829 err = PTR_ERR(trans); 4828 err = PTR_ERR(trans);
4830 goto fail; 4829 goto fail;
@@ -4893,7 +4892,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4893 4892
4894 drop_on_err = 1; 4893 drop_on_err = 1;
4895 4894
4896 err = btrfs_init_inode_security(trans, inode, dir); 4895 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4897 if (err) 4896 if (err)
4898 goto out_fail; 4897 goto out_fail;
4899 4898
@@ -5280,6 +5279,128 @@ out:
 	return em;
 }
 
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create)
+{
+	struct extent_map *em;
+	struct extent_map *hole_em = NULL;
+	u64 range_start = start;
+	u64 end;
+	u64 found;
+	u64 found_end;
+	int err = 0;
+
+	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+	if (IS_ERR(em))
+		return em;
+	if (em) {
+		/*
+		 * if our em maps to a hole, there might
+		 * actually be delalloc bytes behind it
+		 */
+		if (em->block_start != EXTENT_MAP_HOLE)
+			return em;
+		else
+			hole_em = em;
+	}
+
+	/* check to see if we've wrapped (len == -1 or similar) */
+	end = start + len;
+	if (end < start)
+		end = (u64)-1;
+	else
+		end -= 1;
+
+	em = NULL;
+
+	/* ok, we didn't find anything, lets look for delalloc */
+	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+				 end, len, EXTENT_DELALLOC, 1);
+	found_end = range_start + found;
+	if (found_end < range_start)
+		found_end = (u64)-1;
+
+	/*
+	 * we didn't find anything useful, return
+	 * the original results from get_extent()
+	 */
+	if (range_start > end || found_end <= start) {
+		em = hole_em;
+		hole_em = NULL;
+		goto out;
+	}
+
+	/* adjust the range_start to make sure it doesn't
+	 * go backwards from the start they passed in
+	 */
+	range_start = max(start,range_start);
+	found = found_end - range_start;
+
+	if (found > 0) {
+		u64 hole_start = start;
+		u64 hole_len = len;
+
+		em = alloc_extent_map(GFP_NOFS);
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * when btrfs_get_extent can't find anything it
+		 * returns one huge hole
+		 *
+		 * make sure what it found really fits our range, and
+		 * adjust to make sure it is based on the start from
+		 * the caller
+		 */
+		if (hole_em) {
+			u64 calc_end = extent_map_end(hole_em);
+
+			if (calc_end <= start || (hole_em->start > end)) {
+				free_extent_map(hole_em);
+				hole_em = NULL;
+			} else {
+				hole_start = max(hole_em->start, start);
+				hole_len = calc_end - hole_start;
+			}
+		}
+		em->bdev = NULL;
+		if (hole_em && range_start > hole_start) {
+			/* our hole starts before our delalloc, so we
+			 * have to return just the parts of the hole
+			 * that go until the delalloc starts
+			 */
+			em->len = min(hole_len,
+				      range_start - hole_start);
+			em->start = hole_start;
+			em->orig_start = hole_start;
+			/*
+			 * don't adjust block start at all,
+			 * it is fixed at EXTENT_MAP_HOLE
+			 */
+			em->block_start = hole_em->block_start;
+			em->block_len = hole_len;
+		} else {
+			em->start = range_start;
+			em->len = found;
+			em->orig_start = range_start;
+			em->block_start = EXTENT_MAP_DELALLOC;
+			em->block_len = found;
+		}
+	} else if (hole_em) {
+		return hole_em;
+	}
+out:
+
+	free_extent_map(hole_em);
+	if (err) {
+		free_extent_map(em);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
@@ -5934,6 +6055,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	if (!skip_sum) {
 		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
 		if (!dip->csums) {
+			kfree(dip);
 			ret = -ENOMEM;
 			goto free_ordered;
 		}
@@ -6102,7 +6224,7 @@ out:
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
-	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
@@ -6982,7 +7104,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(trans, inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
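The effect of wiring btrfs_fiemap() to the new btrfs_get_extent_fiemap() is visible to any FIEMAP caller: delalloc ranges that still live only in memory get reported instead of appearing as holes. A rough userspace probe of that behaviour, using only the generic FIEMAP ioctl (nothing btrfs-specific is assumed here):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
    struct fiemap *fm;
    unsigned int i;
    int fd;

    if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
        return 1;

    /* header plus room for 32 extent records */
    fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
    if (!fm)
        return 1;
    fm->fm_start = 0;
    fm->fm_length = ~0ULL;        /* map the whole file */
    fm->fm_extent_count = 32;     /* leave FIEMAP_FLAG_SYNC off so delalloc is not flushed first */

    if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
        perror("FS_IOC_FIEMAP");
        return 1;
    }
    for (i = 0; i < fm->fm_mapped_extents; i++)
        printf("extent %u: logical %llu len %llu flags 0x%x\n", i,
               (unsigned long long)fm->fm_extents[i].fe_logical,
               (unsigned long long)fm->fm_extents[i].fe_length,
               fm->fm_extents[i].fe_flags);
    free(fm);
    return 0;
}

With the patch applied, data written but not yet allocated should be reported with the DELALLOC/UNKNOWN bits set in fe_flags rather than being silently skipped.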
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index be2d4f6aaa5e..5fdb2abc4fa7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
 
-	if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+	if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
 		return -EINVAL;
 
 	if (flags & ~BTRFS_SUBVOL_RDONLY)
 		return -EOPNOTSUPP;
 
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
 	down_write(&root->fs_info->subvol_sem);
 
 	/* nothing to do */
@@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 		goto out_reset;
 	}
 
-	ret = btrfs_update_root(trans, root,
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
 
 	btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	unsigned long tot_out;
 	unsigned long tot_len;
 	char *buf;
+	bool may_late_unmap, need_unmap;
 
 	data_in = kmap(pages_in[0]);
 	tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
 
 	tot_in += in_len;
 	working_bytes = in_len;
+	may_late_unmap = need_unmap = false;
 
 	/* fast path: avoid using the working buffer */
 	if (in_page_bytes_left >= in_len) {
 		buf = data_in + in_offset;
 		bytes = in_len;
+		may_late_unmap = true;
 		goto cont;
 	}
 
@@ -329,14 +332,17 @@ cont:
 		if (working_bytes == 0 && tot_in >= tot_len)
 			break;
 
-		kunmap(pages_in[page_in_index]);
-		page_in_index++;
-		if (page_in_index >= total_pages_in) {
+		if (page_in_index + 1 >= total_pages_in) {
 			ret = -1;
-			data_in = NULL;
 			goto done;
 		}
-		data_in = kmap(pages_in[page_in_index]);
+
+		if (may_late_unmap)
+			need_unmap = true;
+		else
+			kunmap(pages_in[page_in_index]);
+
+		data_in = kmap(pages_in[++page_in_index]);
 
 		in_page_bytes_left = PAGE_CACHE_SIZE;
 		in_offset = 0;
@@ -346,6 +352,8 @@ cont:
 		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
+		if (need_unmap)
+			kunmap(pages_in[page_in_index - 1]);
 		if (ret != LZO_E_OK) {
 			printk(KERN_WARNING "btrfs decompress failed\n");
 			ret = -1;
@@ -363,8 +371,7 @@ cont:
 		break;
 	}
 done:
-	if (data_in)
-		kunmap(pages_in[page_in_index]);
+	kunmap(pages_in[page_in_index]);
 	return ret;
 }
 
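The bool pair added above implements a simple rule: when the compressed record is consumed straight out of the current page, the unmap of that page must wait until the data has actually been used. A toy userspace rendition of the same bookkeeping (fake 4 KiB pages, map()/unmap() standing in for kmap()/kunmap(), no real decompression):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

static char pages[2][PAGE_SIZE];

static char *map(int i)  { printf("map page %d\n", i); return pages[i]; }
static void unmap(int i) { printf("unmap page %d\n", i); }

/* consume 'len' bytes starting at 'off' in page 'pg' */
static void consume(int pg, size_t off, size_t len, char *bounce)
{
    char *data = map(pg);
    char *buf;
    int need_unmap = 0;

    if (off + len <= PAGE_SIZE) {
        /* fast path: use the data in place, defer the unmap until
         * after it has been consumed */
        buf = data + off;
        need_unmap = 1;
    } else {
        /* slow path: copy into a bounce buffer, so the mappings can
         * be dropped immediately */
        size_t first = PAGE_SIZE - off;
        memcpy(bounce, data + off, first);
        unmap(pg);
        data = map(pg + 1);
        memcpy(bounce + first, data, len - first);
        unmap(pg + 1);
        buf = bounce;
    }

    /* ... decompress/consume buf[0..len) here ... */
    (void)buf;

    if (need_unmap)
        unmap(pg);
}

int main(void)
{
    char bounce[2 * PAGE_SIZE];

    consume(0, 100, 200, bounce);               /* fast path */
    consume(0, PAGE_SIZE - 50, 200, bounce);    /* record spans two pages */
    return 0;
}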
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0825e4ed9447..31ade5802ae8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	u32 item_size;
 	int ret;
 	int err = 0;
+	int progress = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	}
 
 	while (1) {
+		progress++;
 		trans = btrfs_start_transaction(rc->extent_root, 0);
 		BUG_ON(IS_ERR(trans));
-
+restart:
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
 			continue;
@@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 			}
 		}
 	}
+	if (trans && progress && err == -ENOSPC) {
+		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+					      rc->block_group->flags);
+		if (ret == 0) {
+			err = 0;
+			progress = 0;
+			goto restart;
+		}
+	}
 
 	btrfs_release_path(rc->extent_root, path);
 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a004008f7d28..d39a9895d932 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
+	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+	Opt_enospc_debug, Opt_err,
 };
160 161
161static match_table_t tokens = { 162static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
 	{Opt_space_cache, "space_cache"},
 	{Opt_clear_cache, "clear_cache"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_err, NULL},
 };
189 191
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_user_subvol_rm_allowed:
 			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
 			break;
+		case Opt_enospc_debug:
+			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index af7dbca15276..dd13eb81ee40 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1338,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	device->in_fs_metadata = 0;
 
@@ -1416,6 +1416,13 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
+error_undo:
+	if (device->writeable) {
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
+		root->fs_info->fs_devices->rw_devices++;
+	}
+	goto error_brelse;
 }
 
 /*
@@ -1633,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
-	device->mode = 0;
+	device->mode = FMODE_EXCL;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..d779cefcfd7d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -370,7 +370,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-			      struct inode *inode, struct inode *dir)
+			      struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr)
 {
 	int err;
 	size_t len;
@@ -378,7 +379,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
 	char *suffix;
 	char *name;
 
-	err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
+					   &len);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
 extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-				     struct inode *inode, struct inode *dir);
+				     struct inode *inode, struct inode *dir,
+				     const struct qstr *qstr);
 
 #endif /* __XATTR__ */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bfe..a0358c2189cb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
275 bool preemptive) 275 bool preemptive)
276{ 276{
277 struct dentry *grave, *trap; 277 struct dentry *grave, *trap;
278 struct path path, path_to_graveyard;
278 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
279 int ret; 280 int ret;
280 281
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 	/* non-directories can just be unlinked */
 	if (!S_ISDIR(rep->d_inode->i_mode)) {
 		_debug("unlink stale object");
-		ret = vfs_unlink(dir->d_inode, rep);
 
-		if (preemptive)
-			cachefiles_mark_object_buried(cache, rep);
+		path.mnt = cache->mnt;
+		path.dentry = dir;
+		ret = security_path_unlink(&path, rep);
+		if (ret < 0) {
+			cachefiles_io_error(cache, "Unlink security error");
+		} else {
+			ret = vfs_unlink(dir->d_inode, rep);
+
+			if (preemptive)
+				cachefiles_mark_object_buried(cache, rep);
+		}
 
 		mutex_unlock(&dir->d_inode->i_mutex);
 
@@ -379,12 +388,23 @@ try_again:
 	}
 
 	/* attempt the rename */
-	ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
-	if (ret != 0 && ret != -ENOMEM)
-		cachefiles_io_error(cache, "Rename failed with error %d", ret);
+	path.mnt = cache->mnt;
+	path.dentry = dir;
+	path_to_graveyard.mnt = cache->mnt;
+	path_to_graveyard.dentry = cache->graveyard;
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	if (ret < 0) {
+		cachefiles_io_error(cache, "Rename security error %d", ret);
+	} else {
+		ret = vfs_rename(dir->d_inode, rep,
+				 cache->graveyard->d_inode, grave);
+		if (ret != 0 && ret != -ENOMEM)
+			cachefiles_io_error(cache,
+					    "Rename failed with error %d", ret);
 
 		if (preemptive)
 			cachefiles_mark_object_buried(cache, rep);
+	}
 
 	unlock_rename(cache->graveyard, dir);
 	dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
448{ 468{
449 struct cachefiles_cache *cache; 469 struct cachefiles_cache *cache;
450 struct dentry *dir, *next = NULL; 470 struct dentry *dir, *next = NULL;
471 struct path path;
451 unsigned long start; 472 unsigned long start;
452 const char *name; 473 const char *name;
453 int ret, nlen; 474 int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
458 479
459 cache = container_of(parent->fscache.cache, 480 cache = container_of(parent->fscache.cache,
460 struct cachefiles_cache, cache); 481 struct cachefiles_cache, cache);
482 path.mnt = cache->mnt;
461 483
462 ASSERT(parent->dentry); 484 ASSERT(parent->dentry);
463 ASSERT(parent->dentry->d_inode); 485 ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
511 if (ret < 0) 533 if (ret < 0)
512 goto create_error; 534 goto create_error;
513 535
536 path.dentry = dir;
537 ret = security_path_mkdir(&path, next, 0);
538 if (ret < 0)
539 goto create_error;
514 start = jiffies; 540 start = jiffies;
515 ret = vfs_mkdir(dir->d_inode, next, 0); 541 ret = vfs_mkdir(dir->d_inode, next, 0);
516 cachefiles_hist(cachefiles_mkdir_histogram, start); 542 cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
536 if (ret < 0) 562 if (ret < 0)
537 goto create_error; 563 goto create_error;
538 564
565 path.dentry = dir;
566 ret = security_path_mknod(&path, next, S_IFREG, 0);
567 if (ret < 0)
568 goto create_error;
539 start = jiffies; 569 start = jiffies;
540 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); 570 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
541 cachefiles_hist(cachefiles_create_histogram, start); 571 cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
692{ 722{
693 struct dentry *subdir; 723 struct dentry *subdir;
694 unsigned long start; 724 unsigned long start;
725 struct path path;
695 int ret; 726 int ret;
696 727
697 _enter(",,%s", dirname); 728 _enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
719 750
720 _debug("attempt mkdir"); 751 _debug("attempt mkdir");
721 752
753 path.mnt = cache->mnt;
754 path.dentry = dir;
755 ret = security_path_mkdir(&path, subdir, 0700);
756 if (ret < 0)
757 goto mkdir_error;
722 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 758 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
723 if (ret < 0) 759 if (ret < 0)
724 goto mkdir_error; 760 goto mkdir_error;
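The pattern this file gains throughout is "consult the security hook, then perform the VFS operation". As a loose userspace analogue of that shape (the policy rule below is invented purely for illustration; only unlink(2) is a real call):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* pretend LSM: refuse to delete anything ending in ".keep" */
static int policy_may_unlink(const char *path)
{
    size_t n = strlen(path);
    return (n >= 5 && strcmp(path + n - 5, ".keep") == 0) ? -EPERM : 0;
}

static int checked_unlink(const char *path)
{
    int ret = policy_may_unlink(path);      /* hook first ... */
    if (ret < 0) {
        fprintf(stderr, "unlink %s: policy error %d\n", path, ret);
        return ret;
    }
    return unlink(path) ? -errno : 0;       /* ... then the real operation */
}

int main(void)
{
    printf("stale-object:   %d\n", checked_unlink("stale-object"));
    printf("important.keep: %d\n", checked_unlink("important.keep"));
    return 0;
}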
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f0aef787a102..ebafa65a29b6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,7 +60,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -410,7 +409,7 @@ more:
 	spin_lock(&inode->i_lock);
 	if (ci->i_release_count == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
-		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 		ci->i_max_offset = filp->f_pos;
 	}
 	spin_unlock(&inode->i_lock);
@@ -497,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
+	    ceph_snap(parent) == CEPH_NOSNAP &&
 	    strcmp(dentry->d_name.name,
 		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
@@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	dir = dentry->d_parent->d_inode;
@@ -1030,28 +1030,8 @@ out_touch:
1030static void ceph_dentry_release(struct dentry *dentry) 1030static void ceph_dentry_release(struct dentry *dentry)
1031{ 1031{
1032 struct ceph_dentry_info *di = ceph_dentry(dentry); 1032 struct ceph_dentry_info *di = ceph_dentry(dentry);
1033 struct inode *parent_inode = NULL;
1034 u64 snapid = CEPH_NOSNAP;
1035 1033
1036 if (!IS_ROOT(dentry)) { 1034 dout("dentry_release %p\n", dentry);
1037 parent_inode = di->parent_inode;
1038 if (parent_inode)
1039 snapid = ceph_snap(parent_inode);
1040 }
1041 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1042 if (parent_inode && snapid != CEPH_SNAPDIR) {
1043 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1044
1045 spin_lock(&parent_inode->i_lock);
1046 if (ci->i_shared_gen == di->lease_shared_gen ||
1047 snapid <= CEPH_MAXSNAP) {
1048 dout(" clearing %p complete (d_release)\n",
1049 parent_inode);
1050 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1051 ci->i_release_count++;
1052 }
1053 spin_unlock(&parent_inode->i_lock);
1054 }
1055 if (di) { 1035 if (di) {
1056 ceph_dentry_lru_del(dentry); 1036 ceph_dentry_lru_del(dentry);
1057 if (di->lease_session) 1037 if (di->lease_session)
@@ -1059,8 +1039,6 @@ static void ceph_dentry_release(struct dentry *dentry)
1059 kmem_cache_free(ceph_dentry_cachep, di); 1039 kmem_cache_free(ceph_dentry_cachep, di);
1060 dentry->d_fsdata = NULL; 1040 dentry->d_fsdata = NULL;
1061 } 1041 }
1062 if (parent_inode)
1063 iput(parent_inode);
1064} 1042}
1065 1043
1066static int ceph_snapdir_d_revalidate(struct dentry *dentry, 1044static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..193bfa5e9cbd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode,
 		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
-			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 			ci->i_max_offset = 2;
 		}
 		break;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 88fcaa21b801..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,7 +207,6 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
-	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..c6d31a3bab88 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs(buf, &tmp);
-		path_put(&path);
-	}
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	int error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 
 asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct path path;
+	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs64(buf, &tmp);
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
369 341
@@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -2308,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 
 #endif /* CONFIG_TIMERFD */
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
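The statfs conversion above boils down to "one helper fills the native buffer, a second converts it for the caller". The sketch below mirrors that shape with plain statfs(2)/fstatfs(2) standing in for the kernel-internal user_statfs()/fd_statfs() helpers, and a made-up reduced struct in place of the real compat layout:

#include <fcntl.h>
#include <stdio.h>
#include <sys/vfs.h>
#include <unistd.h>

struct compat_statfs { unsigned int f_type, f_bsize, f_blocks, f_bfree; };

static int put_compat(struct compat_statfs *dst, const struct statfs *src)
{
    dst->f_type   = src->f_type;
    dst->f_bsize  = src->f_bsize;
    dst->f_blocks = src->f_blocks;   /* may truncate on huge filesystems */
    dst->f_bfree  = src->f_bfree;
    return 0;
}

static int compat_statfs_path(const char *path, struct compat_statfs *buf)
{
    struct statfs tmp;
    int error = statfs(path, &tmp);      /* was: user_path + vfs_statfs */
    if (!error)
        error = put_compat(buf, &tmp);
    return error;
}

static int compat_statfs_fd(int fd, struct compat_statfs *buf)
{
    struct statfs tmp;
    int error = fstatfs(fd, &tmp);       /* was: fget + vfs_statfs + fput */
    if (!error)
        error = put_compat(buf, &tmp);
    return error;
}

int main(void)
{
    struct compat_statfs b;
    int fd = open("/", O_RDONLY);

    if (!compat_statfs_path("/", &b))
        printf("path: type 0x%x bsize %u\n", b.f_type, b.f_bsize);
    if (fd >= 0 && !compat_statfs_fd(fd, &b))
        printf("fd:   blocks %u free %u\n", b.f_blocks, b.f_bfree);
    return 0;
}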
diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae97..a39fe47c466f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(parent->d_lock)
 	__releases(dentry->d_inode->i_lock)
 {
-	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
+	/*
+	 * Inform try_to_ascend() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DISCONNECTED;
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1012} 1016}
1013 1017
1014/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1015 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1016 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1017 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
@@ -1066,24 +1099,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-		    (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -1181,24 +1200,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-		    (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1523} 1528}
1524EXPORT_SYMBOL(d_alloc_root); 1529EXPORT_SYMBOL(d_alloc_root);
1525 1530
1531static struct dentry * __d_find_any_alias(struct inode *inode)
1532{
1533 struct dentry *alias;
1534
1535 if (list_empty(&inode->i_dentry))
1536 return NULL;
1537 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1538 __dget(alias);
1539 return alias;
1540}
1541
1542static struct dentry * d_find_any_alias(struct inode *inode)
1543{
1544 struct dentry *de;
1545
1546 spin_lock(&inode->i_lock);
1547 de = __d_find_any_alias(inode);
1548 spin_unlock(&inode->i_lock);
1549 return de;
1550}
1551
1552
1526/** 1553/**
1527 * d_obtain_alias - find or allocate a dentry for a given inode 1554 * d_obtain_alias - find or allocate a dentry for a given inode
1528 * @inode: inode to allocate the dentry for 1555 * @inode: inode to allocate the dentry for
@@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	res = d_find_alias(inode);
+	res = d_find_any_alias(inode);
 	if (res)
 		goto out_iput;
 
@@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 
 
 	spin_lock(&inode->i_lock);
-	res = __d_find_alias(inode, 0);
+	res = __d_find_any_alias(inode);
 	if (res) {
 		spin_unlock(&inode->i_lock);
 		dput(tmp);
@@ -2920,28 +2947,14 @@ resume:
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
+		struct dentry *child = this_parent;
 		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
 			this_parent->d_flags |= DCACHE_GENOCIDE;
 			this_parent->d_count--;
 		}
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-		    (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
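Stripped of the locking, the idea behind try_to_ascend() is "remember a generation number, walk one level up, and restart the whole walk if anything moved or the node was detached while we looked away". A toy C11 sketch of just that validation step (the node type and the rename_gen counter are invented for illustration; the real code additionally re-checks d_parent after re-taking the parent's lock):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
    struct node *parent;
    bool disconnected;          /* set when the node leaves the tree */
};

static atomic_uint rename_gen;  /* bumped by every rename */

/* Walk one level up, but only trust the result if nothing changed. */
static struct node *try_to_ascend(struct node *n, unsigned int seq)
{
    if (n->disconnected || seq != atomic_load(&rename_gen))
        return NULL;            /* caller must restart the walk */
    return n->parent;
}

int main(void)
{
    struct node root = { NULL, false }, child = { &root, false };
    unsigned int seq = atomic_load(&rename_gen);

    printf("ascend ok:    %s\n", try_to_ascend(&child, seq) ? "yes" : "no");

    atomic_fetch_add(&rename_gen, 1);   /* pretend a rename happened */
    printf("after rename: %s\n",
           try_to_ascend(&child, seq) ? "yes" : "no (retry)");
    return 0;
}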
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c1222..e7a7a2f07324 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
13 * 13 *
14 */ 14 */
15 15
16/* uncomment to get debug messages from the debug filesystem, ah the irony. */
17/* #define DEBUG */
18
19#include <linux/module.h> 16#include <linux/module.h>
20#include <linux/fs.h> 17#include <linux/fs.h>
21#include <linux/mount.h> 18#include <linux/mount.h>
@@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
-static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
+static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
@@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 			dput(dentry);
 		}
 	}
+	return ret;
 }
 
338/** 336/**
@@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
351void debugfs_remove(struct dentry *dentry) 349void debugfs_remove(struct dentry *dentry)
352{ 350{
353 struct dentry *parent; 351 struct dentry *parent;
354 352 int ret;
353
355 if (!dentry) 354 if (!dentry)
356 return; 355 return;
357 356
@@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
 		return;
 
 	mutex_lock(&parent->d_inode->i_mutex);
-	__debugfs_remove(dentry, parent);
+	ret = __debugfs_remove(dentry, parent);
 	mutex_unlock(&parent->d_inode->i_mutex);
-	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	if (!ret)
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
@@ -540,17 +540,5 @@ static int __init debugfs_init(void)
540 540
541 return retval; 541 return retval;
542} 542}
543
544static void __exit debugfs_exit(void)
545{
546 debugfs_registered = false;
547
548 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
549 unregister_filesystem(&debug_fs_type);
550 kobject_put(debug_kobj);
551}
552
553core_initcall(debugfs_init); 543core_initcall(debugfs_init);
554module_exit(debugfs_exit);
555MODULE_LICENSE("GPL");
556 544
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 267d0ada4541..4a09af9e9a63 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -63,6 +63,13 @@
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
 
@@ -1198,6 +1208,62 @@ retry:
1198 return res; 1208 return res;
1199} 1209}
1200 1210
1211/**
1212 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1213 * API, to verify that adding an epoll file inside another
1214 * epoll structure, does not violate the constraints, in
1215 * terms of closed loops, or too deep chains (which can
1216 * result in excessive stack usage).
1217 *
1218 * @priv: Pointer to the epoll file to be currently checked.
1219 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1220 * data structure pointer.
1221 * @call_nests: Current dept of the @ep_call_nested() call stack.
1222 *
1223 * Returns: Returns zero if adding the epoll @file inside current epoll
1224 * structure @ep does not violate the constraints, or -1 otherwise.
1225 */
1226static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1227{
1228 int error = 0;
1229 struct file *file = priv;
1230 struct eventpoll *ep = file->private_data;
1231 struct rb_node *rbp;
1232 struct epitem *epi;
1233
1234 mutex_lock(&ep->mtx);
1235 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1236 epi = rb_entry(rbp, struct epitem, rbn);
1237 if (unlikely(is_file_epoll(epi->ffd.file))) {
1238 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1239 ep_loop_check_proc, epi->ffd.file,
1240 epi->ffd.file->private_data, current);
1241 if (error != 0)
1242 break;
1243 }
1244 }
1245 mutex_unlock(&ep->mtx);
1246
1247 return error;
1248}
1249
1250/**
1251 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1252 * another epoll file (represented by @ep) does not create
1253 * closed loops or too deep chains.
1254 *
1255 * @ep: Pointer to the epoll private data structure.
1256 * @file: Pointer to the epoll file to be checked.
1257 *
1258 * Returns: Returns zero if adding the epoll @file inside current epoll
1259 * structure @ep does not violate the constraints, or -1 otherwise.
1260 */
1261static int ep_loop_check(struct eventpoll *ep, struct file *file)
1262{
1263 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1264 ep_loop_check_proc, file, ep, current);
1265}
1266
1201/* 1267/*
1202 * Open an eventpoll file descriptor. 1268 * Open an eventpoll file descriptor.
1203 */ 1269 */
@@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
+	int did_lock_epmutex = 0;
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 */
 	ep = file->private_data;
 
+	/*
+	 * When we insert an epoll file descriptor, inside another epoll file
+	 * descriptor, there is the change of creating closed loops, which are
+	 * better be handled here, than in more critical paths.
+	 *
+	 * We hold epmutex across the loop check and the insert in this case, in
+	 * order to prevent two separate inserts from racing and each doing the
+	 * insert "at the same time" such that ep_loop_check passes on both
+	 * before either one does the insert, thereby creating a cycle.
+	 */
+	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+		mutex_lock(&epmutex);
+		did_lock_epmutex = 1;
+		error = -ELOOP;
+		if (ep_loop_check(ep, tfile) != 0)
+			goto error_tgt_fput;
+	}
+
+
 	mutex_lock(&ep->mtx);
 
 	/*
@@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
+	if (unlikely(did_lock_epmutex))
+		mutex_unlock(&epmutex);
+
 	fput(tfile);
 error_fput:
 	fput(file);
@@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 	BUG_ON(max_user_watches < 0);
 
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
 
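The new loop check is directly observable from userspace: once an epoll fd has been added to another, trying to close the cycle in the other direction should now fail at EPOLL_CTL_ADD time. A small stand-alone probe (plain epoll API, no kernel internals assumed):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>

int main(void)
{
    int a = epoll_create1(0);
    int b = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN };

    if (a < 0 || b < 0) {
        perror("epoll_create1");
        return 1;
    }

    ev.data.fd = a;
    if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0)    /* A inside B: fine */
        perror("add a->b");

    ev.data.fd = b;
    if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev) < 0)    /* would close the loop */
        printf("add b->a rejected: %s (expect ELOOP)\n", strerror(errno));
    else
        printf("add b->a accepted (kernel without the loop check)\n");
    return 0;
}

On kernels predating this change the second add is accepted, which is exactly the cycle the epmutex-protected walk is meant to rule out.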
diff --git a/fs/exec.c b/fs/exec.c
index 52a447d9b6ab..ba99e1abb1aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
 {
 	struct file *file;
 	int err;
+	static const struct open_flags open_exec_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
-	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 	if (IS_ERR(file))
 		goto out;
 
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page); 272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
273 if (!new_de) 273 if (!new_de)
274 goto out_dir; 274 goto out_dir;
275 inode_inc_link_count(old_inode);
276 err = exofs_set_link(new_dir, new_de, new_page, old_inode); 275 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
277 new_inode->i_ctime = CURRENT_TIME; 276 new_inode->i_ctime = CURRENT_TIME;
278 if (dir_de) 277 if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = exofs_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_inode->i_ctime = CURRENT_TIME;
 
 	exofs_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
 
-	if (len < 2 || (connectable && len < 4))
+	if (connectable && (len < 4)) {
+		*max_len = 4;
+		return 255;
+	} else if (len < 2) {
+		*max_len = 2;
 		return 255;
+	}
 
 	len = 2;
 	fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
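The *max_len handshake above (report the size actually needed and return 255) is what ultimately surfaces to userspace as the EOVERFLOW retry loop of name_to_handle_at(2), the syscall added elsewhere in this series (fs/fhandle.c). A sketch of that loop, assuming a libc new enough to expose the wrapper:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/";
    struct file_handle probe = { .handle_bytes = 0 };
    struct file_handle *fh;
    int mount_id;

    /* first call is only a size probe: it must fail with EOVERFLOW */
    if (name_to_handle_at(AT_FDCWD, path, &probe, &mount_id, 0) == 0 ||
        errno != EOVERFLOW) {
        perror("name_to_handle_at(probe)");
        return 1;
    }

    /* the kernel reported how big the handle really is; retry */
    fh = malloc(sizeof(*fh) + probe.handle_bytes);
    if (!fh)
        return 1;
    fh->handle_bytes = probe.handle_bytes;

    if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) < 0) {
        perror("name_to_handle_at");
        return 1;
    }
    printf("%s: handle type %d, %u bytes, mount id %d\n",
           path, fh->handle_type, fh->handle_bytes, mount_id);
    free(fh);
    return 0;
}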
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf326..1b48c3370872 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); 110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
111 111
112/* ialloc.c */ 112/* ialloc.c */
113extern struct inode * ext2_new_inode (struct inode *, int); 113extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
114extern void ext2_free_inode (struct inode *); 114extern void ext2_free_inode (struct inode *);
115extern unsigned long ext2_count_free_inodes (struct super_block *); 115extern unsigned long ext2_count_free_inodes (struct super_block *);
116extern void ext2_check_inodes_bitmap (struct super_block *); 116extern void ext2_check_inodes_bitmap (struct super_block *);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabff..ee9ed31948e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
429 return group; 429 return group;
430} 430}
431 431
432struct inode *ext2_new_inode(struct inode *dir, int mode) 432struct inode *ext2_new_inode(struct inode *dir, int mode,
433 const struct qstr *qstr)
433{ 434{
434 struct super_block *sb; 435 struct super_block *sb;
435 struct buffer_head *bitmap_bh = NULL; 436 struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
585 if (err) 586 if (err)
586 goto fail_free_drop; 587 goto fail_free_drop;
587 588
588 err = ext2_init_security(inode,dir); 589 err = ext2_init_security(inode, dir, qstr);
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..ed5c5d496ee9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
104 104
105 dquot_initialize(dir); 105 dquot_initialize(dir);
106 106
107 inode = ext2_new_inode(dir, mode); 107 inode = ext2_new_inode(dir, mode, &dentry->d_name);
108 if (IS_ERR(inode)) 108 if (IS_ERR(inode))
109 return PTR_ERR(inode); 109 return PTR_ERR(inode);
110 110
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
133 133
134 dquot_initialize(dir); 134 dquot_initialize(dir);
135 135
136 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode, &dentry->d_name);
137 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
138 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
139 init_special_inode(inode, inode->i_mode, rdev); 139 init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
159 159
160 dquot_initialize(dir); 160 dquot_initialize(dir);
161 161
162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
163 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
164 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
165 goto out; 165 goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
230 230
231 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
232 232
233 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
234 err = PTR_ERR(inode); 234 err = PTR_ERR(inode);
235 if (IS_ERR(inode)) 235 if (IS_ERR(inode))
236 goto out_dir; 236 goto out_dir;
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= EXT2_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ext2_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
 
 	ext2_delete_entry (old_de, old_page);
-	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
 		if (old_dir != new_dir)
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c2184616..5e41cccff762 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
116# endif /* CONFIG_EXT2_FS_XATTR */ 116# endif /* CONFIG_EXT2_FS_XATTR */
117 117
118#ifdef CONFIG_EXT2_FS_SECURITY 118#ifdef CONFIG_EXT2_FS_SECURITY
119extern int ext2_init_security(struct inode *inode, struct inode *dir); 119extern int ext2_init_security(struct inode *inode, struct inode *dir,
120 const struct qstr *qstr);
120#else 121#else
121static inline int ext2_init_security(struct inode *inode, struct inode *dir) 122static inline int ext2_init_security(struct inode *inode, struct inode *dir,
123 const struct qstr *qstr)
122{ 124{
123 return 0; 125 return 0;
124} 126}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da5..5d979b4347b0 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
47} 47}
48 48
49int 49int
50ext2_init_security(struct inode *inode, struct inode *dir) 50ext2_init_security(struct inode *inode, struct inode *dir,
51 const struct qstr *qstr)
51{ 52{
52 int err; 53 int err;
53 size_t len; 54 size_t len;
54 void *value; 55 void *value;
55 char *name; 56 char *name;
56 57
57 err = security_inode_init_security(inode, dir, &name, &value, &len); 58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
58 if (err) { 59 if (err) {
59 if (err == -EOPNOTSUPP) 60 if (err == -EOPNOTSUPP)
60 return 0; 61 return 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..153242187fce 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
 		spin_unlock(sb_bgl_lock(sbi, group));
 		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
 
+		free_blocks -= next - start;
 		/* Do not issue a TRIM on extents smaller than minblocks */
 		if ((next - start) < minblocks)
 			goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
 		cond_resched();
 
 		/* No more suitable extents */
-		if ((free_blocks - count) < minblocks)
+		if (free_blocks < minblocks)
 			break;
 	}
 
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
 	int ret = 0;
 
-	start = range->start >> sb->s_blocksize_bits;
+	start = (range->start >> sb->s_blocksize_bits) +
+		le32_to_cpu(es->s_first_data_block);
 	len = range->len >> sb->s_blocksize_bits;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 	trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2099 return -EINVAL; 2101 return -EINVAL;
2100 if (start >= max_blks) 2102 if (start >= max_blks)
2101 goto out; 2103 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks) 2104 if (start + len > max_blks)
2107 len = max_blks - start; 2105 len = max_blks - start;
2108 2106
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2129 if (free_blocks < minlen) 2127 if (free_blocks < minlen)
2130 continue; 2128 continue;
2131 2129
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb)) 2130 /*
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); 2131 * For all the groups except the last one, last block will
2134 else 2132 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133 * change it for the last group in which case first_block +
2134 * len < EXT3_BLOCKS_PER_GROUP(sb).
2135 */
2136 if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2135 last_block = first_block + len; 2137 last_block = first_block + len;
2138 len -= last_block - first_block;
2136 2139
2137 ret = ext3_trim_all_free(sb, group, first_block, 2140 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen); 2141 last_block, minlen);
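
The ext3_trim_fs()/ext3_trim_all_free() fixes above adjust how the byte-based FITRIM range supplied by userspace is mapped onto block groups (start is now taken relative to s_first_data_block, and free_blocks is tracked as extents are trimmed). For reference, a minimal userspace sketch of the ioctl being serviced; the mount-point path is a placeholder, and kernels or filesystems without FITRIM support simply return EOPNOTSUPP.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(void)
{
        struct fstrim_range range;
        int fd = open("/mnt/ext3", O_RDONLY);   /* placeholder mount point */

        if (fd < 0)
                return 1;
        memset(&range, 0, sizeof(range));
        range.start = 0;                /* bytes, relative to the filesystem */
        range.len = UINT64_MAX;         /* trim everything that is free */
        range.minlen = 1024 * 1024;     /* skip free extents smaller than 1 MiB */
        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                return 1;
        }
        /* the kernel writes back how many bytes were actually discarded */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        return 0;
}
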
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef22460..bfc2dc43681d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
404 * For other inodes, search forward from the parent directory's block 404 * For other inodes, search forward from the parent directory's block
405 * group to find a free inode. 405 * group to find a free inode.
406 */ 406 */
407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) 407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
408 const struct qstr *qstr, int mode)
408{ 409{
409 struct super_block *sb; 410 struct super_block *sb;
410 struct buffer_head *bitmap_bh = NULL; 411 struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
592 err = ext3_init_security(handle,inode, dir); 593 err = ext3_init_security(handle, inode, dir, qstr);
593 if (err) 594 if (err)
594 goto fail_free_drop; 595 goto fail_free_drop;
595 596
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1540 goto cleanup; 1540 goto cleanup;
1541 node2 = (struct dx_node *)(bh2->b_data); 1541 node2 = (struct dx_node *)(bh2->b_data);
1542 entries2 = node2->entries; 1542 entries2 = node2->entries;
1543 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1543 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); 1544 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1544 node2->fake.inode = 0;
1545 BUFFER_TRACE(frame->bh, "get_write_access"); 1545 BUFFER_TRACE(frame->bh, "get_write_access");
1546 err = ext3_journal_get_write_access(handle, frame->bh); 1546 err = ext3_journal_get_write_access(handle, frame->bh);
1547 if (err) 1547 if (err)
@@ -1710,7 +1710,7 @@ retry:
1710 if (IS_DIRSYNC(dir)) 1710 if (IS_DIRSYNC(dir))
1711 handle->h_sync = 1; 1711 handle->h_sync = 1;
1712 1712
1713 inode = ext3_new_inode (handle, dir, mode); 1713 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1714 err = PTR_ERR(inode); 1714 err = PTR_ERR(inode);
1715 if (!IS_ERR(inode)) { 1715 if (!IS_ERR(inode)) {
1716 inode->i_op = &ext3_file_inode_operations; 1716 inode->i_op = &ext3_file_inode_operations;
@@ -1746,7 +1746,7 @@ retry:
1746 if (IS_DIRSYNC(dir)) 1746 if (IS_DIRSYNC(dir))
1747 handle->h_sync = 1; 1747 handle->h_sync = 1;
1748 1748
1749 inode = ext3_new_inode (handle, dir, mode); 1749 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1750 err = PTR_ERR(inode); 1750 err = PTR_ERR(inode);
1751 if (!IS_ERR(inode)) { 1751 if (!IS_ERR(inode)) {
1752 init_special_inode(inode, inode->i_mode, rdev); 1752 init_special_inode(inode, inode->i_mode, rdev);
@@ -1784,7 +1784,7 @@ retry:
1784 if (IS_DIRSYNC(dir)) 1784 if (IS_DIRSYNC(dir))
1785 handle->h_sync = 1; 1785 handle->h_sync = 1;
1786 1786
1787 inode = ext3_new_inode (handle, dir, S_IFDIR | mode); 1787 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1788 err = PTR_ERR(inode); 1788 err = PTR_ERR(inode);
1789 if (IS_ERR(inode)) 1789 if (IS_ERR(inode))
1790 goto out_stop; 1790 goto out_stop;
@@ -2206,7 +2206,7 @@ retry:
2206 if (IS_DIRSYNC(dir)) 2206 if (IS_DIRSYNC(dir))
2207 handle->h_sync = 1; 2207 handle->h_sync = 1;
2208 2208
2209 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2209 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2210 err = PTR_ERR(inode); 2210 err = PTR_ERR(inode);
2211 if (IS_ERR(inode)) 2211 if (IS_ERR(inode))
2212 goto out_stop; 2212 goto out_stop;
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..071689f86e18 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1464 return; 1464 return;
1465 } 1465 }
1466 1466
1467 /* Check if feature set allows readwrite operations */
1468 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1469 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1470 "unknown ROCOMPAT features");
1471 return;
1472 }
1473
1467 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1474 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1468 if (es->s_last_orphan) 1475 if (es->s_last_orphan)
1469 jbd_debug(1, "Errors on filesystem, " 1476 jbd_debug(1, "Errors on filesystem, "
@@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1943 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1944 sb->dq_op = &ext3_quota_operations;
1938#endif 1945#endif
1946 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1947 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1948 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1949 mutex_init(&sbi->s_resize_lock);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe7201169..2be4f69bfa64 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
128 128
129#ifdef CONFIG_EXT3_FS_SECURITY 129#ifdef CONFIG_EXT3_FS_SECURITY
130extern int ext3_init_security(handle_t *handle, struct inode *inode, 130extern int ext3_init_security(handle_t *handle, struct inode *inode,
131 struct inode *dir); 131 struct inode *dir, const struct qstr *qstr);
132#else 132#else
133static inline int ext3_init_security(handle_t *handle, struct inode *inode, 133static inline int ext3_init_security(handle_t *handle, struct inode *inode,
134 struct inode *dir) 134 struct inode *dir, const struct qstr *qstr)
135{ 135{
136 return 0; 136 return 0;
137} 137}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f9..b8d9f83aa5c5 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index eb9097aec6f0..78b79e1bd7ed 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1042,7 +1042,7 @@ got:
1042 if (err) 1042 if (err)
1043 goto fail_free_drop; 1043 goto fail_free_drop;
1044 1044
1045 err = ext4_init_security(handle, inode, dir); 1045 err = ext4_init_security(handle, inode, dir, qstr);
1046 if (err) 1046 if (err)
1047 goto fail_free_drop; 1047 goto fail_free_drop;
1048 1048
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..e781b7ea5630 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2304
2305 dquot_initialize(dir); 2305 dquot_initialize(dir);
2306 2306
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2307retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2308 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2309 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f836b2..203f9e4a70be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3415 sb->s_qcop = &ext4_qctl_operations; 3415 sb->s_qcop = &ext4_qctl_operations;
3416 sb->dq_op = &ext4_quota_operations; 3416 sb->dq_op = &ext4_quota_operations;
3417#endif 3417#endif
3418 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3419
3418 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3420 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3419 mutex_init(&sbi->s_orphan_lock); 3421 mutex_init(&sbi->s_orphan_lock);
3420 mutex_init(&sbi->s_resize_lock); 3422 mutex_init(&sbi->s_resize_lock);
@@ -3509,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3509 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3511 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3510 3512
3511no_journal: 3513no_journal:
3512 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3514 /*
3515 * The maximum number of concurrent works can be high and
3516 * concurrency isn't really necessary. Limit it to 1.
3517 */
3518 EXT4_SB(sb)->dio_unwritten_wq =
3519 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
3513 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3520 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3514 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3521 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3515 goto failed_mount_wq; 3522 goto failed_mount_wq;
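
The no_journal hunk above replaces create_workqueue() with alloc_workqueue() so the DIO-unwritten queue is capped at one concurrent work item while keeping the rescuer needed for memory reclaim. A hedged sketch of that API shape, with a made-up queue name:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        /* WQ_MEM_RECLAIM guarantees forward progress under memory pressure;
         * max_active = 1 allows at most one work item to run at a time. */
        example_wq = alloc_workqueue("example-wq", WQ_MEM_RECLAIM, 1);
        if (!example_wq)
                return -ENOMEM;
        return 0;
}
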
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b950..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..0e277ec4b612 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 757 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 758 u32 ipos_h, ipos_m, ipos_l;
759 759
760 if (len < 5) 760 if (len < 5) {
761 *lenp = 5;
761 return 255; /* no room */ 762 return 255; /* no room */
763 }
762 764
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 765 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 766 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
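
This fat_encode_fh() change (and the matching fuse and gfs2 hunks further down) adopts the convention that an ->encode_fh() which runs out of room reports the required size back through *lenp before returning 255, so callers such as the new name_to_handle_at() can tell userspace how big a handle buffer is needed. A hedged sketch of the pattern, using an invented examplefs that only encodes two dwords:

#include <linux/exportfs.h>
#include <linux/fs.h>

static int examplefs_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
                               int connectable)
{
        const int needed = 2;   /* dwords this made-up filesystem needs */

        if (*lenp < needed) {
                *lenp = needed; /* report the minimum size to the caller */
                return 255;     /* historical "no room" return value */
        }
        fh[0] = de->d_inode->i_ino;             /* low 32 bits of inode number */
        fh[1] = de->d_inode->i_generation;
        *lenp = needed;
        /* connectable parent info omitted in this sketch */
        return 1;               /* filesystem-specific handle type */
}
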
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU) 46 if (nd && nd->flags & LOOKUP_RCU)
47 return -ECHILD; 47 return -ECHILD;
48 48
49 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
54 54
55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
56{ 56{
57 if (nd->flags & LOOKUP_RCU) 57 if (nd && nd->flags & LOOKUP_RCU)
58 return -ECHILD; 58 return -ECHILD;
59 59
60 /* 60 /*
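
Both vfat revalidate hunks (and the fuse and gfs2 ones below) guard against a NULL nameidata, since some lookup paths now call ->d_revalidate() without one. A minimal sketch of the resulting pattern, with an invented examplefs:

#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/namei.h>

static int examplefs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
        /* nd may be NULL; only trust nd->flags after checking it */
        if (nd && (nd->flags & LOOKUP_RCU))
                return -ECHILD; /* no blocking work in RCU-walk mode */

        /* normal, possibly sleeping, revalidation would go here */
        return 1;               /* dentry is still valid */
}
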
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb1026181bdc..6c82e5bac039 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 __FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
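
The fcntl changes above switch to fget_raw() and whitelist, via check_fcntl_cmd(), the commands usable on O_PATH descriptors: F_DUPFD, F_DUPFD_CLOEXEC, F_GETFD, F_SETFD and F_GETFL. A userspace sketch of the resulting behaviour follows; the path is a placeholder and the O_PATH fallback define assumes the common octal value, since headers of the time may not carry it yet.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef O_PATH
#define O_PATH 010000000        /* octal value used on most architectures */
#endif

int main(void)
{
        char buf[16];
        int fd = open("/etc/hostname", O_PATH); /* placeholder path */

        if (fd < 0)
                return 1;
        printf("F_GETFL -> %#x\n", fcntl(fd, F_GETFL)); /* allowed */
        if (read(fd, buf, sizeof(buf)) < 0)             /* not allowed */
                printf("read: %s (expected EBADF)\n", strerror(errno));
        close(fd);
        return 0;
}
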
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..bf93ad2bee07
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,265 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <asm/uaccess.h>
11#include "internal.h"
12
13static long do_sys_name_to_handle(struct path *path,
14 struct file_handle __user *ufh,
15 int __user *mnt_id)
16{
17 long retval;
18 struct file_handle f_handle;
19 int handle_dwords, handle_bytes;
20 struct file_handle *handle = NULL;
21
22 /*
23 * We need to check whether the file system
24 * supports decoding of the file handle

25 */
26 if (!path->mnt->mnt_sb->s_export_op ||
27 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
28 return -EOPNOTSUPP;
29
30 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
31 return -EFAULT;
32
33 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
34 return -EINVAL;
35
36 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
37 GFP_KERNEL);
38 if (!handle)
39 return -ENOMEM;
40
41 /* convert handle size to multiple of sizeof(u32) */
42 handle_dwords = f_handle.handle_bytes >> 2;
43
44 /* we ask for a non connected handle */
45 retval = exportfs_encode_fh(path->dentry,
46 (struct fid *)handle->f_handle,
47 &handle_dwords, 0);
48 handle->handle_type = retval;
49 /* convert handle size to bytes */
50 handle_bytes = handle_dwords * sizeof(u32);
51 handle->handle_bytes = handle_bytes;
52 if ((handle->handle_bytes > f_handle.handle_bytes) ||
53 (retval == 255) || (retval == -ENOSPC)) {
54 /* As per old exportfs_encode_fh documentation
55 * we could return ENOSPC to indicate overflow
56 * But file systems have always returned 255, so handle
57 * both values
58 */
59 /*
60 * set the handle size to zero so we copy only
61 * non variable part of the file_handle
62 */
63 handle_bytes = 0;
64 retval = -EOVERFLOW;
65 } else
66 retval = 0;
67 /* copy the mount id */
68 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
69 copy_to_user(ufh, handle,
70 sizeof(struct file_handle) + handle_bytes))
71 retval = -EFAULT;
72 kfree(handle);
73 return retval;
74}
75
76/**
77 * sys_name_to_handle_at: convert name to handle
78 * @dfd: directory relative to which name is interpreted if not absolute
79 * @name: name that should be converted to handle.
80 * @handle: resulting file handle
81 * @mnt_id: mount id of the file system containing the file
82 * @flag: flag value to indicate whether to follow symlink or not
83 *
84 * @handle->handle_bytes indicates the space available to store the
85 * variable part of the file handle in bytes. If there is not
86 * enough space, the field is updated to return the minimum
87 * value required.
88 */
89SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
90 struct file_handle __user *, handle, int __user *, mnt_id,
91 int, flag)
92{
93 struct path path;
94 int lookup_flags;
95 int err;
96
97 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
98 return -EINVAL;
99
100 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
101 if (flag & AT_EMPTY_PATH)
102 lookup_flags |= LOOKUP_EMPTY;
103 err = user_path_at(dfd, name, lookup_flags, &path);
104 if (!err) {
105 err = do_sys_name_to_handle(&path, handle, mnt_id);
106 path_put(&path);
107 }
108 return err;
109}
110
111static struct vfsmount *get_vfsmount_from_fd(int fd)
112{
113 struct path path;
114
115 if (fd == AT_FDCWD) {
116 struct fs_struct *fs = current->fs;
117 spin_lock(&fs->lock);
118 path = fs->pwd;
119 mntget(path.mnt);
120 spin_unlock(&fs->lock);
121 } else {
122 int fput_needed;
123 struct file *file = fget_light(fd, &fput_needed);
124 if (!file)
125 return ERR_PTR(-EBADF);
126 path = file->f_path;
127 mntget(path.mnt);
128 fput_light(file, fput_needed);
129 }
130 return path.mnt;
131}
132
133static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
134{
135 return 1;
136}
137
138static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
139 struct path *path)
140{
141 int retval = 0;
142 int handle_dwords;
143
144 path->mnt = get_vfsmount_from_fd(mountdirfd);
145 if (IS_ERR(path->mnt)) {
146 retval = PTR_ERR(path->mnt);
147 goto out_err;
148 }
149 /* change the handle size to multiple of sizeof(u32) */
150 handle_dwords = handle->handle_bytes >> 2;
151 path->dentry = exportfs_decode_fh(path->mnt,
152 (struct fid *)handle->f_handle,
153 handle_dwords, handle->handle_type,
154 vfs_dentry_acceptable, NULL);
155 if (IS_ERR(path->dentry)) {
156 retval = PTR_ERR(path->dentry);
157 goto out_mnt;
158 }
159 return 0;
160out_mnt:
161 mntput(path->mnt);
162out_err:
163 return retval;
164}
165
166static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
167 struct path *path)
168{
169 int retval = 0;
170 struct file_handle f_handle;
171 struct file_handle *handle = NULL;
172
173 /*
174 * With a handle we don't look at the execute bit on the
175 * directory. Ideally we would like CAP_DAC_SEARCH,
176 * but we don't have that
177 */
178 if (!capable(CAP_DAC_READ_SEARCH)) {
179 retval = -EPERM;
180 goto out_err;
181 }
182 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
183 retval = -EFAULT;
184 goto out_err;
185 }
186 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
187 (f_handle.handle_bytes == 0)) {
188 retval = -EINVAL;
189 goto out_err;
190 }
191 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
192 GFP_KERNEL);
193 if (!handle) {
194 retval = -ENOMEM;
195 goto out_err;
196 }
197 /* copy the full handle */
198 if (copy_from_user(handle, ufh,
199 sizeof(struct file_handle) +
200 f_handle.handle_bytes)) {
201 retval = -EFAULT;
202 goto out_handle;
203 }
204
205 retval = do_handle_to_path(mountdirfd, handle, path);
206
207out_handle:
208 kfree(handle);
209out_err:
210 return retval;
211}
212
213long do_handle_open(int mountdirfd,
214 struct file_handle __user *ufh, int open_flag)
215{
216 long retval = 0;
217 struct path path;
218 struct file *file;
219 int fd;
220
221 retval = handle_to_path(mountdirfd, ufh, &path);
222 if (retval)
223 return retval;
224
225 fd = get_unused_fd_flags(open_flag);
226 if (fd < 0) {
227 path_put(&path);
228 return fd;
229 }
230 file = file_open_root(path.dentry, path.mnt, "", open_flag);
231 if (IS_ERR(file)) {
232 put_unused_fd(fd);
233 retval = PTR_ERR(file);
234 } else {
235 retval = fd;
236 fsnotify_open(file);
237 fd_install(fd, file);
238 }
239 path_put(&path);
240 return retval;
241}
242
243/**
244 * sys_open_by_handle_at: Open the file handle
245 * @mountdirfd: directory file descriptor
246 * @handle: file handle to be opened
247 * @flag: open flags.
248 *
249 * @mountdirfd indicates the directory file descriptor
250 * of the mount point. The file handle is decoded relative
251 * to the vfsmount pointed to by @mountdirfd. The @flags
252 * value is the same as the open(2) flags.
253 */
254SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
255 struct file_handle __user *, handle,
256 int, flags)
257{
258 long ret;
259
260 if (force_o_largefile())
261 flags |= O_LARGEFILE;
262
263 ret = do_handle_open(mountdirfd, handle, flags);
264 return ret;
265}
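
fs/fhandle.c above implements the two new syscalls. A userspace sketch of how they pair up follows; the direct syscall(2) invocations assume headers new enough to define __NR_name_to_handle_at and __NR_open_by_handle_at (a libc wrapper can be used instead where available), and the struct definition is only a fallback for older fcntl.h.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MAX_HANDLE_SZ
#define MAX_HANDLE_SZ 128
struct file_handle {
        unsigned int handle_bytes;      /* in: buffer size, out: bytes used */
        int handle_type;                /* out: filesystem-specific type */
        unsigned char f_handle[0];      /* out: opaque handle data */
};
#endif

int main(int argc, char **argv)
{
        struct file_handle *fh;
        int mount_id, fd;

        if (argc < 2)
                return 1;
        fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
        fh->handle_bytes = MAX_HANDLE_SZ;

        if (syscall(SYS_name_to_handle_at, AT_FDCWD, argv[1],
                    fh, &mount_id, 0) < 0) {
                perror("name_to_handle_at");
                return 1;
        }
        printf("mount %d, %u handle bytes, type %d\n",
               mount_id, fh->handle_bytes, fh->handle_type);

        /* Reopening needs CAP_DAC_READ_SEARCH; AT_FDCWD works only when the
         * current directory is on the same mount as the file, otherwise pass
         * a descriptor opened on the mount point. */
        fd = syscall(SYS_open_by_handle_at, AT_FDCWD, fh, O_RDONLY);
        if (fd < 0)
                perror("open_by_handle_at");
        else
                close(fd);
        free(fh);
        return 0;
}
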
diff --git a/fs/file_table.c b/fs/file_table.c
index eb36b6b17e26..01e4c1e8e6b6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
190 file_take_write(file); 190 file_take_write(file);
191 WARN_ON(mnt_clone_write(path->mnt)); 191 WARN_ON(mnt_clone_write(path->mnt));
192 } 192 }
193 ima_counts_get(file); 193 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
194 i_readcount_inc(path->dentry->d_inode);
194 return file; 195 return file;
195} 196}
196EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
@@ -246,11 +247,15 @@ static void __fput(struct file *file)
246 file->f_op->release(inode, file); 247 file->f_op->release(inode, file);
247 security_file_free(file); 248 security_file_free(file);
248 ima_file_free(file); 249 ima_file_free(file);
249 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) {
250 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
253 }
251 fops_put(file->f_op); 254 fops_put(file->f_op);
252 put_pid(file->f_owner.pid); 255 put_pid(file->f_owner.pid);
253 file_sb_list_del(file); 256 file_sb_list_del(file);
257 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
258 i_readcount_dec(inode);
254 if (file->f_mode & FMODE_WRITE) 259 if (file->f_mode & FMODE_WRITE)
255 drop_file_write_access(file); 260 drop_file_write_access(file);
256 file->f_path.dentry = NULL; 261 file->f_path.dentry = NULL;
@@ -276,11 +281,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 281 rcu_read_lock();
277 file = fcheck_files(files, fd); 282 file = fcheck_files(files, fd);
278 if (file) { 283 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 284 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 285 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 286 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 287 file = NULL;
283 }
284 } 288 }
285 rcu_read_unlock(); 289 rcu_read_unlock();
286 290
@@ -289,6 +293,25 @@ struct file *fget(unsigned int fd)
289 293
290EXPORT_SYMBOL(fget); 294EXPORT_SYMBOL(fget);
291 295
296struct file *fget_raw(unsigned int fd)
297{
298 struct file *file;
299 struct files_struct *files = current->files;
300
301 rcu_read_lock();
302 file = fcheck_files(files, fd);
303 if (file) {
304 /* File object ref couldn't be taken */
305 if (!atomic_long_inc_not_zero(&file->f_count))
306 file = NULL;
307 }
308 rcu_read_unlock();
309
310 return file;
311}
312
313EXPORT_SYMBOL(fget_raw);
314
292/* 315/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 316 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 317 *
@@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 336 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 337 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 338 file = fcheck_files(files, fd);
339 if (file && (file->f_mode & FMODE_PATH))
340 file = NULL;
341 } else {
342 rcu_read_lock();
343 file = fcheck_files(files, fd);
344 if (file) {
345 if (!(file->f_mode & FMODE_PATH) &&
346 atomic_long_inc_not_zero(&file->f_count))
347 *fput_needed = 1;
348 else
349 /* Didn't get the reference, someone's freed */
350 file = NULL;
351 }
352 rcu_read_unlock();
353 }
354
355 return file;
356}
357
358struct file *fget_raw_light(unsigned int fd, int *fput_needed)
359{
360 struct file *file;
361 struct files_struct *files = current->files;
362
363 *fput_needed = 0;
364 if (atomic_read(&files->count) == 1) {
365 file = fcheck_files(files, fd);
316 } else { 366 } else {
317 rcu_read_lock(); 367 rcu_read_lock();
318 file = fcheck_files(files, fd); 368 file = fcheck_files(files, fd);
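
The file_table.c hunks make fget()/fget_light() refuse FMODE_PATH (O_PATH) files while adding fget_raw()/fget_raw_light() for callers that only need the struct file itself. An illustrative kernel-style sketch, not part of the patch, with a hypothetical caller:

#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>

/* hypothetical syscall body that never touches file->f_op */
static long example_metadata_syscall(unsigned int fd)
{
        struct file *file = fget_raw(fd);       /* O_PATH descriptors allowed */
        long err = -EBADF;

        if (!file)
                return err;
        /* operate on file->f_path only; data paths keep using fget(),
         * which still returns NULL for FMODE_PATH files */
        err = 0;
        fput(file);
        return err;
}
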
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..8bd0ef9286c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU) 161 if (nd && nd->flags & LOOKUP_RCU)
162 return -ECHILD; 162 return -ECHILD;
163 163
164 inode = entry->d_inode; 164 inode = entry->d_inode;
@@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1283 if (err) 1283 if (err)
1284 return err; 1284 return err;
1285 1285
1286 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1286 if (attr->ia_valid & ATTR_OPEN) {
1287 return 0; 1287 if (fc->atomic_o_trunc)
1288 return 0;
1289 file = NULL;
1290 }
1288 1291
1289 if (attr->ia_valid & ATTR_SIZE) 1292 if (attr->ia_valid & ATTR_SIZE)
1290 is_truncate = true; 1293 is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..9e0832dbb1e3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
86 return ff; 86 return ff;
87} 87}
88 88
89static void fuse_release_async(struct work_struct *work)
90{
91 struct fuse_req *req;
92 struct fuse_conn *fc;
93 struct path path;
94
95 req = container_of(work, struct fuse_req, misc.release.work);
96 path = req->misc.release.path;
97 fc = get_fuse_conn(path.dentry->d_inode);
98
99 fuse_put_request(fc, req);
100 path_put(&path);
101}
102
89static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 103static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90{ 104{
91 path_put(&req->misc.release.path); 105 if (fc->destroy_req) {
106 /*
107 * If this is a fuseblk mount, then it's possible that
108 * releasing the path will result in releasing the
109 * super block and sending the DESTROY request. If
110 * the server is single threaded, this would hang.
111 * For this reason do the path_put() in a separate
112 * thread.
113 */
114 atomic_inc(&req->count);
115 INIT_WORK(&req->misc.release.work, fuse_release_async);
116 schedule_work(&req->misc.release.work);
117 } else {
118 path_put(&req->misc.release.path);
119 }
92} 120}
93 121
94static void fuse_file_put(struct fuse_file *ff) 122static void fuse_file_put(struct fuse_file *ff, bool sync)
95{ 123{
96 if (atomic_dec_and_test(&ff->count)) { 124 if (atomic_dec_and_test(&ff->count)) {
97 struct fuse_req *req = ff->reserved_req; 125 struct fuse_req *req = ff->reserved_req;
98 126
99 req->end = fuse_release_end; 127 if (sync) {
100 fuse_request_send_background(ff->fc, req); 128 fuse_request_send(ff->fc, req);
129 path_put(&req->misc.release.path);
130 fuse_put_request(ff->fc, req);
131 } else {
132 req->end = fuse_release_end;
133 fuse_request_send_background(ff->fc, req);
134 }
101 kfree(ff); 135 kfree(ff);
102 } 136 }
103} 137}
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
219 * Normally this will send the RELEASE request, however if 253 * Normally this will send the RELEASE request, however if
220 * some asynchronous READ or WRITE requests are outstanding, 254 * some asynchronous READ or WRITE requests are outstanding,
221 * the sending will be delayed. 255 * the sending will be delayed.
256 *
257 * Make the release synchronous if this is a fuseblk mount;
258 * synchronous RELEASE is allowed (and desirable) in this case
259 * because the server can be trusted not to screw up.
222 */ 260 */
223 fuse_file_put(ff); 261 fuse_file_put(ff, ff->fc->destroy_req != NULL);
224} 262}
225 263
226static int fuse_open(struct inode *inode, struct file *file) 264static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
558 page_cache_release(page); 596 page_cache_release(page);
559 } 597 }
560 if (req->ff) 598 if (req->ff)
561 fuse_file_put(req->ff); 599 fuse_file_put(req->ff, false);
562} 600}
563 601
564static void fuse_send_readpages(struct fuse_req *req, struct file *file) 602static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1137static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1175static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1138{ 1176{
1139 __free_page(req->pages[0]); 1177 __free_page(req->pages[0]);
1140 fuse_file_put(req->ff); 1178 fuse_file_put(req->ff, false);
1141} 1179}
1142 1180
1143static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1181static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..d4286947bc2c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h>
24 25
25/** Max number of pages that can be used in a single read request */ 26/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 27#define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,7 +263,10 @@ struct fuse_req {
262 /** Data for asynchronous requests */ 263 /** Data for asynchronous requests */
263 union { 264 union {
264 struct { 265 struct {
265 struct fuse_release_in in; 266 union {
267 struct fuse_release_in in;
268 struct work_struct work;
269 };
266 struct path path; 270 struct path path;
267 } release; 271 } release;
268 struct fuse_init_in init_in; 272 struct fuse_init_in init_in;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..051b1a084528 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a9..cbc07155b1a0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU) 83 if (flags & IPERM_FLAG_RCU) {
84 return -ECHILD; 84 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
85 return -ECHILD;
86 return -EAGAIN;
87 }
85 88
86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 89 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
87 if (IS_ERR(acl)) 90 if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..aad77e4f61b5 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
695 if (error == 0) 695 if (error == 0)
696 return 0; 696 return 0;
697 697
698 unlock_page(page);
698 page_cache_release(page); 699 page_cache_release(page);
699 700
700 gfs2_trans_end(sdp); 701 gfs2_trans_end(sdp);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef1..ef3dc4b9fae2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
21#include "meta_io.h" 21#include "meta_io.h"
22#include "quota.h" 22#include "quota.h"
23#include "rgrp.h" 23#include "rgrp.h"
24#include "super.h"
24#include "trans.h" 25#include "trans.h"
25#include "dir.h" 26#include "dir.h"
26#include "util.h" 27#include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
757 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
758 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
759 u64 bn, bstart; 760 u64 bn, bstart;
760 u32 blen; 761 u32 blen, btotal;
761 __be64 *p; 762 __be64 *p;
762 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
763 int metadata; 764 int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
839 840
840 bstart = 0; 841 bstart = 0;
841 blen = 0; 842 blen = 0;
843 btotal = 0;
842 844
843 for (p = top; p < bottom; p++) { 845 for (p = top; p < bottom; p++) {
844 if (!*p) 846 if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
851 else { 853 else {
852 if (bstart) { 854 if (bstart) {
853 if (metadata) 855 if (metadata)
854 gfs2_free_meta(ip, bstart, blen); 856 __gfs2_free_meta(ip, bstart, blen);
855 else 857 else
856 gfs2_free_data(ip, bstart, blen); 858 __gfs2_free_data(ip, bstart, blen);
859
860 btotal += blen;
857 } 861 }
858 862
859 bstart = bn; 863 bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
865 } 869 }
866 if (bstart) { 870 if (bstart) {
867 if (metadata) 871 if (metadata)
868 gfs2_free_meta(ip, bstart, blen); 872 __gfs2_free_meta(ip, bstart, blen);
869 else 873 else
870 gfs2_free_data(ip, bstart, blen); 874 __gfs2_free_data(ip, bstart, blen);
875
876 btotal += blen;
871 } 877 }
872 878
879 gfs2_statfs_change(sdp, 0, +btotal, 0);
880 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
881 ip->i_inode.i_gid);
882
873 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 883 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
874 884
875 gfs2_dinode_out(ip, dibh->b_data); 885 gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
44 int error; 44 int error;
45 int had_lock = 0; 45 int had_lock = 0;
46 46
47 if (nd->flags & LOOKUP_RCU) 47 if (nd && nd->flags & LOOKUP_RCU)
48 return -ECHILD; 48 return -ECHILD;
49 49
50 parent = dget_parent(dentry); 50 parent = dget_parent(dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb913363..4074b952b059 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
448{ 448{
449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
450 450
451 if (!(file->f_flags & O_NOATIME)) { 451 if (!(file->f_flags & O_NOATIME) &&
452 !IS_NOATIME(&ip->i_inode)) {
452 struct gfs2_holder i_gh; 453 struct gfs2_holder i_gh;
453 int error; 454 int error;
454 455
455 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 456 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
456 error = gfs2_glock_nq(&i_gh); 457 error = gfs2_glock_nq(&i_gh);
457 file_accessed(file); 458 if (error == 0) {
458 if (error == 0) 459 file_accessed(file);
459 gfs2_glock_dq_uninit(&i_gh); 460 gfs2_glock_dq(&i_gh);
461 }
462 gfs2_holder_uninit(&i_gh);
463 if (error)
464 return error;
460 } 465 }
461 vma->vm_ops = &gfs2_vm_ops; 466 vma->vm_ops = &gfs2_vm_ops;
462 vma->vm_flags |= VM_CAN_NONLINEAR; 467 vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
617{ 622{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619 624
620 page_zero_new_buffers(page, from, to); 625 zero_user(page, from, to-from);
621 flush_dcache_page(page);
622 mark_page_accessed(page); 626 mark_page_accessed(page);
623 627
624 if (!gfs2_is_writeback(ip)) 628 if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
627 block_commit_write(page, from, to); 631 block_commit_write(page, from, to);
628} 632}
629 633
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 634static int needs_empty_write(sector_t block, struct inode *inode)
631{ 635{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error; 636 int error;
637 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
635 638
636 if (!page_has_buffers(page)) { 639 bh_map.b_size = 1 << inode->i_blkbits;
637 error = __block_write_begin(page, from, to - from, gfs2_block_map); 640 error = gfs2_block_map(inode, block, &bh_map, 0);
638 if (unlikely(error)) 641 if (unlikely(error))
639 return error; 642 return error;
643 return !buffer_mapped(&bh_map);
644}
640 645
641 empty_write_end(page, from, to); 646static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
642 return 0; 647{
643 } 648 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize;
650 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
651 int ret;
644 652
645 bh = head = page_buffers(page); 653 blksize = 1 << inode->i_blkbits;
646 next = end = 0; 654 next = end = 0;
647 while (next < from) { 655 while (next < from) {
648 next += bh->b_size; 656 next += blksize;
649 bh = bh->b_this_page; 657 block++;
650 } 658 }
651 start = next; 659 start = next;
652 do { 660 do {
653 next += bh->b_size; 661 next += blksize;
654 if (buffer_mapped(bh)) { 662 ret = needs_empty_write(block, inode);
663 if (unlikely(ret < 0))
664 return ret;
665 if (ret == 0) {
655 if (end) { 666 if (end) {
656 error = __block_write_begin(page, start, end - start, 667 ret = __block_write_begin(page, start, end - start,
657 gfs2_block_map); 668 gfs2_block_map);
658 if (unlikely(error)) 669 if (unlikely(ret))
659 return error; 670 return ret;
660 empty_write_end(page, start, end); 671 empty_write_end(page, start, end);
661 end = 0; 672 end = 0;
662 } 673 }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
664 } 675 }
665 else 676 else
666 end = next; 677 end = next;
667 bh = bh->b_this_page; 678 block++;
668 } while (next < to); 679 } while (next < to);
669 680
670 if (end) { 681 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map); 682 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error)) 683 if (unlikely(ret))
673 return error; 684 return ret;
674 empty_write_end(page, start, end); 685 empty_write_end(page, start, end);
675 } 686 }
676 687
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
976 987
977 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
978 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
979 if (fl_gh->gh_gl) 990 if (fl_gh->gh_gl) {
980 gfs2_glock_dq_uninit(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh);
993 }
981 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
982} 995}
983 996
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7cd9a5a68d59..e2431313491f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/rcupdate.h>
30#include <linux/rculist_bl.h>
31#include <linux/bit_spinlock.h>
29 32
30#include "gfs2.h" 33#include "gfs2.h"
31#include "incore.h" 34#include "incore.h"
@@ -41,10 +44,6 @@
41#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
42#include "trace_gfs2.h" 45#include "trace_gfs2.h"
43 46
44struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list;
46};
47
48struct gfs2_glock_iter { 47struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 48 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 49 struct gfs2_sbd *sdp; /* incore superblock */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
54 53
55typedef void (*glock_examiner) (struct gfs2_glock * gl); 54typedef void (*glock_examiner) (struct gfs2_glock * gl);
56 55
57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 56static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 57#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
70#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 68#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
71#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 69#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
72 70
73static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 71static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
74static struct dentry *gfs2_root; 72static struct dentry *gfs2_root;
75 73
76/*
77 * Despite what you might think, the numbers below are not arbitrary :-)
78 * They are taken from the ipv4 routing hash code, which is well tested
79 * and thus should be nearly optimal. Later on we might tweek the numbers
80 * but for now this should be fine.
81 *
82 * The reason for putting the locks in a separate array from the list heads
83 * is that we can have fewer locks than list heads and save memory. We use
84 * the same hash function for both, but with a different hash mask.
85 */
86#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
87 defined(CONFIG_PROVE_LOCKING)
88
89#ifdef CONFIG_LOCKDEP
90# define GL_HASH_LOCK_SZ 256
91#else
92# if NR_CPUS >= 32
93# define GL_HASH_LOCK_SZ 4096
94# elif NR_CPUS >= 16
95# define GL_HASH_LOCK_SZ 2048
96# elif NR_CPUS >= 8
97# define GL_HASH_LOCK_SZ 1024
98# elif NR_CPUS >= 4
99# define GL_HASH_LOCK_SZ 512
100# else
101# define GL_HASH_LOCK_SZ 256
102# endif
103#endif
104
105/* We never want more locks than chains */
106#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
107# undef GL_HASH_LOCK_SZ
108# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
109#endif
110
111static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
112
113static inline rwlock_t *gl_lock_addr(unsigned int x)
114{
115 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
116}
117#else /* not SMP, so no spinlocks required */
118static inline rwlock_t *gl_lock_addr(unsigned int x)
119{
120 return NULL;
121}
122#endif
123
124/** 74/**
125 * gl_hash() - Turn glock number into hash bucket number 75 * gl_hash() - Turn glock number into hash bucket number
126 * @lock: The glock number 76 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
141 return h; 91 return h;
142} 92}
143 93
144/** 94static inline void spin_lock_bucket(unsigned int hash)
145 * glock_free() - Perform a few checks and then release struct gfs2_glock 95{
146 * @gl: The glock to release 96 struct hlist_bl_head *bl = &gl_hash_table[hash];
147 * 97 bit_spin_lock(0, (unsigned long *)bl);
148 * Also calls lock module to release its internal structure for this glock. 98}
149 *
150 */
151 99
152static void glock_free(struct gfs2_glock *gl) 100static inline void spin_unlock_bucket(unsigned int hash)
101{
102 struct hlist_bl_head *bl = &gl_hash_table[hash];
103 __bit_spin_unlock(0, (unsigned long *)bl);
104}
105
106static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109
110 if (gl->gl_ops->go_flags & GLOF_ASPACE)
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116void gfs2_glock_free(struct gfs2_glock *gl)
153{ 117{
154 struct gfs2_sbd *sdp = gl->gl_sbd; 118 struct gfs2_sbd *sdp = gl->gl_sbd;
155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
157 119
158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 120 call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
159 trace_gfs2_glock_put(gl); 121 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
160 if (mapping) 122 wake_up(&sdp->sd_glock_wait);
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 123}
164 124
165/** 125/**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
185{ 145{
186 const struct gfs2_glock_operations *glops = gl->gl_ops; 146 const struct gfs2_glock_operations *glops = gl->gl_ops;
187 147
148 /* assert_spin_locked(&gl->gl_spin); */
149
188 if (gl->gl_state == LM_ST_UNLOCKED) 150 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0; 151 return 0;
190 if (!list_empty(&gl->gl_holders)) 152 if (test_bit(GLF_LFLUSH, &gl->gl_flags))
153 return 0;
154 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
155 !list_empty(&gl->gl_holders))
191 return 0; 156 return 0;
192 if (glops->go_demote_ok) 157 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl); 158 return glops->go_demote_ok(gl);
194 return 1; 159 return 1;
195} 160}
196 161
162
197/** 163/**
198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 164 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
199 * @gl: the glock 165 * @gl: the glock
200 * 166 *
167 * If the glock is demotable, then we add it (or move it) to the end
168 * of the glock LRU list.
201 */ 169 */
202 170
203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 171static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
204{ 172{
205 int may_reclaim; 173 if (demote_ok(gl)) {
206 may_reclaim = (demote_ok(gl) && 174 spin_lock(&lru_lock);
207 (atomic_read(&gl->gl_ref) == 1 || 175
208 (gl->gl_name.ln_type == LM_TYPE_INODE && 176 if (!list_empty(&gl->gl_lru))
209 atomic_read(&gl->gl_ref) <= 2))); 177 list_del_init(&gl->gl_lru);
210 spin_lock(&lru_lock); 178 else
211 if (list_empty(&gl->gl_lru) && may_reclaim) { 179 atomic_inc(&lru_count);
180
212 list_add_tail(&gl->gl_lru, &lru_list); 181 list_add_tail(&gl->gl_lru, &lru_list);
213 atomic_inc(&lru_count); 182 spin_unlock(&lru_lock);
214 } 183 }
215 spin_unlock(&lru_lock); 184}
185
186void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
187{
188 spin_lock(&gl->gl_spin);
189 __gfs2_glock_schedule_for_reclaim(gl);
190 spin_unlock(&gl->gl_spin);
216} 191}
217 192
218/** 193/**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{ 202{
228 if (atomic_dec_and_test(&gl->gl_ref)) 203 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1); 204 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231} 205}
232 206
233/** 207/**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
236 * 210 *
237 */ 211 */
238 212
239int gfs2_glock_put(struct gfs2_glock *gl) 213void gfs2_glock_put(struct gfs2_glock *gl)
240{ 214{
241 int rv = 0; 215 struct gfs2_sbd *sdp = gl->gl_sbd;
216 struct address_space *mapping = gfs2_glock2aspace(gl);
242 217
243 write_lock(gl_lock_addr(gl->gl_hash)); 218 if (atomic_dec_and_test(&gl->gl_ref)) {
244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { 219 spin_lock_bucket(gl->gl_hash);
245 hlist_del(&gl->gl_list); 220 hlist_bl_del_rcu(&gl->gl_list);
221 spin_unlock_bucket(gl->gl_hash);
222 spin_lock(&lru_lock);
246 if (!list_empty(&gl->gl_lru)) { 223 if (!list_empty(&gl->gl_lru)) {
247 list_del_init(&gl->gl_lru); 224 list_del_init(&gl->gl_lru);
248 atomic_dec(&lru_count); 225 atomic_dec(&lru_count);
249 } 226 }
250 spin_unlock(&lru_lock); 227 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 228 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
253 glock_free(gl); 229 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
254 rv = 1; 230 trace_gfs2_glock_put(gl);
255 goto out; 231 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
256 } 232 }
257 spin_lock(&gl->gl_spin);
258 gfs2_glock_schedule_for_reclaim(gl);
259 spin_unlock(&gl->gl_spin);
260 write_unlock(gl_lock_addr(gl->gl_hash));
261out:
262 return rv;
263} 233}
264 234
265/** 235/**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
275 const struct lm_lockname *name) 245 const struct lm_lockname *name)
276{ 246{
277 struct gfs2_glock *gl; 247 struct gfs2_glock *gl;
278 struct hlist_node *h; 248 struct hlist_bl_node *h;
279 249
280 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { 250 hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
281 if (!lm_name_equal(&gl->gl_name, name)) 251 if (!lm_name_equal(&gl->gl_name, name))
282 continue; 252 continue;
283 if (gl->gl_sbd != sdp) 253 if (gl->gl_sbd != sdp)
284 continue; 254 continue;
285 255 if (atomic_inc_not_zero(&gl->gl_ref))
286 atomic_inc(&gl->gl_ref); 256 return gl;
287
288 return gl;
289 } 257 }
290 258
291 return NULL; 259 return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
743 struct gfs2_glock *gl, *tmp; 711 struct gfs2_glock *gl, *tmp;
744 unsigned int hash = gl_hash(sdp, &name); 712 unsigned int hash = gl_hash(sdp, &name);
745 struct address_space *mapping; 713 struct address_space *mapping;
714 struct kmem_cache *cachep;
746 715
747 read_lock(gl_lock_addr(hash)); 716 rcu_read_lock();
748 gl = search_bucket(hash, sdp, &name); 717 gl = search_bucket(hash, sdp, &name);
749 read_unlock(gl_lock_addr(hash)); 718 rcu_read_unlock();
750 719
751 *glp = gl; 720 *glp = gl;
752 if (gl) 721 if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
755 return -ENOENT; 724 return -ENOENT;
756 725
757 if (glops->go_flags & GLOF_ASPACE) 726 if (glops->go_flags & GLOF_ASPACE)
758 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); 727 cachep = gfs2_glock_aspace_cachep;
759 else 728 else
760 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 729 cachep = gfs2_glock_cachep;
730 gl = kmem_cache_alloc(cachep, GFP_KERNEL);
761 if (!gl) 731 if (!gl)
762 return -ENOMEM; 732 return -ENOMEM;
763 733
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
790 mapping->writeback_index = 0; 760 mapping->writeback_index = 0;
791 } 761 }
792 762
793 write_lock(gl_lock_addr(hash)); 763 spin_lock_bucket(hash);
794 tmp = search_bucket(hash, sdp, &name); 764 tmp = search_bucket(hash, sdp, &name);
795 if (tmp) { 765 if (tmp) {
796 write_unlock(gl_lock_addr(hash)); 766 spin_unlock_bucket(hash);
797 glock_free(gl); 767 kmem_cache_free(cachep, gl);
768 atomic_dec(&sdp->sd_glock_disposal);
798 gl = tmp; 769 gl = tmp;
799 } else { 770 } else {
800 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); 771 hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
801 write_unlock(gl_lock_addr(hash)); 772 spin_unlock_bucket(hash);
802 } 773 }
803 774
804 *glp = gl; 775 *glp = gl;
@@ -1007,13 +978,13 @@ fail:
1007 insert_pt = &gh2->gh_list; 978 insert_pt = &gh2->gh_list;
1008 } 979 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags); 980 set_bit(GLF_QUEUED, &gl->gl_flags);
981 trace_gfs2_glock_queue(gh, 1);
1010 if (likely(insert_pt == NULL)) { 982 if (likely(insert_pt == NULL)) {
1011 list_add_tail(&gh->gh_list, &gl->gl_holders); 983 list_add_tail(&gh->gh_list, &gl->gl_holders);
1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 984 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
1013 goto do_cancel; 985 goto do_cancel;
1014 return; 986 return;
1015 } 987 }
1016 trace_gfs2_glock_queue(gh, 1);
1017 list_add_tail(&gh->gh_list, insert_pt); 988 list_add_tail(&gh->gh_list, insert_pt);
1018do_cancel: 989do_cancel:
1019 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 990 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1113 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1084 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1114 fast_path = 1; 1085 fast_path = 1;
1115 } 1086 }
1087 __gfs2_glock_schedule_for_reclaim(gl);
1116 trace_gfs2_glock_queue(gh, 0); 1088 trace_gfs2_glock_queue(gh, 0);
1117 spin_unlock(&gl->gl_spin); 1089 spin_unlock(&gl->gl_spin);
1118 if (likely(fast_path)) 1090 if (likely(fast_path))
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1276 1248
1277void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1249void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1278{ 1250{
1279 unsigned int x; 1251 while (num_gh--)
1280 1252 gfs2_glock_dq(&ghs[num_gh]);
1281 for (x = 0; x < num_gh; x++)
1282 gfs2_glock_dq(&ghs[x]);
1283} 1253}
1284 1254
1285/** 1255/**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1291 1261
1292void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) 1262void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1293{ 1263{
1294 unsigned int x; 1264 while (num_gh--)
1295 1265 gfs2_glock_dq_uninit(&ghs[num_gh]);
1296 for (x = 0; x < num_gh; x++)
1297 gfs2_glock_dq_uninit(&ghs[x]);
1298} 1266}
1299 1267
1300void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) 1268void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
1440 * @sdp: the filesystem 1408 * @sdp: the filesystem
1441 * @bucket: the bucket 1409 * @bucket: the bucket
1442 * 1410 *
1443 * Returns: 1 if the bucket has entries
1444 */ 1411 */
1445 1412
1446static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, 1413static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
1447 unsigned int hash) 1414 unsigned int hash)
1448{ 1415{
1449 struct gfs2_glock *gl, *prev = NULL; 1416 struct gfs2_glock *gl;
1450 int has_entries = 0; 1417 struct hlist_bl_head *head = &gl_hash_table[hash];
1451 struct hlist_head *head = &gl_hash_table[hash].hb_list; 1418 struct hlist_bl_node *pos;
1452 1419
1453 read_lock(gl_lock_addr(hash)); 1420 rcu_read_lock();
1454 /* Can't use hlist_for_each_entry - don't want prefetch here */ 1421 hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
1455 if (hlist_empty(head)) 1422 if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
1456 goto out;
1457 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1458 while(1) {
1459 if (!sdp || gl->gl_sbd == sdp) {
1460 gfs2_glock_hold(gl);
1461 read_unlock(gl_lock_addr(hash));
1462 if (prev)
1463 gfs2_glock_put(prev);
1464 prev = gl;
1465 examiner(gl); 1423 examiner(gl);
1466 has_entries = 1;
1467 read_lock(gl_lock_addr(hash));
1468 }
1469 if (gl->gl_list.next == NULL)
1470 break;
1471 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1472 } 1424 }
1473out: 1425 rcu_read_unlock();
1474 read_unlock(gl_lock_addr(hash));
1475 if (prev)
1476 gfs2_glock_put(prev);
1477 cond_resched(); 1426 cond_resched();
1478 return has_entries; 1427}
1428
1429static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(examiner, sdp, x);
1479} 1435}
1480 1436
1481 1437
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
1529 1485
1530void gfs2_glock_thaw(struct gfs2_sbd *sdp) 1486void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1531{ 1487{
1532 unsigned x; 1488 glock_hash_walk(thaw_glock, sdp);
1489}
1533 1490
1534 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) 1491static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1535 examine_bucket(thaw_glock, sdp, x); 1492{
1493 int ret;
1494 spin_lock(&gl->gl_spin);
1495 ret = __dump_glock(seq, gl);
1496 spin_unlock(&gl->gl_spin);
1497 return ret;
1498}
1499
1500static void dump_glock_func(struct gfs2_glock *gl)
1501{
1502 dump_glock(NULL, gl);
1536} 1503}
1537 1504
1538/** 1505/**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1545 1512
1546void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1513void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1547{ 1514{
1548 unsigned int x; 1515 glock_hash_walk(clear_glock, sdp);
1549
1550 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1551 examine_bucket(clear_glock, sdp, x);
1552 flush_workqueue(glock_workqueue); 1516 flush_workqueue(glock_workqueue);
1553 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1517 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1554 gfs2_dump_lockstate(sdp); 1518 glock_hash_walk(dump_glock_func, sdp);
1555} 1519}
1556 1520
1557void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1521void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,66 +1681,15 @@ out:
1717 return error; 1681 return error;
1718} 1682}
1719 1683
1720static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1721{
1722 int ret;
1723 spin_lock(&gl->gl_spin);
1724 ret = __dump_glock(seq, gl);
1725 spin_unlock(&gl->gl_spin);
1726 return ret;
1727}
1728 1684
1729/**
1730 * gfs2_dump_lockstate - print out the current lockstate
1731 * @sdp: the filesystem
1732 * @ub: the buffer to copy the information into
1733 *
1734 * If @ub is NULL, dump the lockstate to the console.
1735 *
1736 */
1737
1738static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1739{
1740 struct gfs2_glock *gl;
1741 struct hlist_node *h;
1742 unsigned int x;
1743 int error = 0;
1744
1745 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1746
1747 read_lock(gl_lock_addr(x));
1748
1749 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
1750 if (gl->gl_sbd != sdp)
1751 continue;
1752
1753 error = dump_glock(NULL, gl);
1754 if (error)
1755 break;
1756 }
1757
1758 read_unlock(gl_lock_addr(x));
1759
1760 if (error)
1761 break;
1762 }
1763
1764
1765 return error;
1766}
1767 1685
1768 1686
1769int __init gfs2_glock_init(void) 1687int __init gfs2_glock_init(void)
1770{ 1688{
1771 unsigned i; 1689 unsigned i;
1772 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { 1690 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
1773 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); 1691 INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
1774 }
1775#ifdef GL_HASH_LOCK_SZ
1776 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
1777 rwlock_init(&gl_hash_locks[i]);
1778 } 1692 }
1779#endif
1780 1693
1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | 1694 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZABLE, 0); 1695 WQ_HIGHPRI | WQ_FREEZABLE, 0);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
1802 destroy_workqueue(gfs2_delete_workqueue); 1715 destroy_workqueue(gfs2_delete_workqueue);
1803} 1716}
1804 1717
1718static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
1719{
1720 return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
1721 struct gfs2_glock, gl_list);
1722}
1723
1724static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
1725{
1726 return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
1727 struct gfs2_glock, gl_list);
1728}
1729
1805static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1730static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1806{ 1731{
1807 struct gfs2_glock *gl; 1732 struct gfs2_glock *gl;
1808 1733
1809restart: 1734 do {
1810 read_lock(gl_lock_addr(gi->hash)); 1735 gl = gi->gl;
1811 gl = gi->gl; 1736 if (gl) {
1812 if (gl) { 1737 gi->gl = glock_hash_next(gl);
1813 gi->gl = hlist_entry(gl->gl_list.next, 1738 } else {
1814 struct gfs2_glock, gl_list); 1739 gi->gl = glock_hash_chain(gi->hash);
1815 } else { 1740 }
1816 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, 1741 while (gi->gl == NULL) {
1817 struct gfs2_glock, gl_list); 1742 gi->hash++;
1818 } 1743 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1819 if (gi->gl) 1744 rcu_read_unlock();
1820 gfs2_glock_hold(gi->gl); 1745 return 1;
1821 read_unlock(gl_lock_addr(gi->hash)); 1746 }
1822 if (gl) 1747 gi->gl = glock_hash_chain(gi->hash);
1823 gfs2_glock_put(gl); 1748 }
1824 while (gi->gl == NULL) { 1749 /* Skip entries for other sb and dead entries */
1825 gi->hash++; 1750 } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
1826 if (gi->hash >= GFS2_GL_HASH_SIZE)
1827 return 1;
1828 read_lock(gl_lock_addr(gi->hash));
1829 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1830 struct gfs2_glock, gl_list);
1831 if (gi->gl)
1832 gfs2_glock_hold(gi->gl);
1833 read_unlock(gl_lock_addr(gi->hash));
1834 }
1835
1836 if (gi->sdp != gi->gl->gl_sbd)
1837 goto restart;
1838 1751
1839 return 0; 1752 return 0;
1840} 1753}
1841 1754
1842static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
1843{
1844 if (gi->gl)
1845 gfs2_glock_put(gi->gl);
1846 gi->gl = NULL;
1847}
1848
1849static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) 1755static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
1850{ 1756{
1851 struct gfs2_glock_iter *gi = seq->private; 1757 struct gfs2_glock_iter *gi = seq->private;
1852 loff_t n = *pos; 1758 loff_t n = *pos;
1853 1759
1854 gi->hash = 0; 1760 gi->hash = 0;
1761 rcu_read_lock();
1855 1762
1856 do { 1763 do {
1857 if (gfs2_glock_iter_next(gi)) { 1764 if (gfs2_glock_iter_next(gi))
1858 gfs2_glock_iter_free(gi);
1859 return NULL; 1765 return NULL;
1860 }
1861 } while (n--); 1766 } while (n--);
1862 1767
1863 return gi->gl; 1768 return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1870 1775
1871 (*pos)++; 1776 (*pos)++;
1872 1777
1873 if (gfs2_glock_iter_next(gi)) { 1778 if (gfs2_glock_iter_next(gi))
1874 gfs2_glock_iter_free(gi);
1875 return NULL; 1779 return NULL;
1876 }
1877 1780
1878 return gi->gl; 1781 return gi->gl;
1879} 1782}
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1881static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) 1784static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1882{ 1785{
1883 struct gfs2_glock_iter *gi = seq->private; 1786 struct gfs2_glock_iter *gi = seq->private;
1884 gfs2_glock_iter_free(gi); 1787
1788 if (gi->gl)
1789 rcu_read_unlock();
1790 gi->gl = NULL;
1885} 1791}
1886 1792
1887static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1793static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
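The glock.c hunks above replace the per-bucket rwlocks with a bit-locked, RCU-protected hash table: lookups walk the chain under rcu_read_lock() and take a reference only via atomic_inc_not_zero(), so a glock whose count has already reached zero is skipped rather than resurrected, while writers serialize on the bucket bit lock (spin_lock_bucket/spin_unlock_bucket). A minimal sketch of that lookup pattern follows, using the same hlist_bl/RCU primitives; the struct obj type and obj_find() are illustrative names, not part of the patch.

/*
 * Lockless lookup sketch: the chain is an RCU-protected hlist_bl and
 * the reference count doubles as the liveness test.
 */
#include <linux/rcupdate.h>
#include <linux/rculist_bl.h>
#include <asm/atomic.h>

struct obj {
	struct hlist_bl_node o_list;
	atomic_t o_ref;
	unsigned long o_key;
};

static struct obj *obj_find(struct hlist_bl_head *head, unsigned long key)
{
	struct obj *o;
	struct hlist_bl_node *pos;

	rcu_read_lock();
	hlist_bl_for_each_entry_rcu(o, pos, head, o_list) {
		if (o->o_key != key)
			continue;
		/* Entries whose refcount already hit zero are being freed */
		if (atomic_inc_not_zero(&o->o_ref)) {
			rcu_read_unlock();
			return o;
		}
	}
	rcu_read_unlock();
	return NULL;
}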
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb615..aea160690e94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
119 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
120 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct gfs2_glock *gl);
122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
123 unsigned int flags); 123 unsigned int flags);
124 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
174 int create, struct gfs2_glock **glp); 174 int create, struct gfs2_glock **glp);
175void gfs2_glock_hold(struct gfs2_glock *gl); 175void gfs2_glock_hold(struct gfs2_glock *gl);
176void gfs2_glock_put_nolock(struct gfs2_glock *gl); 176void gfs2_glock_put_nolock(struct gfs2_glock *gl);
177int gfs2_glock_put(struct gfs2_glock *gl); 177void gfs2_glock_put(struct gfs2_glock *gl);
178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
179 struct gfs2_holder *gh); 179 struct gfs2_holder *gh);
180void gfs2_holder_reinit(unsigned int state, unsigned flags, 180void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
223 return error; 223 return error;
224} 224}
225 225
226/* Lock Value Block functions */ 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228int gfs2_lvb_hold(struct gfs2_glock *gl); 228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229void gfs2_lvb_unhold(struct gfs2_glock *gl); 229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230 230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
233void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 233extern void gfs2_glock_free(struct gfs2_glock *gl);
234void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 234
235void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 235extern int __init gfs2_glock_init(void);
236void gfs2_glock_thaw(struct gfs2_sbd *sdp); 236extern void gfs2_glock_exit(void);
237 237
238int __init gfs2_glock_init(void); 238extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
239void gfs2_glock_exit(void); 239extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
240 240extern int gfs2_register_debugfs(void);
241int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 241extern void gfs2_unregister_debugfs(void);
242void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
243int gfs2_register_debugfs(void);
244void gfs2_unregister_debugfs(void);
245 242
246extern const struct lm_lockops gfs2_dlm_ops; 243extern const struct lm_lockops gfs2_dlm_ops;
247 244
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a50..3754e3cbf02b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 BUG_ON(current->journal_info); 56 BUG_ON(current->journal_info);
57 current->journal_info = &tr; 57 current->journal_info = &tr;
58 58
59 gfs2_log_lock(sdp); 59 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 60 while (!list_empty(head)) {
61 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
62 bd_ail_gl_list); 62 bd_ail_gl_list);
63 bh = bd->bd_bh; 63 bh = bd->bd_bh;
64 gfs2_remove_from_ail(bd); 64 gfs2_remove_from_ail(bd);
65 spin_unlock(&sdp->sd_ail_lock);
66
65 bd->bd_bh = NULL; 67 bd->bd_bh = NULL;
66 bh->b_private = NULL; 68 bh->b_private = NULL;
67 bd->bd_blkno = bh->b_blocknr; 69 bd->bd_blkno = bh->b_blocknr;
70 gfs2_log_lock(sdp);
68 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 71 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
69 gfs2_trans_add_revoke(sdp, bd); 72 gfs2_trans_add_revoke(sdp, bd);
73 gfs2_log_unlock(sdp);
74
75 spin_lock(&sdp->sd_ail_lock);
70 } 76 }
71 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
72 gfs2_log_unlock(sdp); 78 spin_unlock(&sdp->sd_ail_lock);
73 79
74 gfs2_trans_end(sdp); 80 gfs2_trans_end(sdp);
75 gfs2_log_flush(sdp, NULL); 81 gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
206static int inode_go_demote_ok(const struct gfs2_glock *gl) 212static int inode_go_demote_ok(const struct gfs2_glock *gl)
207{ 213{
208 struct gfs2_sbd *sdp = gl->gl_sbd; 214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 struct gfs2_holder *gh;
216
209 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) 217 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
210 return 0; 218 return 0;
219
220 if (!list_empty(&gl->gl_holders)) {
221 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
222 if (gh->gh_list.next != &gl->gl_holders)
223 return 0;
224 }
225
211 return 1; 226 return 1;
212} 227}
213 228
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
272} 287}
273 288
274/** 289/**
275 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
276 * @gl: the glock
277 *
278 * Returns: 1 if it's ok
279 */
280
281static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
282{
283 const struct address_space *mapping = (const struct address_space *)(gl + 1);
284 return !mapping->nrpages;
285}
286
287/**
288 * rgrp_go_lock - operation done after an rgrp lock is locked by 290 * rgrp_go_lock - operation done after an rgrp lock is locked by
289 * a first holder on this node. 291 * a first holder on this node.
290 * @gl: the glock 292 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
410const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
411 .go_xmote_th = rgrp_go_sync, 413 .go_xmote_th = rgrp_go_sync,
412 .go_inval = rgrp_go_inval, 414 .go_inval = rgrp_go_inval,
413 .go_demote_ok = rgrp_go_demote_ok,
414 .go_lock = rgrp_go_lock, 415 .go_lock = rgrp_go_lock,
415 .go_unlock = rgrp_go_unlock, 416 .go_unlock = rgrp_go_unlock,
416 .go_dump = gfs2_rgrp_dump, 417 .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c06275..870a89d6d4dc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/dlm.h> 16#include <linux/dlm.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h>
18 20
19#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
20#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
@@ -201,7 +203,7 @@ enum {
201}; 203};
202 204
203struct gfs2_glock { 205struct gfs2_glock {
204 struct hlist_node gl_list; 206 struct hlist_bl_node gl_list;
205 unsigned long gl_flags; /* GLF_... */ 207 unsigned long gl_flags; /* GLF_... */
206 struct lm_lockname gl_name; 208 struct lm_lockname gl_name;
207 atomic_t gl_ref; 209 atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
234 atomic_t gl_ail_count; 236 atomic_t gl_ail_count;
235 struct delayed_work gl_work; 237 struct delayed_work gl_work;
236 struct work_struct gl_delete; 238 struct work_struct gl_delete;
239 struct rcu_head gl_rcu;
237}; 240};
238 241
239#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 242#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
314 QDF_USER = 0, 317 QDF_USER = 0,
315 QDF_CHANGE = 1, 318 QDF_CHANGE = 1,
316 QDF_LOCKED = 2, 319 QDF_LOCKED = 2,
320 QDF_REFRESH = 3,
317}; 321};
318 322
319struct gfs2_quota_data { 323struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
647 unsigned int sd_log_flush_head; 651 unsigned int sd_log_flush_head;
648 u64 sd_log_flush_wrapped; 652 u64 sd_log_flush_wrapped;
649 653
654 spinlock_t sd_ail_lock;
650 struct list_head sd_ail1_list; 655 struct list_head sd_ail1_list;
651 struct list_head sd_ail2_list; 656 struct list_head sd_ail2_list;
652 u64 sd_ail_sync_gen; 657 u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7aa7d4f8984a..97d54a28776a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -763,14 +763,15 @@ fail:
763 return error; 763 return error;
764} 764}
765 765
766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) 766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
767 const struct qstr *qstr)
767{ 768{
768 int err; 769 int err;
769 size_t len; 770 size_t len;
770 void *value; 771 void *value;
771 char *name; 772 char *name;
772 773
773 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 774 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
774 &name, &value, &len); 775 &name, &value, &len);
775 776
776 if (err) { 777 if (err) {
@@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
854 if (error) 855 if (error)
855 goto fail_gunlock2; 856 goto fail_gunlock2;
856 857
857 error = gfs2_security_init(dip, GFS2_I(inode)); 858 error = gfs2_security_init(dip, GFS2_I(inode), name);
858 if (error) 859 if (error)
859 goto fail_gunlock2; 860 goto fail_gunlock2;
860 861
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f8..98c80d8c2a62 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
22{ 22{
23 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
24 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
26 25
27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 26 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
28 27
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
31 30
32 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
34 if (gl->gl_ops->go_flags & GLOF_ASPACE) 33 gfs2_glock_free(gl);
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
40 return; 34 return;
41 case -DLM_ECANCEL: /* Cancel while getting lock */ 35 case -DLM_ECANCEL: /* Cancel while getting lock */
42 ret |= LM_OUT_CANCELED; 36 ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 158 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
165} 159}
166 160
167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 161static void gdlm_put_lock(struct gfs2_glock *gl)
168{ 162{
169 struct gfs2_sbd *sdp = gl->gl_sbd; 163 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 164 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 165 int error;
172 166
173 if (gl->gl_lksb.sb_lkid == 0) { 167 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 168 gfs2_glock_free(gl);
175 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
176 wake_up(&sdp->sd_glock_wait);
177 return; 169 return;
178 } 170 }
179 171
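With the DLM-specific freeing removed, both the sb_lkid == 0 path and the -DLM_EUNLOCK completion now go through gfs2_glock_free(), whose body is not shown in these hunks. Judging from the gl_rcu field added to struct gfs2_glock in incore.h and the rcu_barrier() added to module exit in main.c, it presumably defers the kmem_cache_free() through call_rcu() and keeps the sd_glock_disposal accounting that used to be open-coded here. A sketch under that assumption, not the literal definition from glock.c:

/*
 * Assumed shape of gfs2_glock_free(): free the glock only after an RCU
 * grace period so lockless hash walkers never touch freed memory, then
 * drop the per-sb disposal count that unmount waits on.
 */
static void gfs2_glock_dealloc(struct rcu_head *rcu)
{
	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);

	if (gl->gl_ops->go_flags & GLOF_ASPACE)
		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
	else
		kmem_cache_free(gfs2_glock_cachep, gl);
}

void gfs2_glock_free(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
		wake_up(&sdp->sd_glock_wait);
}

This is also why exit_gfs2_fs() in the main.c hunk below calls rcu_barrier() before destroying the caches: any call_rcu() callbacks still in flight must complete before their kmem_caches go away.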
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..e7ed31f858dd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
67 * @mapping: The associated mapping (maybe NULL) 67 * @mapping: The associated mapping (maybe NULL)
68 * @bd: The gfs2_bufdata to remove 68 * @bd: The gfs2_bufdata to remove
69 * 69 *
70 * The log lock _must_ be held when calling this function 70 * The ail lock _must_ be held when calling this function
71 * 71 *
72 */ 72 */
73 73
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
88 */ 88 */
89 89
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
91__releases(&sdp->sd_log_lock) 91__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_log_lock) 92__acquires(&sdp->sd_ail_lock)
93{ 93{
94 struct gfs2_bufdata *bd, *s; 94 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 95 struct buffer_head *bh;
@@ -117,7 +117,7 @@ __acquires(&sdp->sd_log_lock)
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 118
119 get_bh(bh); 119 get_bh(bh);
120 gfs2_log_unlock(sdp); 120 spin_unlock(&sdp->sd_ail_lock);
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
@@ -126,7 +126,7 @@ __acquires(&sdp->sd_log_lock)
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
128 } 128 }
129 gfs2_log_lock(sdp); 129 spin_lock(&sdp->sd_ail_lock);
130 130
131 retry = 1; 131 retry = 1;
132 break; 132 break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
175 struct gfs2_ail *ai; 175 struct gfs2_ail *ai;
176 int done = 0; 176 int done = 0;
177 177
178 gfs2_log_lock(sdp); 178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list; 179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) { 180 if (list_empty(head)) {
181 gfs2_log_unlock(sdp); 181 spin_unlock(&sdp->sd_ail_lock);
182 return; 182 return;
183 } 183 }
184 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
189 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
190 continue; 190 continue;
191 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ 192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0; 193 done = 0;
194 break; 194 break;
195 } 195 }
196 } 196 }
197 197
198 gfs2_log_unlock(sdp); 198 spin_unlock(&sdp->sd_ail_lock);
199} 199}
200 200
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
203 struct gfs2_ail *ai, *s; 203 struct gfs2_ail *ai, *s;
204 int ret; 204 int ret;
205 205
206 gfs2_log_lock(sdp); 206 spin_lock(&sdp->sd_ail_lock);
207 207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 209 if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214 214
215 ret = list_empty(&sdp->sd_ail1_list); 215 ret = list_empty(&sdp->sd_ail1_list);
216 216
217 gfs2_log_unlock(sdp); 217 spin_unlock(&sdp->sd_ail_lock);
218 218
219 return ret; 219 return ret;
220} 220}
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
247 int wrap = (new_tail < old_tail); 247 int wrap = (new_tail < old_tail);
248 int a, b, rm; 248 int a, b, rm;
249 249
250 gfs2_log_lock(sdp); 250 spin_lock(&sdp->sd_ail_lock);
251 251
252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { 252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
253 a = (old_tail <= ai->ai_first); 253 a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
263 kfree(ai); 263 kfree(ai);
264 } 264 }
265 265
266 gfs2_log_unlock(sdp); 266 spin_unlock(&sdp->sd_ail_lock);
267} 267}
268 268
269/** 269/**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
421 struct gfs2_ail *ai; 421 struct gfs2_ail *ai;
422 unsigned int tail; 422 unsigned int tail;
423 423
424 gfs2_log_lock(sdp); 424 spin_lock(&sdp->sd_ail_lock);
425 425
426 if (list_empty(&sdp->sd_ail1_list)) { 426 if (list_empty(&sdp->sd_ail1_list)) {
427 tail = sdp->sd_log_head; 427 tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
430 tail = ai->ai_first; 430 tail = ai->ai_first;
431 } 431 }
432 432
433 gfs2_log_unlock(sdp); 433 spin_unlock(&sdp->sd_ail_lock);
434 434
435 return tail; 435 return tail;
436} 436}
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
743 sdp->sd_log_commited_databuf = 0; 743 sdp->sd_log_commited_databuf = 0;
744 sdp->sd_log_commited_revoke = 0; 744 sdp->sd_log_commited_revoke = 0;
745 745
746 spin_lock(&sdp->sd_ail_lock);
746 if (!list_empty(&ai->ai_ail1_list)) { 747 if (!list_empty(&ai->ai_ail1_list)) {
747 list_add(&ai->ai_list, &sdp->sd_ail1_list); 748 list_add(&ai->ai_list, &sdp->sd_ail1_list);
748 ai = NULL; 749 ai = NULL;
749 } 750 }
751 spin_unlock(&sdp->sd_ail_lock);
750 gfs2_log_unlock(sdp); 752 gfs2_log_unlock(sdp);
751 trace_gfs2_log_flush(sdp, 0); 753 trace_gfs2_log_flush(sdp, 0);
752 up_write(&sdp->sd_log_flush_lock); 754 up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..e919abf25ecd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 /* If this buffer is in the AIL and it has already been written 51 /* If this buffer is in the AIL and it has already been written
52 * to in-place disk block, remove it from the AIL. 52 * to in-place disk block, remove it from the AIL.
53 */ 53 */
54 spin_lock(&sdp->sd_ail_lock);
54 if (bd->bd_ail) 55 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 56 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
57 spin_unlock(&sdp->sd_ail_lock);
56 get_bh(bh); 58 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned); 59 atomic_inc(&sdp->sd_log_pinned);
58 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
80 mark_buffer_dirty(bh); 82 mark_buffer_dirty(bh);
81 clear_buffer_pinned(bh); 83 clear_buffer_pinned(bh);
82 84
83 gfs2_log_lock(sdp); 85 spin_lock(&sdp->sd_ail_lock);
84 if (bd->bd_ail) { 86 if (bd->bd_ail) {
85 list_del(&bd->bd_ail_st_list); 87 list_del(&bd->bd_ail_st_list);
86 brelse(bh); 88 brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
91 } 93 }
92 bd->bd_ail = ai; 94 bd->bd_ail = ai;
93 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
94 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 96 spin_unlock(&sdp->sd_ail_lock);
97
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
95 trace_gfs2_pin(bd, 0); 100 trace_gfs2_pin(bd, 0);
96 gfs2_log_unlock(sdp);
97 unlock_buffer(bh); 101 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned); 102 atomic_dec(&sdp->sd_log_pinned);
99} 103}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 85ba027d1c4d..888a5f5a1a58 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h>
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18 20
19#include "gfs2.h" 21#include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
45{ 47{
46 struct gfs2_glock *gl = foo; 48 struct gfs2_glock *gl = foo;
47 49
48 INIT_HLIST_NODE(&gl->gl_list); 50 INIT_HLIST_BL_NODE(&gl->gl_list);
49 spin_lock_init(&gl->gl_spin); 51 spin_lock_init(&gl->gl_spin);
50 INIT_LIST_HEAD(&gl->gl_holders); 52 INIT_LIST_HEAD(&gl->gl_holders);
51 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
@@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
59 struct address_space *mapping = (struct address_space *)(gl + 1); 61 struct address_space *mapping = (struct address_space *)(gl + 1);
60 62
61 gfs2_init_glock_once(gl); 63 gfs2_init_glock_once(gl);
62 memset(mapping, 0, sizeof(*mapping)); 64 address_space_init_once(mapping);
63 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
64 spin_lock_init(&mapping->tree_lock);
65 spin_lock_init(&mapping->i_mmap_lock);
66 INIT_LIST_HEAD(&mapping->private_list);
67 spin_lock_init(&mapping->private_lock);
68 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
69 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
70} 65}
71 66
72/** 67/**
@@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
198 unregister_filesystem(&gfs2meta_fs_type); 193 unregister_filesystem(&gfs2meta_fs_type);
199 destroy_workqueue(gfs_recovery_wq); 194 destroy_workqueue(gfs_recovery_wq);
200 195
196 rcu_barrier();
197
201 kmem_cache_destroy(gfs2_quotad_cachep); 198 kmem_cache_destroy(gfs2_quotad_cachep);
202 kmem_cache_destroy(gfs2_rgrpd_cachep); 199 kmem_cache_destroy(gfs2_rgrpd_cachep);
203 kmem_cache_destroy(gfs2_bufdata_cachep); 200 kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..01d97f486553 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -326,6 +326,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
326 brelse(bh); 326 brelse(bh);
327 } 327 }
328 if (bd) { 328 if (bd) {
329 spin_lock(&sdp->sd_ail_lock);
329 if (bd->bd_ail) { 330 if (bd->bd_ail) {
330 gfs2_remove_from_ail(bd); 331 gfs2_remove_from_ail(bd);
331 bh->b_private = NULL; 332 bh->b_private = NULL;
@@ -333,6 +334,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
333 bd->bd_blkno = bh->b_blocknr; 334 bd->bd_blkno = bh->b_blocknr;
334 gfs2_trans_add_revoke(sdp, bd); 335 gfs2_trans_add_revoke(sdp, bd);
335 } 336 }
337 spin_unlock(&sdp->sd_ail_lock);
336 } 338 }
337 clear_buffer_dirty(bh); 339 clear_buffer_dirty(bh);
338 clear_buffer_uptodate(bh); 340 clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f79..42ef24355afb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
99 99
100 init_waitqueue_head(&sdp->sd_log_waitq); 100 init_waitqueue_head(&sdp->sd_log_waitq);
101 init_waitqueue_head(&sdp->sd_logd_waitq); 101 init_waitqueue_head(&sdp->sd_logd_waitq);
102 spin_lock_init(&sdp->sd_ail_lock);
102 INIT_LIST_HEAD(&sdp->sd_ail1_list); 103 INIT_LIST_HEAD(&sdp->sd_ail1_list);
103 INIT_LIST_HEAD(&sdp->sd_ail2_list); 104 INIT_LIST_HEAD(&sdp->sd_ail2_list);
104 105
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
928 { Opt_err, NULL }, 929 { Opt_err, NULL },
929}; 930};
930 931
931static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
932{
933 struct gfs2_sbd *sdp = gl->gl_sbd;
934 kmem_cache_free(cachep, gl);
935 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
936 wake_up(&sdp->sd_glock_wait);
937}
938
939static const struct lm_lockops nolock_ops = { 932static const struct lm_lockops nolock_ops = {
940 .lm_proto_name = "lock_nolock", 933 .lm_proto_name = "lock_nolock",
941 .lm_put_lock = nolock_put_lock, 934 .lm_put_lock = gfs2_glock_free,
942 .lm_tokens = &nolock_tokens, 935 .lm_tokens = &nolock_tokens,
943}; 936};
944 937
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20b..09e436a50723 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1026 1026
1027/** 1027/**
1028 * gfs2_permission - 1028 * gfs2_permission -
1029 * @inode: 1029 * @inode: The inode
1030 * @mask: 1030 * @mask: The mask to be tested
1031 * @nd: passed from Linux VFS, ignored by us 1031 * @flags: Indicates whether this is an RCU path walk or not
1032 * 1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the 1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only 1034 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1044 int error; 1044 int error;
1045 int unlock = 0; 1045 int unlock = 0;
1046 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049 1047
1050 ip = GFS2_I(inode); 1048 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
1054 return error; 1054 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963de..e23d9864c418 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
834 goto out_end_trans; 834 goto out_end_trans;
835 835
836 do_qc(qd, -qd->qd_change_sync); 836 do_qc(qd, -qd->qd_change_sync);
837 set_bit(QDF_REFRESH, &qd->qd_flags);
837 } 838 }
838 839
839 error = 0; 840 error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
929{ 930{
930 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 931 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
931 struct gfs2_alloc *al = ip->i_alloc; 932 struct gfs2_alloc *al = ip->i_alloc;
933 struct gfs2_quota_data *qd;
932 unsigned int x; 934 unsigned int x;
933 int error = 0; 935 int error = 0;
934 936
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
942 sort_qd, NULL); 944 sort_qd, NULL);
943 945
944 for (x = 0; x < al->al_qd_num; x++) { 946 for (x = 0; x < al->al_qd_num; x++) {
945 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); 947 int force = NO_FORCE;
948 qd = al->al_qd[x];
949 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
950 force = FORCE;
951 error = do_glock(qd, force, &al->al_qd_ghs[x]);
946 if (error) 952 if (error)
947 break; 953 break;
948 } 954 }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1587 1593
1588 offset = qd2offset(qd); 1594 offset = qd2offset(qd);
1589 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); 1595 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1596 if (gfs2_is_stuffed(ip))
1597 alloc_required = 1;
1590 if (alloc_required) { 1598 if (alloc_required) {
1591 al = gfs2_alloc_get(ip); 1599 al = gfs2_alloc_get(ip);
1592 if (al == NULL) 1600 if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1600 blocks += gfs2_rg_blocks(al); 1608 blocks += gfs2_rg_blocks(al);
1601 } 1609 }
1602 1610
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1611 /* Some quotas span block boundaries and can update two blocks,
 1612 so add an extra block to the transaction to handle such quotas */

1613 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
1604 if (error) 1614 if (error)
1605 goto out_release; 1615 goto out_release;
1606 1616
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020c..cf930cd9664a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
1602 * 1602 *
1603 */ 1603 */
1604 1604
1605void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) 1605void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1606{ 1606{
1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1608 struct gfs2_rgrpd *rgd; 1608 struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1618
1619 gfs2_trans_add_rg(rgd); 1619 gfs2_trans_add_rg(rgd);
1620}
1620 1621
1622/**
1623 * gfs2_free_data - free a contiguous run of data block(s)
1624 * @ip: the inode these blocks are being freed from
1625 * @bstart: first block of a run of contiguous blocks
1626 * @blen: the length of the block run
1627 *
1628 */
1629
1630void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1631{
1632 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1633
1634 __gfs2_free_data(ip, bstart, blen);
1621 gfs2_statfs_change(sdp, 0, +blen, 0); 1635 gfs2_statfs_change(sdp, 0, +blen, 0);
1622 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1636 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1623} 1637}
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1630 * 1644 *
1631 */ 1645 */
1632 1646
1633void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) 1647void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1634{ 1648{
1635 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1649 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1636 struct gfs2_rgrpd *rgd; 1650 struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1645 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1659 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1646 1660
1647 gfs2_trans_add_rg(rgd); 1661 gfs2_trans_add_rg(rgd);
1662 gfs2_meta_wipe(ip, bstart, blen);
1663}
1648 1664
1665/**
 1666 * gfs2_free_meta - free a contiguous run of metadata block(s)
1667 * @ip: the inode these blocks are being freed from
1668 * @bstart: first block of a run of contiguous blocks
1669 * @blen: the length of the block run
1670 *
1671 */
1672
1673void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1674{
1675 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1676
1677 __gfs2_free_meta(ip, bstart, blen);
1649 gfs2_statfs_change(sdp, 0, +blen, 0); 1678 gfs2_statfs_change(sdp, 0, +blen, 0);
1650 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1651 gfs2_meta_wipe(ip, bstart, blen);
1652} 1680}
1653 1681
1654void gfs2_unlink_di(struct inode *inode) 1682void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369c..a80e3034ac47 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 54
55extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
55extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 56extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
56extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 58extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 59extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
58extern void gfs2_unlink_di(struct inode *inode); 60extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
238} 238}
239 239
240/* 240/*
241 * hfs_unlink() 241 * hfs_remove()
242 * 242 *
243 * This is the unlink() entry in the inode_operations structure for 243 * This serves as both unlink() and rmdir() in the inode_operations
244 * regular HFS directories. The purpose is to delete an existing 244 * structure for regular HFS directories. The purpose is to delete
245 * file, given the inode for the parent directory and the name 245 * an existing child, given the inode for the parent directory and
246 * (and its length) of the existing file. 246 * the name (and its length) of the existing directory.
247 */
248static int hfs_unlink(struct inode *dir, struct dentry *dentry)
249{
250 struct inode *inode;
251 int res;
252
253 inode = dentry->d_inode;
254 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
255 if (res)
256 return res;
257
258 drop_nlink(inode);
259 hfs_delete_inode(inode);
260 inode->i_ctime = CURRENT_TIME_SEC;
261 mark_inode_dirty(inode);
262
263 return res;
264}
265
266/*
267 * hfs_rmdir()
268 * 247 *
269 * This is the rmdir() entry in the inode_operations structure for 248 * HFS does not have hardlinks, so both rmdir and unlink set the
270 * regular HFS directories. The purpose is to delete an existing 249 * link count to 0. The only difference is the emptiness check.
271 * directory, given the inode for the parent directory and the name
272 * (and its length) of the existing directory.
273 */ 250 */
274static int hfs_rmdir(struct inode *dir, struct dentry *dentry) 251static int hfs_remove(struct inode *dir, struct dentry *dentry)
275{ 252{
276 struct inode *inode; 253 struct inode *inode = dentry->d_inode;
277 int res; 254 int res;
278 255
279 inode = dentry->d_inode; 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
280 if (inode->i_size != 2)
281 return -ENOTEMPTY; 257 return -ENOTEMPTY;
282 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
283 if (res) 259 if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
307 283
308 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
309 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
310 res = hfs_unlink(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
311 if (res) 287 if (res)
312 return res; 288 return res;
313 } 289 }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
332const struct inode_operations hfs_dir_inode_operations = { 308const struct inode_operations hfs_dir_inode_operations = {
333 .create = hfs_create, 309 .create = hfs_create,
334 .lookup = hfs_lookup, 310 .lookup = hfs_lookup,
335 .unlink = hfs_unlink, 311 .unlink = hfs_remove,
336 .mkdir = hfs_mkdir, 312 .mkdir = hfs_mkdir,
337 .rmdir = hfs_rmdir, 313 .rmdir = hfs_remove,
338 .rename = hfs_rename, 314 .rename = hfs_rename,
339 .setattr = hfs_inode_setattr, 315 .setattr = hfs_inode_setattr,
340}; 316};
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f5632318..0c39dc3ef7d7 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix 4 depends on BROKEN || !PREEMPT
5 help 5 help
6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
7 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f7..b3d7c0ddb609 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
6 * directory VFS functions 6 * directory VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
13static int hpfs_dir_release(struct inode *inode, struct file *filp) 12static int hpfs_dir_release(struct inode *inode, struct file *filp)
14{ 13{
15 lock_kernel(); 14 hpfs_lock(inode->i_sb);
16 hpfs_del_pos(inode, &filp->f_pos); 15 hpfs_del_pos(inode, &filp->f_pos);
17 /*hpfs_write_if_changed(inode);*/ 16 /*hpfs_write_if_changed(inode);*/
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
30 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 29 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
31 struct super_block *s = i->i_sb; 30 struct super_block *s = i->i_sb;
32 31
33 lock_kernel(); 32 hpfs_lock(s);
34 33
35 /*printk("dir lseek\n");*/ 34 /*printk("dir lseek\n");*/
36 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 35 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
43 } 42 }
44 mutex_unlock(&i->i_mutex); 43 mutex_unlock(&i->i_mutex);
45ok: 44ok:
46 unlock_kernel(); 45 hpfs_unlock(s);
47 return filp->f_pos = new_off; 46 return filp->f_pos = new_off;
48fail: 47fail:
49 mutex_unlock(&i->i_mutex); 48 mutex_unlock(&i->i_mutex);
50 /*printk("illegal lseek: %016llx\n", new_off);*/ 49 /*printk("illegal lseek: %016llx\n", new_off);*/
51 unlock_kernel(); 50 hpfs_unlock(s);
52 return -ESPIPE; 51 return -ESPIPE;
53} 52}
54 53
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
64 int c1, c2 = 0; 63 int c1, c2 = 0;
65 int ret = 0; 64 int ret = 0;
66 65
67 lock_kernel(); 66 hpfs_lock(inode->i_sb);
68 67
69 if (hpfs_sb(inode->i_sb)->sb_chk) { 68 if (hpfs_sb(inode->i_sb)->sb_chk) {
70 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { 69 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
167 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
168 } 167 }
169out: 168out:
170 unlock_kernel(); 169 hpfs_unlock(inode->i_sb);
171 return ret; 170 return ret;
172} 171}
173 172
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct inode *result = NULL; 196 struct inode *result = NULL;
198 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
199 198
200 lock_kernel(); 199 hpfs_lock(dir->i_sb);
201 if ((err = hpfs_chk_name(name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
202 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
203 unlock_kernel(); 202 hpfs_unlock(dir->i_sb);
204 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
205 } 204 }
206 goto end_add; 205 goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 297
299 end: 298 end:
300 end_add: 299 end_add:
301 unlock_kernel(); 300 hpfs_unlock(dir->i_sb);
302 d_add(dentry, result); 301 d_add(dentry, result);
303 return NULL; 302 return NULL;
304 303
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
311 310
312 /*bail:*/ 311 /*bail:*/
313 312
314 unlock_kernel(); 313 hpfs_unlock(dir->i_sb);
315 return ERR_PTR(-ENOENT); 314 return ERR_PTR(-ENOENT);
316} 315}
317 316
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..2dbae20450f8 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
6 * file VFS functions 6 * file VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12#define BLOCKS(size) (((size) + 511) >> 9) 11#define BLOCKS(size) (((size) + 511) >> 9)
13 12
14static int hpfs_file_release(struct inode *inode, struct file *file) 13static int hpfs_file_release(struct inode *inode, struct file *file)
15{ 14{
16 lock_kernel(); 15 hpfs_lock(inode->i_sb);
17 hpfs_write_if_changed(inode); 16 hpfs_write_if_changed(inode);
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
49static void hpfs_truncate(struct inode *i) 48static void hpfs_truncate(struct inode *i)
50{ 49{
51 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 50 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
52 lock_kernel(); 51 hpfs_lock(i->i_sb);
53 hpfs_i(i)->i_n_secs = 0; 52 hpfs_i(i)->i_n_secs = 0;
54 i->i_blocks = 1 + ((i->i_size + 511) >> 9); 53 i->i_blocks = 1 + ((i->i_size + 511) >> 9);
55 hpfs_i(i)->mmu_private = i->i_size; 54 hpfs_i(i)->mmu_private = i->i_size;
56 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); 55 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
57 hpfs_write_inode(i); 56 hpfs_write_inode(i);
58 hpfs_i(i)->i_n_secs = 0; 57 hpfs_i(i)->i_n_secs = 0;
59 unlock_kernel(); 58 hpfs_unlock(i->i_sb);
60} 59}
61 60
62static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 61static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e8..c15adbca07ff 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
342 extern struct timezone sys_tz; 342 extern struct timezone sys_tz;
343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; 343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
344} 344}
345
346/*
347 * Locking:
348 *
349 * hpfs_lock() is a leftover from the big kernel lock.
350 * Right now, these functions are empty and only left
351 * for documentation purposes. The file system no longer
352 * works on SMP systems, so the lock is not needed
353 * any more.
354 *
355 * If someone is interested in making it work again, this
356 * would be the place to start by adding a per-superblock
357 * mutex and fixing all the bugs and performance issues
358 * caused by that.
359 */
360static inline void hpfs_lock(struct super_block *s)
361{
362}
363
364static inline void hpfs_unlock(struct super_block *s)
365{
366}
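The comment above points at a per-superblock mutex as the starting point for making HPFS concurrency-safe again. Purely as a hypothetical sketch — none of this is in the patch, and the sb_lock field would have to be added to struct hpfs_sb_info and initialised at mount time — the stubs could be backed like this:

#include <linux/mutex.h>

/* Hypothetical: sb_lock is not an existing hpfs_sb_info member. */
static inline void hpfs_lock(struct super_block *s)
{
	mutex_lock(&hpfs_sb(s)->sb_lock);
}

static inline void hpfs_unlock(struct super_block *s)
{
	mutex_unlock(&hpfs_sb(s)->sb_lock);
}

The call sites converted from lock_kernel()/unlock_kernel() in the dir.c, file.c, inode.c and namei.c hunks would then pick up real exclusion without further changes.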
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539e..87f1f787e767 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
6 * inode VFS functions 6 * inode VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
267 struct inode *inode = dentry->d_inode; 266 struct inode *inode = dentry->d_inode;
268 int error = -EINVAL; 267 int error = -EINVAL;
269 268
270 lock_kernel(); 269 hpfs_lock(inode->i_sb);
271 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 270 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
272 goto out_unlock; 271 goto out_unlock;
273 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 272 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
290 hpfs_write_inode(inode); 289 hpfs_write_inode(inode);
291 290
292 out_unlock: 291 out_unlock:
293 unlock_kernel(); 292 hpfs_unlock(inode->i_sb);
294 return error; 293 return error;
295} 294}
296 295
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
307 truncate_inode_pages(&inode->i_data, 0); 306 truncate_inode_pages(&inode->i_data, 0);
308 end_writeback(inode); 307 end_writeback(inode);
309 if (!inode->i_nlink) { 308 if (!inode->i_nlink) {
310 lock_kernel(); 309 hpfs_lock(inode->i_sb);
311 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 310 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel(); 311 hpfs_unlock(inode->i_sb);
313 } 312 }
314} 313}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc4..d5f8c8a19023 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
6 * adding & removing files & directories 6 * adding & removing files & directories
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 11static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
25 struct hpfs_dirent dee; 24 struct hpfs_dirent dee;
26 int err; 25 int err;
27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 26 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 27 hpfs_lock(dir->i_sb);
29 err = -ENOSPC; 28 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 29 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
31 if (!fnode) 30 if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
103 } 102 }
104 d_instantiate(dentry, result); 103 d_instantiate(dentry, result);
105 mutex_unlock(&hpfs_i(dir)->i_mutex); 104 mutex_unlock(&hpfs_i(dir)->i_mutex);
106 unlock_kernel(); 105 hpfs_unlock(dir->i_sb);
107 return 0; 106 return 0;
108bail3: 107bail3:
109 mutex_unlock(&hpfs_i(dir)->i_mutex); 108 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
115 brelse(bh); 114 brelse(bh);
116 hpfs_free_sectors(dir->i_sb, fno, 1); 115 hpfs_free_sectors(dir->i_sb, fno, 1);
117bail: 116bail:
118 unlock_kernel(); 117 hpfs_unlock(dir->i_sb);
119 return err; 118 return err;
120} 119}
121 120
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
132 int err; 131 int err;
133 if ((err = hpfs_chk_name(name, &len))) 132 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 133 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 134 hpfs_lock(dir->i_sb);
136 err = -ENOSPC; 135 err = -ENOSPC;
137 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 136 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
138 if (!fnode) 137 if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
195 } 194 }
196 d_instantiate(dentry, result); 195 d_instantiate(dentry, result);
197 mutex_unlock(&hpfs_i(dir)->i_mutex); 196 mutex_unlock(&hpfs_i(dir)->i_mutex);
198 unlock_kernel(); 197 hpfs_unlock(dir->i_sb);
199 return 0; 198 return 0;
200 199
201bail2: 200bail2:
@@ -205,7 +204,7 @@ bail1:
205 brelse(bh); 204 brelse(bh);
206 hpfs_free_sectors(dir->i_sb, fno, 1); 205 hpfs_free_sectors(dir->i_sb, fno, 1);
207bail: 206bail:
208 unlock_kernel(); 207 hpfs_unlock(dir->i_sb);
209 return err; 208 return err;
210} 209}
211 210
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 223 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 224 if (!new_valid_dev(rdev))
226 return -EINVAL; 225 return -EINVAL;
227 lock_kernel(); 226 hpfs_lock(dir->i_sb);
228 err = -ENOSPC; 227 err = -ENOSPC;
229 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 228 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
230 if (!fnode) 229 if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
274 d_instantiate(dentry, result); 273 d_instantiate(dentry, result);
275 mutex_unlock(&hpfs_i(dir)->i_mutex); 274 mutex_unlock(&hpfs_i(dir)->i_mutex);
276 brelse(bh); 275 brelse(bh);
277 unlock_kernel(); 276 hpfs_unlock(dir->i_sb);
278 return 0; 277 return 0;
279bail2: 278bail2:
280 mutex_unlock(&hpfs_i(dir)->i_mutex); 279 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
283 brelse(bh); 282 brelse(bh);
284 hpfs_free_sectors(dir->i_sb, fno, 1); 283 hpfs_free_sectors(dir->i_sb, fno, 1);
285bail: 284bail:
286 unlock_kernel(); 285 hpfs_unlock(dir->i_sb);
287 return err; 286 return err;
288} 287}
289 288
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
299 struct inode *result; 298 struct inode *result;
300 int err; 299 int err;
301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 300 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 301 hpfs_lock(dir->i_sb);
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 302 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 303 hpfs_unlock(dir->i_sb);
305 return -EPERM; 304 return -EPERM;
306 } 305 }
307 err = -ENOSPC; 306 err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
354 hpfs_write_inode_nolock(result); 353 hpfs_write_inode_nolock(result);
355 d_instantiate(dentry, result); 354 d_instantiate(dentry, result);
356 mutex_unlock(&hpfs_i(dir)->i_mutex); 355 mutex_unlock(&hpfs_i(dir)->i_mutex);
357 unlock_kernel(); 356 hpfs_unlock(dir->i_sb);
358 return 0; 357 return 0;
359bail2: 358bail2:
360 mutex_unlock(&hpfs_i(dir)->i_mutex); 359 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
363 brelse(bh); 362 brelse(bh);
364 hpfs_free_sectors(dir->i_sb, fno, 1); 363 hpfs_free_sectors(dir->i_sb, fno, 1);
365bail: 364bail:
366 unlock_kernel(); 365 hpfs_unlock(dir->i_sb);
367 return err; 366 return err;
368} 367}
369 368
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
380 int rep = 0; 379 int rep = 0;
381 int err; 380 int err;
382 381
383 lock_kernel(); 382 hpfs_lock(dir->i_sb);
384 hpfs_adjust_length(name, &len); 383 hpfs_adjust_length(name, &len);
385again: 384again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 385 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
416 dentry_unhash(dentry); 415 dentry_unhash(dentry);
417 if (!d_unhashed(dentry)) { 416 if (!d_unhashed(dentry)) {
418 dput(dentry); 417 dput(dentry);
419 unlock_kernel(); 418 hpfs_unlock(dir->i_sb);
420 return -ENOSPC; 419 return -ENOSPC;
421 } 420 }
422 if (generic_permission(inode, MAY_WRITE, 0, NULL) || 421 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
435 if (!err) 434 if (!err)
436 goto again; 435 goto again;
437 } 436 }
438 unlock_kernel(); 437 hpfs_unlock(dir->i_sb);
439 return -ENOSPC; 438 return -ENOSPC;
440 default: 439 default:
441 drop_nlink(inode); 440 drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
448out: 447out:
449 mutex_unlock(&hpfs_i(dir)->i_mutex); 448 mutex_unlock(&hpfs_i(dir)->i_mutex);
450 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 449 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
451 unlock_kernel(); 450 hpfs_unlock(dir->i_sb);
452 return err; 451 return err;
453} 452}
454 453
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
466 int r; 465 int r;
467 466
468 hpfs_adjust_length(name, &len); 467 hpfs_adjust_length(name, &len);
469 lock_kernel(); 468 hpfs_lock(dir->i_sb);
470 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 469 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
471 mutex_lock(&hpfs_i(dir)->i_mutex); 470 mutex_lock(&hpfs_i(dir)->i_mutex);
472 err = -ENOENT; 471 err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
508out: 507out:
509 mutex_unlock(&hpfs_i(dir)->i_mutex); 508 mutex_unlock(&hpfs_i(dir)->i_mutex);
510 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 509 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
511 unlock_kernel(); 510 hpfs_unlock(dir->i_sb);
512 return err; 511 return err;
513} 512}
514 513
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
521 int err; 520 int err;
522 521
523 err = -EIO; 522 err = -EIO;
524 lock_kernel(); 523 hpfs_lock(i->i_sb);
525 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) 524 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
526 goto fail; 525 goto fail;
527 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); 526 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
528 brelse(bh); 527 brelse(bh);
529 if (err) 528 if (err)
530 goto fail; 529 goto fail;
531 unlock_kernel(); 530 hpfs_unlock(i->i_sb);
532 SetPageUptodate(page); 531 SetPageUptodate(page);
533 kunmap(page); 532 kunmap(page);
534 unlock_page(page); 533 unlock_page(page);
535 return 0; 534 return 0;
536 535
537fail: 536fail:
538 unlock_kernel(); 537 hpfs_unlock(i->i_sb);
539 SetPageError(page); 538 SetPageError(page);
540 kunmap(page); 539 kunmap(page);
541 unlock_page(page); 540 unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
567 err = 0; 566 err = 0;
568 hpfs_adjust_length(old_name, &old_len); 567 hpfs_adjust_length(old_name, &old_len);
569 568
570 lock_kernel(); 569 hpfs_lock(i->i_sb);
571 /* order doesn't matter, due to VFS exclusion */ 570 /* order doesn't matter, due to VFS exclusion */
572 mutex_lock(&hpfs_i(i)->i_parent_mutex); 571 mutex_lock(&hpfs_i(i)->i_parent_mutex);
573 if (new_inode) 572 if (new_inode)
@@ -659,7 +658,7 @@ end1:
659 mutex_unlock(&hpfs_i(i)->i_parent_mutex); 658 mutex_unlock(&hpfs_i(i)->i_parent_mutex);
660 if (new_inode) 659 if (new_inode)
661 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); 660 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
662 unlock_kernel(); 661 hpfs_unlock(i->i_sb);
663 return err; 662 return err;
664} 663}
665 664
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc97..c89b40808587 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
19 18
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
103{ 102{
104 struct hpfs_sb_info *sbi = hpfs_sb(s); 103 struct hpfs_sb_info *sbi = hpfs_sb(s);
105 104
106 lock_kernel();
107
108 kfree(sbi->sb_cp_table); 105 kfree(sbi->sb_cp_table);
109 kfree(sbi->sb_bmp_dir); 106 kfree(sbi->sb_bmp_dir);
110 unmark_dirty(s); 107 unmark_dirty(s);
111 s->s_fs_info = NULL; 108 s->s_fs_info = NULL;
112 kfree(sbi); 109 kfree(sbi);
113
114 unlock_kernel();
115} 110}
116 111
117unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 112unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
143 struct super_block *s = dentry->d_sb; 138 struct super_block *s = dentry->d_sb;
144 struct hpfs_sb_info *sbi = hpfs_sb(s); 139 struct hpfs_sb_info *sbi = hpfs_sb(s);
145 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 140 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
146 lock_kernel(); 141 hpfs_lock(s);
147 142
148 /*if (sbi->sb_n_free == -1) {*/ 143 /*if (sbi->sb_n_free == -1) {*/
149 sbi->sb_n_free = count_bitmaps(s); 144 sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 buf->f_fsid.val[1] = (u32)(id >> 32); 155 buf->f_fsid.val[1] = (u32)(id >> 32);
161 buf->f_namelen = 254; 156 buf->f_namelen = 254;
162 157
163 unlock_kernel(); 158 hpfs_unlock(s);
164 159
165 return 0; 160 return 0;
166} 161}
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
406 401
407 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
408 403
409 lock_kernel(); 404 hpfs_lock(s);
410 lock_super(s); 405 lock_super(s);
411 uid = sbi->sb_uid; gid = sbi->sb_gid; 406 uid = sbi->sb_uid; gid = sbi->sb_gid;
412 umask = 0777 & ~sbi->sb_mode; 407 umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
441 replace_mount_options(s, new_opts); 436 replace_mount_options(s, new_opts);
442 437
443 unlock_super(s); 438 unlock_super(s);
444 unlock_kernel(); 439 hpfs_unlock(s);
445 return 0; 440 return 0;
446 441
447out_err: 442out_err:
448 unlock_super(s); 443 unlock_super(s);
449 unlock_kernel(); 444 hpfs_unlock(s);
450 kfree(new_opts); 445 kfree(new_opts);
451 return -EINVAL; 446 return -EINVAL;
452} 447}
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
484 479
485 int o; 480 int o;
486 481
487 lock_kernel(); 482 if (num_possible_cpus() > 1) {
483 printk(KERN_ERR "HPFS is not SMP safe\n");
484 return -EINVAL;
485 }
488 486
489 save_mount_options(s, options); 487 save_mount_options(s, options);
490 488
491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 489 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
492 if (!sbi) { 490 if (!sbi) {
493 unlock_kernel();
494 return -ENOMEM; 491 return -ENOMEM;
495 } 492 }
496 s->s_fs_info = sbi; 493 s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
677 root->i_blocks = 5; 674 root->i_blocks = 5;
678 hpfs_brelse4(&qbh); 675 hpfs_brelse4(&qbh);
679 } 676 }
680 unlock_kernel();
681 return 0; 677 return 0;
682 678
683bail4: brelse(bh2); 679bail4: brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
689 kfree(sbi->sb_cp_table); 685 kfree(sbi->sb_cp_table);
690 s->s_fs_info = NULL; 686 s->s_fs_info = NULL;
691 kfree(sbi); 687 kfree(sbi);
692 unlock_kernel();
693 return -EINVAL; 688 return -EINVAL;
694} 689}
695 690
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..9910c039f026 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly;
84DEFINE_SPINLOCK(inode_lock); 84DEFINE_SPINLOCK(inode_lock);
85 85
86/* 86/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 87 * iprune_sem provides exclusion between the icache shrinking and the
88 * icache shrinking path, and the umount path. Without this exclusion, 88 * umount path.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 * 89 *
94 * We make this an rwsem because the fastpath is icache shrinking. In 90 * We don't actually need it to protect anything in the umount path,
95 * some cases a filesystem may be doing a significant amount of work in 91 * but only need to cycle through it to make sure any inode that
96 * its inode reclaim code, so this should improve parallelism. 92 * prune_icache took off the LRU list has been fully torn down by the
93 * time we are past evict_inodes.
97 */ 94 */
98static DECLARE_RWSEM(iprune_sem); 95static DECLARE_RWSEM(iprune_sem);
99 96
@@ -295,6 +292,20 @@ static void destroy_inode(struct inode *inode)
295 call_rcu(&inode->i_rcu, i_callback); 292 call_rcu(&inode->i_rcu, i_callback);
296} 293}
297 294
295void address_space_init_once(struct address_space *mapping)
296{
297 memset(mapping, 0, sizeof(*mapping));
298 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
299 spin_lock_init(&mapping->tree_lock);
300 spin_lock_init(&mapping->i_mmap_lock);
301 INIT_LIST_HEAD(&mapping->private_list);
302 spin_lock_init(&mapping->private_lock);
303 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
304 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
305 mutex_init(&mapping->unmap_mutex);
306}
307EXPORT_SYMBOL(address_space_init_once);
308
298/* 309/*
299 * These are initializations that only need to be done 310 * These are initializations that only need to be done
300 * once, because the fields are idempotent across use 311 * once, because the fields are idempotent across use
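
Note: address_space_init_once() is exported, so code outside inode.c that embeds its own struct address_space can reuse the one-time setup instead of open-coding the radix-tree, lock and list initialisation. A hypothetical slab constructor using it (all names below are invented for illustration):

struct foo_inode_info {
	struct address_space extra_mapping;	/* hypothetical second mapping */
	struct inode vfs_inode;
};

static void foo_init_once(void *obj)
{
	struct foo_inode_info *fi = obj;

	address_space_init_once(&fi->extra_mapping);
	inode_init_once(&fi->vfs_inode);
}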
@@ -308,13 +319,7 @@ void inode_init_once(struct inode *inode)
308 INIT_LIST_HEAD(&inode->i_devices); 319 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list); 320 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru); 321 INIT_LIST_HEAD(&inode->i_lru);
311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 322 address_space_init_once(&inode->i_data);
312 spin_lock_init(&inode->i_data.tree_lock);
313 spin_lock_init(&inode->i_data.i_mmap_lock);
314 INIT_LIST_HEAD(&inode->i_data.private_list);
315 spin_lock_init(&inode->i_data.private_lock);
316 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
317 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
318 i_size_ordered_init(inode); 323 i_size_ordered_init(inode);
319#ifdef CONFIG_FSNOTIFY 324#ifdef CONFIG_FSNOTIFY
320 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 325 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -508,17 +513,12 @@ void evict_inodes(struct super_block *sb)
508 struct inode *inode, *next; 513 struct inode *inode, *next;
509 LIST_HEAD(dispose); 514 LIST_HEAD(dispose);
510 515
511 down_write(&iprune_sem);
512
513 spin_lock(&inode_lock); 516 spin_lock(&inode_lock);
514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 517 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
515 if (atomic_read(&inode->i_count)) 518 if (atomic_read(&inode->i_count))
516 continue; 519 continue;
517 520 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
519 WARN_ON(1);
520 continue; 521 continue;
521 }
522 522
523 inode->i_state |= I_FREEING; 523 inode->i_state |= I_FREEING;
524 524
@@ -534,28 +534,40 @@ void evict_inodes(struct super_block *sb)
534 spin_unlock(&inode_lock); 534 spin_unlock(&inode_lock);
535 535
536 dispose_list(&dispose); 536 dispose_list(&dispose);
537
538 /*
539 * Cycle through iprune_sem to make sure any inode that prune_icache
540 * moved off the list before we took the lock has been fully torn
541 * down.
542 */
543 down_write(&iprune_sem);
537 up_write(&iprune_sem); 544 up_write(&iprune_sem);
538} 545}
539 546
540/** 547/**
541 * invalidate_inodes - attempt to free all inodes on a superblock 548 * invalidate_inodes - attempt to free all inodes on a superblock
542 * @sb: superblock to operate on 549 * @sb: superblock to operate on
550 * @kill_dirty: flag to guide handling of dirty inodes
543 * 551 *
544 * Attempts to free all inodes for a given superblock. If there were any 552 * Attempts to free all inodes for a given superblock. If there were any
545 * busy inodes return a non-zero value, else zero. 553 * busy inodes return a non-zero value, else zero.
554 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
555 * them as busy.
546 */ 556 */
547int invalidate_inodes(struct super_block *sb) 557int invalidate_inodes(struct super_block *sb, bool kill_dirty)
548{ 558{
549 int busy = 0; 559 int busy = 0;
550 struct inode *inode, *next; 560 struct inode *inode, *next;
551 LIST_HEAD(dispose); 561 LIST_HEAD(dispose);
552 562
553 down_write(&iprune_sem);
554
555 spin_lock(&inode_lock); 563 spin_lock(&inode_lock);
556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 564 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 565 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
558 continue; 566 continue;
567 if (inode->i_state & I_DIRTY && !kill_dirty) {
568 busy = 1;
569 continue;
570 }
559 if (atomic_read(&inode->i_count)) { 571 if (atomic_read(&inode->i_count)) {
560 busy = 1; 572 busy = 1;
561 continue; 573 continue;
@@ -575,7 +587,6 @@ int invalidate_inodes(struct super_block *sb)
575 spin_unlock(&inode_lock); 587 spin_unlock(&inode_lock);
576 588
577 dispose_list(&dispose); 589 dispose_list(&dispose);
578 up_write(&iprune_sem);
579 590
580 return busy; 591 return busy;
581} 592}
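
Note: the down_write()/up_write() pair that evict_inodes() now does at the end is a pure barrier: acquiring the rwsem for write cannot complete until every prune_icache() reader that entered earlier has left, and nothing is held afterwards. A userspace analogue of that cycle-through idiom, as a sketch only (pthread API, not kernel code):

#include <pthread.h>

static pthread_rwlock_t iprune_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Pruning side: hold the lock shared while tearing unused inodes down. */
static void pruner_section(void)
{
	pthread_rwlock_rdlock(&iprune_sem);
	/* ... dispose of unused entries ... */
	pthread_rwlock_unlock(&iprune_sem);
}

/* Unmount side: returns only once all earlier pruners have finished. */
static void wait_for_pruners(void)
{
	pthread_rwlock_wrlock(&iprune_sem);
	pthread_rwlock_unlock(&iprune_sem);
}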
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..f3d15de44b15 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,10 +106,23 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 106struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 107extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 108extern void release_open_intent(struct nameidata *);
109struct open_flags {
110 int open_flag;
111 int mode;
112 int acc_mode;
113 int intent;
114};
115extern struct file *do_filp_open(int dfd, const char *pathname,
116 const struct open_flags *op, int lookup_flags);
117extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
118 const char *, const struct open_flags *, int lookup_flags);
119
120extern long do_handle_open(int mountdirfd,
121 struct file_handle __user *ufh, int open_flag);
109 122
110/* 123/*
111 * inode.c 124 * inode.c
112 */ 125 */
113extern int get_nr_dirty_inodes(void); 126extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *); 127extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *); 128extern int invalidate_inodes(struct super_block *, bool);
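
Note: struct open_flags gathers what an open(2)-style caller previously passed as loose arguments to do_filp_open(). Purely as an illustration (this is not a call site from the patch), a read-only open expressed through the new interface might look like:

struct open_flags op = {
	.open_flag = O_RDONLY,
	.mode      = 0,			/* no O_CREAT, so mode is unused */
	.acc_mode  = MAY_READ,
	.intent    = LOOKUP_OPEN,
};
struct file *filp = do_filp_open(AT_FDCWD, "/etc/fstab", &op, LOOKUP_FOLLOW);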
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
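
Note: writing the required size back into *max_len before returning 255 lets a caller probe with a short buffer and retry with the right one. The same convention reaches userspace through the file-handle syscalls added in this series (see the do_handle_open declaration in fs/internal.h above); a sketch of the probe-and-retry pattern against the glibc API:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>

static struct file_handle *get_handle(const char *path, int *mount_id)
{
	struct file_handle probe = { .handle_bytes = 0 };
	struct file_handle *fh;

	/* Expected to fail with EOVERFLOW while reporting the needed size. */
	name_to_handle_at(AT_FDCWD, path, &probe, mount_id, 0);

	fh = malloc(sizeof(*fh) + probe.handle_bytes);
	if (!fh)
		return NULL;
	fh->handle_bytes = probe.handle_bytes;
	if (name_to_handle_at(AT_FDCWD, path, fh, mount_id, 0) == 0)
		return fh;
	free(fh);
	return NULL;
}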
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..eb11601f2e00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
839 err = journal_bmap(journal, 0, &blocknr); 839 err = journal_bmap(journal, 0, &blocknr);
840 /* If that failed, give up */ 840 /* If that failed, give up */
841 if (err) { 841 if (err) {
842 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 842 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
843 __func__); 843 __func__);
844 goto out_err; 844 goto out_err;
845 } 845 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97e73469b2c4..90407b8fece7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -991,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
991 err = jbd2_journal_bmap(journal, 0, &blocknr); 991 err = jbd2_journal_bmap(journal, 0, &blocknr);
992 /* If that failed, give up */ 992 /* If that failed, give up */
993 if (err) { 993 if (err) {
994 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 994 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
995 __func__); 995 __func__);
996 goto out_err; 996 goto out_err;
997 } 997 }
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed18..82faddd1f321 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
215 no chance of AB-BA deadlock involving its f->sem). */ 215 no chance of AB-BA deadlock involving its f->sem). */
216 mutex_unlock(&f->sem); 216 mutex_unlock(&f->sem);
217 217
218 ret = jffs2_do_create(c, dir_f, f, ri, 218 ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
219 dentry->d_name.name, dentry->d_name.len);
220 if (ret) 219 if (ret)
221 goto fail; 220 goto fail;
222 221
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 385
387 jffs2_complete_reservation(c); 386 jffs2_complete_reservation(c);
388 387
389 ret = jffs2_init_security(inode, dir_i); 388 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
390 if (ret) 389 if (ret)
391 goto fail; 390 goto fail;
392 391
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
530 529
531 jffs2_complete_reservation(c); 530 jffs2_complete_reservation(c);
532 531
533 ret = jffs2_init_security(inode, dir_i); 532 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
534 if (ret) 533 if (ret)
535 goto fail; 534 goto fail;
536 535
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
703 702
704 jffs2_complete_reservation(c); 703 jffs2_complete_reservation(c);
705 704
706 ret = jffs2_init_security(inode, dir_i); 705 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
707 if (ret) 706 if (ret)
708 goto fail; 707 goto fail;
709 708
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b5..e4619b00f7c5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
401 struct jffs2_raw_inode *ri, unsigned char *buf, 401 struct jffs2_raw_inode *ri, unsigned char *buf,
402 uint32_t offset, uint32_t writelen, uint32_t *retlen); 402 uint32_t offset, uint32_t writelen, uint32_t *retlen);
403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, 403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
404 struct jffs2_raw_inode *ri, const char *name, int namelen); 404 struct jffs2_raw_inode *ri, const struct qstr *qstr);
405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, 405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time); 406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, 407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a68..cfeb7164b085 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25/* ---- Initial Security Label Attachment -------------- */ 25/* ---- Initial Security Label Attachment -------------- */
26int jffs2_init_security(struct inode *inode, struct inode *dir) 26int jffs2_init_security(struct inode *inode, struct inode *dir,
27 const struct qstr *qstr)
27{ 28{
28 int rc; 29 int rc;
29 size_t len; 30 size_t len;
30 void *value; 31 void *value;
31 char *name; 32 char *name;
32 33
33 rc = security_inode_init_security(inode, dir, &name, &value, &len); 34 rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
34 if (rc) { 35 if (rc) {
35 if (rc == -EOPNOTSUPP) 36 if (rc == -EOPNOTSUPP)
36 return 0; 37 return 0;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982d..30d175b6d290 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
424 return ret; 424 return ret;
425} 425}
426 426
427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) 427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
428 struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
429 const struct qstr *qstr)
428{ 430{
429 struct jffs2_raw_dirent *rd; 431 struct jffs2_raw_dirent *rd;
430 struct jffs2_full_dnode *fn; 432 struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
466 mutex_unlock(&f->sem); 468 mutex_unlock(&f->sem);
467 jffs2_complete_reservation(c); 469 jffs2_complete_reservation(c);
468 470
469 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); 471 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
470 if (ret) 472 if (ret)
471 return ret; 473 return ret;
472 ret = jffs2_init_acl_post(&f->vfs_inode); 474 ret = jffs2_init_acl_post(&f->vfs_inode);
473 if (ret) 475 if (ret)
474 return ret; 476 return ret;
475 477
476 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 478 ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
477 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 479 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
478 480
479 if (ret) { 481 if (ret) {
480 /* Eep. */ 482 /* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
493 495
494 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); 496 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
495 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); 497 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
496 rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); 498 rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
497 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); 499 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
498 500
499 rd->pino = cpu_to_je32(dir_f->inocache->ino); 501 rd->pino = cpu_to_je32(dir_f->inocache->ino);
500 rd->version = cpu_to_je32(++dir_f->highest_version); 502 rd->version = cpu_to_je32(++dir_f->highest_version);
501 rd->ino = ri->ino; 503 rd->ino = ri->ino;
502 rd->mctime = ri->ctime; 504 rd->mctime = ri->ctime;
503 rd->nsize = namelen; 505 rd->nsize = qstr->len;
504 rd->type = DT_REG; 506 rd->type = DT_REG;
505 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); 507 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
506 rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); 508 rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
507 509
508 fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); 510 fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
509 511
510 jffs2_free_raw_dirent(rd); 512 jffs2_free_raw_dirent(rd);
511 513
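
Note: for reference, the qstr that replaces the name/namelen pair here is declared in include/linux/dcache.h (outside this diff); at this point in the tree it carries the dcache hash alongside the two fields the old interface passed separately:

struct qstr {
	unsigned int hash;
	unsigned int len;
	const unsigned char *name;
};

Passing &dentry->d_name therefore hands the callee the length, the bytes and the precomputed hash in one argument.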
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42b..7be4beb306f3 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
121#endif /* CONFIG_JFFS2_FS_XATTR */ 121#endif /* CONFIG_JFFS2_FS_XATTR */
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir,
125 const struct qstr *qstr);
125extern const struct xattr_handler jffs2_security_xattr_handler; 126extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 127#else
127#define jffs2_init_security(inode,dir) (0) 128#define jffs2_init_security(inode,dir,qstr) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 129#endif /* CONFIG_JFFS2_FS_SECURITY */
129 130
130#endif /* _JFFS2_FS_XATTR_H_ */ 131#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf2..e9e100fd7c09 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64#ifdef CONFIG_JFS_SECURITY 64#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *); 65extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *);
66#else 67#else
67static inline int jfs_init_security(tid_t tid, struct inode *inode, 68static inline int jfs_init_security(tid_t tid, struct inode *inode,
68 struct inode *dir) 69 struct inode *dir, const struct qstr *qstr)
69{ 70{
70 return 0; 71 return 0;
71} 72}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
115 if (rc) 115 if (rc)
116 goto out3; 116 goto out3;
117 117
118 rc = jfs_init_security(tid, ip, dip); 118 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
119 if (rc) { 119 if (rc) {
120 txAbort(tid, 0); 120 txAbort(tid, 0);
121 goto out3; 121 goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
253 if (rc) 253 if (rc)
254 goto out3; 254 goto out3;
255 255
256 rc = jfs_init_security(tid, ip, dip); 256 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
257 if (rc) { 257 if (rc) {
258 txAbort(tid, 0); 258 txAbort(tid, 0);
259 goto out3; 259 goto out3;
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
@@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
932 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); 929 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
933 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); 930 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
934 931
935 rc = jfs_init_security(tid, ip, dip); 932 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
936 if (rc) 933 if (rc)
937 goto out3; 934 goto out3;
938 935
@@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1395 if (rc) 1392 if (rc)
1396 goto out3; 1393 goto out3;
1397 1394
1398 rc = jfs_init_security(tid, ip, dir); 1395 rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
1399 if (rc) { 1396 if (rc) {
1400 txAbort(tid, 0); 1397 txAbort(tid, 0);
1401 goto out3; 1398 goto out3;
@@ -1600,7 +1597,7 @@ out:
1600 1597
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) 1598static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{ 1599{
1603 if (nd->flags & LOOKUP_RCU) 1600 if (nd && nd->flags & LOOKUP_RCU)
1604 return -ECHILD; 1601 return -ECHILD;
1605 /* 1602 /*
1606 * This is not negative dentry. Always valid. 1603 * This is not negative dentry. Always valid.
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1d..3fa4c32272df 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1091} 1091}
1092 1092
1093#ifdef CONFIG_JFS_SECURITY 1093#ifdef CONFIG_JFS_SECURITY
1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) 1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
1095 const struct qstr *qstr)
1095{ 1096{
1096 int rc; 1097 int rc;
1097 size_t len; 1098 size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
1099 char *suffix; 1100 char *suffix;
1100 char *name; 1101 char *name;
1101 1102
1102 rc = security_inode_init_security(inode, dir, &suffix, &value, &len); 1103 rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
1104 &len);
1103 if (rc) { 1105 if (rc) {
1104 if (rc == -EOPNOTSUPP) 1106 if (rc == -EOPNOTSUPP)
1105 return 0; 1107 return 0;
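
Note: both the jffs2 and jfs conversions thread the dentry name down to security_inode_init_security() as a qstr. The declaration itself lives in include/linux/security.h and is not part of this diff, but the converted call sites imply a prototype along these lines:

int security_inode_init_security(struct inode *inode, struct inode *dir,
				 const struct qstr *qstr,
				 char **name, void **value, size_t *len);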
diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f78..822c3d1843af 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
145 145
146/* 146/*
147 * Protects the two list heads above, plus the inode->i_flock list 147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */ 148 */
150void lock_flocks(void) 149void lock_flocks(void)
151{ 150{
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
213 new_de = minix_find_entry(new_dentry, &new_page); 213 new_de = minix_find_entry(new_dentry, &new_page);
214 if (!new_de) 214 if (!new_de)
215 goto out_dir; 215 goto out_dir;
216 inode_inc_link_count(old_inode);
217 minix_set_link(new_de, new_page, old_inode); 216 minix_set_link(new_de, new_page, old_inode);
218 new_inode->i_ctime = CURRENT_TIME_SEC; 217 new_inode->i_ctime = CURRENT_TIME_SEC;
219 if (dir_de) 218 if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
225 if (new_dir->i_nlink >= info->s_link_max) 224 if (new_dir->i_nlink >= info->s_link_max)
226 goto out_dir; 225 goto out_dir;
227 } 226 }
228 inode_inc_link_count(old_inode);
229 err = minix_add_link(new_dentry, old_inode); 227 err = minix_add_link(new_dentry, old_inode);
230 if (err) { 228 if (err)
231 inode_dec_link_count(old_inode);
232 goto out_dir; 229 goto out_dir;
233 }
234 if (dir_de) 230 if (dir_de)
235 inode_inc_link_count(new_dir); 231 inode_inc_link_count(new_dir);
236 } 232 }
237 233
238 minix_delete_entry(old_de, old_page); 234 minix_delete_entry(old_de, old_page);
239 inode_dec_link_count(old_inode); 235 mark_inode_dirty(old_inode);
240 236
241 if (dir_de) { 237 if (dir_de) {
242 minix_set_link(dir_de, dir_page, new_dir); 238 minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/namei.c b/fs/namei.c
index 0087cf9c2c6b..b912b7abe747 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
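
Note: getname_flags() now keeps the (empty) result instead of failing with -ENOENT when LOOKUP_EMPTY is set. The callers that pass LOOKUP_EMPTY sit outside these hunks, so the following is only an assumed illustration of the user-visible behaviour this enables: the AT_EMPTY_PATH style of call, where an already-open file is named purely by descriptor.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/stat.h>

/* stat an already-open file by descriptor, passing "" as the pathname */
static int stat_by_fd(int fd, struct stat *st)
{
	return fstatat(fd, "", st, AT_EMPTY_PATH);
}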
@@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 408{
402 struct fs_struct *fs = current->fs; 409 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 410 struct dentry *dentry = nd->path.dentry;
411 int want_root = 0;
404 412
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 413 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 414 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
415 want_root = 1;
407 spin_lock(&fs->lock); 416 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 417 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 418 nd->root.dentry != fs->root.dentry)
@@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 423 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 424 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 425 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 426 if (want_root) {
418 path_get(&nd->root); 427 path_get(&nd->root);
419 spin_unlock(&fs->lock); 428 spin_unlock(&fs->lock);
420 } 429 }
@@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 436err:
428 spin_unlock(&dentry->d_lock); 437 spin_unlock(&dentry->d_lock);
429err_root: 438err_root:
430 if (nd->root.mnt) 439 if (want_root)
431 spin_unlock(&fs->lock); 440 spin_unlock(&fs->lock);
432 return -ECHILD; 441 return -ECHILD;
433} 442}
@@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 463{
455 struct fs_struct *fs = current->fs; 464 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 465 struct dentry *parent = nd->path.dentry;
466 int want_root = 0;
457 467
458 BUG_ON(!(nd->flags & LOOKUP_RCU)); 468 BUG_ON(!(nd->flags & LOOKUP_RCU));
459 if (nd->root.mnt) { 469 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
470 want_root = 1;
460 spin_lock(&fs->lock); 471 spin_lock(&fs->lock);
461 if (nd->root.mnt != fs->root.mnt || 472 if (nd->root.mnt != fs->root.mnt ||
462 nd->root.dentry != fs->root.dentry) 473 nd->root.dentry != fs->root.dentry)
@@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
476 parent->d_count++; 487 parent->d_count++;
477 spin_unlock(&dentry->d_lock); 488 spin_unlock(&dentry->d_lock);
478 spin_unlock(&parent->d_lock); 489 spin_unlock(&parent->d_lock);
479 if (nd->root.mnt) { 490 if (want_root) {
480 path_get(&nd->root); 491 path_get(&nd->root);
481 spin_unlock(&fs->lock); 492 spin_unlock(&fs->lock);
482 } 493 }
@@ -490,7 +501,7 @@ err:
490 spin_unlock(&dentry->d_lock); 501 spin_unlock(&dentry->d_lock);
491 spin_unlock(&parent->d_lock); 502 spin_unlock(&parent->d_lock);
492err_root: 503err_root:
493 if (nd->root.mnt) 504 if (want_root)
494 spin_unlock(&fs->lock); 505 spin_unlock(&fs->lock);
495 return -ECHILD; 506 return -ECHILD;
496} 507}
@@ -498,8 +509,16 @@ err_root:
498/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 509/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
499static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 510static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
500{ 511{
501 if (nd->flags & LOOKUP_RCU) 512 if (nd->flags & LOOKUP_RCU) {
502 return nameidata_dentry_drop_rcu(nd, dentry); 513 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
514 nd->flags &= ~LOOKUP_RCU;
515 if (!(nd->flags & LOOKUP_ROOT))
516 nd->root.mnt = NULL;
517 rcu_read_unlock();
518 br_read_unlock(vfsmount_lock);
519 return -ECHILD;
520 }
521 }
503 return 0; 522 return 0;
504} 523}
505 524
@@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
518 537
519 BUG_ON(!(nd->flags & LOOKUP_RCU)); 538 BUG_ON(!(nd->flags & LOOKUP_RCU));
520 nd->flags &= ~LOOKUP_RCU; 539 nd->flags &= ~LOOKUP_RCU;
521 nd->root.mnt = NULL; 540 if (!(nd->flags & LOOKUP_ROOT))
541 nd->root.mnt = NULL;
522 spin_lock(&dentry->d_lock); 542 spin_lock(&dentry->d_lock);
523 if (!__d_rcu_to_refcount(dentry, nd->seq)) 543 if (!__d_rcu_to_refcount(dentry, nd->seq))
524 goto err_unlock; 544 goto err_unlock;
@@ -539,14 +559,6 @@ err_unlock:
539 return -ECHILD; 559 return -ECHILD;
540} 560}
541 561
542/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
543static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
544{
545 if (likely(nd->flags & LOOKUP_RCU))
546 return nameidata_drop_rcu_last(nd);
547 return 0;
548}
549
550/** 562/**
551 * release_open_intent - free up open intent resources 563 * release_open_intent - free up open intent resources
552 * @nd: pointer to nameidata 564 * @nd: pointer to nameidata
@@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
590 return dentry; 602 return dentry;
591} 603}
592 604
593static inline struct dentry *
594do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
595{
596 int status = d_revalidate(dentry, nd);
597 if (likely(status > 0))
598 return dentry;
599 if (status == -ECHILD) {
600 if (nameidata_dentry_drop_rcu(nd, dentry))
601 return ERR_PTR(-ECHILD);
602 return do_revalidate(dentry, nd);
603 }
604 if (status < 0)
605 return ERR_PTR(status);
606 /* Don't d_invalidate in rcu-walk mode */
607 if (nameidata_dentry_drop_rcu(nd, dentry))
608 return ERR_PTR(-ECHILD);
609 if (!d_invalidate(dentry)) {
610 dput(dentry);
611 dentry = NULL;
612 }
613 return dentry;
614}
615
616static inline int need_reval_dot(struct dentry *dentry)
617{
618 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
619 return 0;
620
621 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
622 return 0;
623
624 return 1;
625}
626
627/* 605/*
628 * force_reval_path - force revalidation of a dentry 606 * handle_reval_path - force revalidation of a dentry
629 * 607 *
630 * In some situations the path walking code will trust dentries without 608 * In some situations the path walking code will trust dentries without
631 * revalidating them. This causes problems for filesystems that depend on 609 * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry)
639 * invalidate the dentry. It's up to the caller to handle putting references 617 * invalidate the dentry. It's up to the caller to handle putting references
640 * to the path if necessary. 618 * to the path if necessary.
641 */ 619 */
642static int 620static inline int handle_reval_path(struct nameidata *nd)
643force_reval_path(struct path *path, struct nameidata *nd)
644{ 621{
622 struct dentry *dentry = nd->path.dentry;
645 int status; 623 int status;
646 struct dentry *dentry = path->dentry;
647 624
648 /* 625 if (likely(!(nd->flags & LOOKUP_JUMPED)))
649 * only check on filesystems where it's possible for the dentry to 626 return 0;
650 * become stale. 627
651 */ 628 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
652 if (!need_reval_dot(dentry)) 629 return 0;
630
631 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
653 return 0; 632 return 0;
654 633
634 /* Note: we do not d_invalidate() */
655 status = d_revalidate(dentry, nd); 635 status = d_revalidate(dentry, nd);
656 if (status > 0) 636 if (status > 0)
657 return 0; 637 return 0;
658 638
659 if (!status) { 639 if (!status)
660 d_invalidate(dentry);
661 status = -ESTALE; 640 status = -ESTALE;
662 } 641
663 return status; 642 return status;
664} 643}
665 644
@@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
728 path_put(&nd->path); 707 path_put(&nd->path);
729 nd->path = nd->root; 708 nd->path = nd->root;
730 path_get(&nd->root); 709 path_get(&nd->root);
710 nd->flags |= LOOKUP_JUMPED;
731 } 711 }
732 nd->inode = nd->path.dentry->d_inode; 712 nd->inode = nd->path.dentry->d_inode;
733 713
@@ -757,19 +737,42 @@ static inline void path_to_nameidata(const struct path *path,
757 nd->path.dentry = path->dentry; 737 nd->path.dentry = path->dentry;
758} 738}
759 739
740static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
741{
742 struct inode *inode = link->dentry->d_inode;
743 if (!IS_ERR(cookie) && inode->i_op->put_link)
744 inode->i_op->put_link(link->dentry, nd, cookie);
745 path_put(link);
746}
747
760static __always_inline int 748static __always_inline int
761__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 749follow_link(struct path *link, struct nameidata *nd, void **p)
762{ 750{
763 int error; 751 int error;
764 struct dentry *dentry = link->dentry; 752 struct dentry *dentry = link->dentry;
765 753
766 BUG_ON(nd->flags & LOOKUP_RCU); 754 BUG_ON(nd->flags & LOOKUP_RCU);
767 755
756 if (link->mnt == nd->path.mnt)
757 mntget(link->mnt);
758
759 if (unlikely(current->total_link_count >= 40)) {
760 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
761 path_put(&nd->path);
762 return -ELOOP;
763 }
764 cond_resched();
765 current->total_link_count++;
766
768 touch_atime(link->mnt, dentry); 767 touch_atime(link->mnt, dentry);
769 nd_set_link(nd, NULL); 768 nd_set_link(nd, NULL);
770 769
771 if (link->mnt == nd->path.mnt) 770 error = security_inode_follow_link(link->dentry, nd);
772 mntget(link->mnt); 771 if (error) {
772 *p = ERR_PTR(error); /* no ->put_link(), please */
773 path_put(&nd->path);
774 return error;
775 }
773 776
774 nd->last_type = LAST_BIND; 777 nd->last_type = LAST_BIND;
775 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 778 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -780,56 +783,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
780 if (s) 783 if (s)
781 error = __vfs_follow_link(nd, s); 784 error = __vfs_follow_link(nd, s);
782 else if (nd->last_type == LAST_BIND) { 785 else if (nd->last_type == LAST_BIND) {
783 error = force_reval_path(&nd->path, nd); 786 nd->flags |= LOOKUP_JUMPED;
784 if (error) 787 nd->inode = nd->path.dentry->d_inode;
788 if (nd->inode->i_op->follow_link) {
789 /* stepped on a _really_ weird one */
785 path_put(&nd->path); 790 path_put(&nd->path);
791 error = -ELOOP;
792 }
786 } 793 }
787 } 794 }
788 return error; 795 return error;
789} 796}
790 797
791/*
792 * This limits recursive symlink follows to 8, while
793 * limiting consecutive symlinks to 40.
794 *
795 * Without that kind of total limit, nasty chains of consecutive
796 * symlinks can cause almost arbitrarily long lookups.
797 */
798static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
799{
800 void *cookie;
801 int err = -ELOOP;
802
803 /* We drop rcu-walk here */
804 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
805 return -ECHILD;
806 BUG_ON(inode != path->dentry->d_inode);
807
808 if (current->link_count >= MAX_NESTED_LINKS)
809 goto loop;
810 if (current->total_link_count >= 40)
811 goto loop;
812 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
813 cond_resched();
814 err = security_inode_follow_link(path->dentry, nd);
815 if (err)
816 goto loop;
817 current->link_count++;
818 current->total_link_count++;
819 nd->depth++;
820 err = __do_follow_link(path, nd, &cookie);
821 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
822 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
823 path_put(path);
824 current->link_count--;
825 nd->depth--;
826 return err;
827loop:
828 path_put_conditional(path, nd);
829 path_put(&nd->path);
830 return err;
831}
832
833static int follow_up_rcu(struct path *path) 798static int follow_up_rcu(struct path *path)
834{ 799{
835 struct vfsmount *parent; 800 struct vfsmount *parent;
@@ -1068,7 +1033,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1068 1033
1069 seq = read_seqcount_begin(&parent->d_seq); 1034 seq = read_seqcount_begin(&parent->d_seq);
1070 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1035 if (read_seqcount_retry(&old->d_seq, nd->seq))
1071 return -ECHILD; 1036 goto failed;
1072 inode = parent->d_inode; 1037 inode = parent->d_inode;
1073 nd->path.dentry = parent; 1038 nd->path.dentry = parent;
1074 nd->seq = seq; 1039 nd->seq = seq;
@@ -1081,8 +1046,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1081 } 1046 }
1082 __follow_mount_rcu(nd, &nd->path, &inode, true); 1047 __follow_mount_rcu(nd, &nd->path, &inode, true);
1083 nd->inode = inode; 1048 nd->inode = inode;
1084
1085 return 0; 1049 return 0;
1050
1051failed:
1052 nd->flags &= ~LOOKUP_RCU;
1053 if (!(nd->flags & LOOKUP_ROOT))
1054 nd->root.mnt = NULL;
1055 rcu_read_unlock();
1056 br_read_unlock(vfsmount_lock);
1057 return -ECHILD;
1086} 1058}
1087 1059
1088/* 1060/*
@@ -1216,68 +1188,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1216{ 1188{
1217 struct vfsmount *mnt = nd->path.mnt; 1189 struct vfsmount *mnt = nd->path.mnt;
1218 struct dentry *dentry, *parent = nd->path.dentry; 1190 struct dentry *dentry, *parent = nd->path.dentry;
1219 struct inode *dir; 1191 int need_reval = 1;
1192 int status = 1;
1220 int err; 1193 int err;
1221 1194
1222 /* 1195 /*
1223 * See if the low-level filesystem might want
1224 * to use its own hash..
1225 */
1226 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1227 err = parent->d_op->d_hash(parent, nd->inode, name);
1228 if (err < 0)
1229 return err;
1230 }
1231
1232 /*
1233 * Rename seqlock is not required here because in the off chance 1196 * Rename seqlock is not required here because in the off chance
1234 * of a false negative due to a concurrent rename, we're going to 1197 * of a false negative due to a concurrent rename, we're going to
1235 * do the non-racy lookup, below. 1198 * do the non-racy lookup, below.
1236 */ 1199 */
1237 if (nd->flags & LOOKUP_RCU) { 1200 if (nd->flags & LOOKUP_RCU) {
1238 unsigned seq; 1201 unsigned seq;
1239
1240 *inode = nd->inode; 1202 *inode = nd->inode;
1241 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1203 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1242 if (!dentry) { 1204 if (!dentry)
1243 if (nameidata_drop_rcu(nd)) 1205 goto unlazy;
1244 return -ECHILD; 1206
1245 goto need_lookup;
1246 }
1247 /* Memory barrier in read_seqcount_begin of child is enough */ 1207 /* Memory barrier in read_seqcount_begin of child is enough */
1248 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1208 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1249 return -ECHILD; 1209 return -ECHILD;
1250
1251 nd->seq = seq; 1210 nd->seq = seq;
1211
1252 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1212 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1253 dentry = do_revalidate_rcu(dentry, nd); 1213 status = d_revalidate(dentry, nd);
1254 if (!dentry) 1214 if (unlikely(status <= 0)) {
1255 goto need_lookup; 1215 if (status != -ECHILD)
1256 if (IS_ERR(dentry)) 1216 need_reval = 0;
1257 goto fail; 1217 goto unlazy;
1258 if (!(nd->flags & LOOKUP_RCU)) 1218 }
1259 goto done;
1260 } 1219 }
1261 path->mnt = mnt; 1220 path->mnt = mnt;
1262 path->dentry = dentry; 1221 path->dentry = dentry;
1263 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1222 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1264 return 0; 1223 return 0;
1265 if (nameidata_drop_rcu(nd)) 1224unlazy:
1266 return -ECHILD; 1225 if (dentry) {
1267 /* fallthru */ 1226 if (nameidata_dentry_drop_rcu(nd, dentry))
1227 return -ECHILD;
1228 } else {
1229 if (nameidata_drop_rcu(nd))
1230 return -ECHILD;
1231 }
1232 } else {
1233 dentry = __d_lookup(parent, name);
1268 } 1234 }
1269 dentry = __d_lookup(parent, name); 1235
1270 if (!dentry) 1236retry:
1271 goto need_lookup; 1237 if (unlikely(!dentry)) {
1272found: 1238 struct inode *dir = parent->d_inode;
1273 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1239 BUG_ON(nd->inode != dir);
1274 dentry = do_revalidate(dentry, nd); 1240
1275 if (!dentry) 1241 mutex_lock(&dir->i_mutex);
1276 goto need_lookup; 1242 dentry = d_lookup(parent, name);
1277 if (IS_ERR(dentry)) 1243 if (likely(!dentry)) {
1278 goto fail; 1244 dentry = d_alloc_and_lookup(parent, name, nd);
1245 if (IS_ERR(dentry)) {
1246 mutex_unlock(&dir->i_mutex);
1247 return PTR_ERR(dentry);
1248 }
1249 /* known good */
1250 need_reval = 0;
1251 status = 1;
1252 }
1253 mutex_unlock(&dir->i_mutex);
1279 } 1254 }
1280done: 1255 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1256 status = d_revalidate(dentry, nd);
1257 if (unlikely(status <= 0)) {
1258 if (status < 0) {
1259 dput(dentry);
1260 return status;
1261 }
1262 if (!d_invalidate(dentry)) {
1263 dput(dentry);
1264 dentry = NULL;
1265 need_reval = 1;
1266 goto retry;
1267 }
1268 }
1269
1281 path->mnt = mnt; 1270 path->mnt = mnt;
1282 path->dentry = dentry; 1271 path->dentry = dentry;
1283 err = follow_managed(path, nd->flags); 1272 err = follow_managed(path, nd->flags);
@@ -1287,39 +1276,113 @@ done:
1287 } 1276 }
1288 *inode = path->dentry->d_inode; 1277 *inode = path->dentry->d_inode;
1289 return 0; 1278 return 0;
1279}
1280
1281static inline int may_lookup(struct nameidata *nd)
1282{
1283 if (nd->flags & LOOKUP_RCU) {
1284 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1285 if (err != -ECHILD)
1286 return err;
1287 if (nameidata_drop_rcu(nd))
1288 return -ECHILD;
1289 }
1290 return exec_permission(nd->inode, 0);
1291}
1290 1292
1291need_lookup: 1293static inline int handle_dots(struct nameidata *nd, int type)
1292 dir = parent->d_inode; 1294{
1293 BUG_ON(nd->inode != dir); 1295 if (type == LAST_DOTDOT) {
1296 if (nd->flags & LOOKUP_RCU) {
1297 if (follow_dotdot_rcu(nd))
1298 return -ECHILD;
1299 } else
1300 follow_dotdot(nd);
1301 }
1302 return 0;
1303}
1294 1304
1295 mutex_lock(&dir->i_mutex); 1305static void terminate_walk(struct nameidata *nd)
1296 /* 1306{
1297 * First re-do the cached lookup just in case it was created 1307 if (!(nd->flags & LOOKUP_RCU)) {
1298 * while we waited for the directory semaphore, or the first 1308 path_put(&nd->path);
1299 * lookup failed due to an unrelated rename. 1309 } else {
1300 * 1310 nd->flags &= ~LOOKUP_RCU;
1301 * This could use version numbering or similar to avoid unnecessary 1311 if (!(nd->flags & LOOKUP_ROOT))
1302 * cache lookups, but then we'd have to do the first lookup in the 1312 nd->root.mnt = NULL;
1303 * non-racy way. However in the common case here, everything should 1313 rcu_read_unlock();
1304 * be hot in cache, so would it be a big win? 1314 br_read_unlock(vfsmount_lock);
1305 */
1306 dentry = d_lookup(parent, name);
1307 if (likely(!dentry)) {
1308 dentry = d_alloc_and_lookup(parent, name, nd);
1309 mutex_unlock(&dir->i_mutex);
1310 if (IS_ERR(dentry))
1311 goto fail;
1312 goto done;
1313 } 1315 }
1316}
1317
1318static inline int walk_component(struct nameidata *nd, struct path *path,
1319 struct qstr *name, int type, int follow)
1320{
1321 struct inode *inode;
1322 int err;
1314 /* 1323 /*
1315 * Uhhuh! Nasty case: the cache was re-populated while 1324 * "." and ".." are special - ".." especially so because it has
1316 * we waited on the semaphore. Need to revalidate. 1325 * to be able to know about the current root directory and
1326 * parent relationships.
1317 */ 1327 */
1318 mutex_unlock(&dir->i_mutex); 1328 if (unlikely(type != LAST_NORM))
1319 goto found; 1329 return handle_dots(nd, type);
1330 err = do_lookup(nd, name, path, &inode);
1331 if (unlikely(err)) {
1332 terminate_walk(nd);
1333 return err;
1334 }
1335 if (!inode) {
1336 path_to_nameidata(path, nd);
1337 terminate_walk(nd);
1338 return -ENOENT;
1339 }
1340 if (unlikely(inode->i_op->follow_link) && follow) {
1341 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1342 return -ECHILD;
1343 BUG_ON(inode != path->dentry->d_inode);
1344 return 1;
1345 }
1346 path_to_nameidata(path, nd);
1347 nd->inode = inode;
1348 return 0;
1349}
1320 1350
1321fail: 1351/*
1322 return PTR_ERR(dentry); 1352 * This limits recursive symlink follows to 8, while
1353 * limiting consecutive symlinks to 40.
1354 *
1355 * Without that kind of total limit, nasty chains of consecutive
1356 * symlinks can cause almost arbitrarily long lookups.
1357 */
1358static inline int nested_symlink(struct path *path, struct nameidata *nd)
1359{
1360 int res;
1361
1362 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1363 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1364 path_put_conditional(path, nd);
1365 path_put(&nd->path);
1366 return -ELOOP;
1367 }
1368
1369 nd->depth++;
1370 current->link_count++;
1371
1372 do {
1373 struct path link = *path;
1374 void *cookie;
1375
1376 res = follow_link(&link, nd, &cookie);
1377 if (!res)
1378 res = walk_component(nd, path, &nd->last,
1379 nd->last_type, LOOKUP_FOLLOW);
1380 put_link(nd, &link, cookie);
1381 } while (res > 0);
1382
1383 current->link_count--;
1384 nd->depth--;
1385 return res;
1323} 1386}
1324 1387
1325/* 1388/*
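A note on the limits enforced by nested_symlink() above: nd->depth and current->link_count cap the nesting at MAX_NESTED_LINKS (8 at this point in the tree), while the total number of symlinks followed in a single lookup is capped separately, at roughly 40, inside follow_link() via current->total_link_count, exactly as the in-code comment says. The userspace sketch below only makes that total limit visible by chaining symlinks well past it; the file names and the exact cut-off are assumptions for illustration, not something this hunk defines.

/* Sketch: build a long symlink chain and watch open() fail with ELOOP
 * once the walker has followed more links than it is willing to. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char prev[64] = "target", cur[64];
	int i, fd;

	fd = open("target", O_CREAT | O_WRONLY, 0644);
	if (fd >= 0)
		close(fd);

	for (i = 0; i < 50; i++) {		/* well past the ~40 total limit */
		snprintf(cur, sizeof(cur), "link%d", i);
		unlink(cur);			/* ignore failure: may not exist */
		if (symlink(prev, cur) < 0) {
			perror("symlink");
			return 1;
		}
		strcpy(prev, cur);
	}

	fd = open(prev, O_RDONLY);		/* resolves the whole chain */
	if (fd < 0)
		perror("open");			/* expected: ELOOP */
	else
		close(fd);
	return 0;
}

A much shorter chain (a handful of links) opens normally; only the total count of links followed in one walk trips the limit.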
@@ -1339,30 +1402,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1339 while (*name=='/') 1402 while (*name=='/')
1340 name++; 1403 name++;
1341 if (!*name) 1404 if (!*name)
1342 goto return_reval; 1405 return 0;
1343
1344 if (nd->depth)
1345 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1346 1406
1347 /* At this point we know we have a real path component. */ 1407 /* At this point we know we have a real path component. */
1348 for(;;) { 1408 for(;;) {
1349 struct inode *inode;
1350 unsigned long hash; 1409 unsigned long hash;
1351 struct qstr this; 1410 struct qstr this;
1352 unsigned int c; 1411 unsigned int c;
1412 int type;
1353 1413
1354 nd->flags |= LOOKUP_CONTINUE; 1414 nd->flags |= LOOKUP_CONTINUE;
1355 if (nd->flags & LOOKUP_RCU) { 1415
1356 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1416 err = may_lookup(nd);
1357 if (err == -ECHILD) {
1358 if (nameidata_drop_rcu(nd))
1359 return -ECHILD;
1360 goto exec_again;
1361 }
1362 } else {
1363exec_again:
1364 err = exec_permission(nd->inode, 0);
1365 }
1366 if (err) 1417 if (err)
1367 break; 1418 break;
1368 1419
@@ -1378,52 +1429,43 @@ exec_again:
1378 this.len = name - (const char *) this.name; 1429 this.len = name - (const char *) this.name;
1379 this.hash = end_name_hash(hash); 1430 this.hash = end_name_hash(hash);
1380 1431
1432 type = LAST_NORM;
1433 if (this.name[0] == '.') switch (this.len) {
1434 case 2:
1435 if (this.name[1] == '.') {
1436 type = LAST_DOTDOT;
1437 nd->flags |= LOOKUP_JUMPED;
1438 }
1439 break;
1440 case 1:
1441 type = LAST_DOT;
1442 }
1443 if (likely(type == LAST_NORM)) {
1444 struct dentry *parent = nd->path.dentry;
1445 nd->flags &= ~LOOKUP_JUMPED;
1446 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1447 err = parent->d_op->d_hash(parent, nd->inode,
1448 &this);
1449 if (err < 0)
1450 break;
1451 }
1452 }
1453
1381 /* remove trailing slashes? */ 1454 /* remove trailing slashes? */
1382 if (!c) 1455 if (!c)
1383 goto last_component; 1456 goto last_component;
1384 while (*++name == '/'); 1457 while (*++name == '/');
1385 if (!*name) 1458 if (!*name)
1386 goto last_with_slashes; 1459 goto last_component;
1387 1460
1388 /* 1461 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1389 * "." and ".." are special - ".." especially so because it has 1462 if (err < 0)
1390 * to be able to know about the current root directory and 1463 return err;
1391 * parent relationships.
1392 */
1393 if (this.name[0] == '.') switch (this.len) {
1394 default:
1395 break;
1396 case 2:
1397 if (this.name[1] != '.')
1398 break;
1399 if (nd->flags & LOOKUP_RCU) {
1400 if (follow_dotdot_rcu(nd))
1401 return -ECHILD;
1402 } else
1403 follow_dotdot(nd);
1404 /* fallthrough */
1405 case 1:
1406 continue;
1407 }
1408 /* This does the actual lookups.. */
1409 err = do_lookup(nd, &this, &next, &inode);
1410 if (err)
1411 break;
1412 err = -ENOENT;
1413 if (!inode)
1414 goto out_dput;
1415 1464
1416 if (inode->i_op->follow_link) { 1465 if (err) {
1417 err = do_follow_link(inode, &next, nd); 1466 err = nested_symlink(&next, nd);
1418 if (err) 1467 if (err)
1419 goto return_err; 1468 return err;
1420 nd->inode = nd->path.dentry->d_inode;
1421 err = -ENOENT;
1422 if (!nd->inode)
1423 break;
1424 } else {
1425 path_to_nameidata(&next, nd);
1426 nd->inode = inode;
1427 } 1469 }
1428 err = -ENOTDIR; 1470 err = -ENOTDIR;
1429 if (!nd->inode->i_op->lookup) 1471 if (!nd->inode->i_op->lookup)
@@ -1431,209 +1473,109 @@ exec_again:
1431 continue; 1473 continue;
1432 /* here ends the main loop */ 1474 /* here ends the main loop */
1433 1475
1434last_with_slashes:
1435 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1436last_component: 1476last_component:
1437 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1477 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1438 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1478 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1439 if (lookup_flags & LOOKUP_PARENT)
1440 goto lookup_parent;
1441 if (this.name[0] == '.') switch (this.len) {
1442 default:
1443 break;
1444 case 2:
1445 if (this.name[1] != '.')
1446 break;
1447 if (nd->flags & LOOKUP_RCU) {
1448 if (follow_dotdot_rcu(nd))
1449 return -ECHILD;
1450 } else
1451 follow_dotdot(nd);
1452 /* fallthrough */
1453 case 1:
1454 goto return_reval;
1455 }
1456 err = do_lookup(nd, &this, &next, &inode);
1457 if (err)
1458 break;
1459 if (inode && unlikely(inode->i_op->follow_link) &&
1460 (lookup_flags & LOOKUP_FOLLOW)) {
1461 err = do_follow_link(inode, &next, nd);
1462 if (err)
1463 goto return_err;
1464 nd->inode = nd->path.dentry->d_inode;
1465 } else {
1466 path_to_nameidata(&next, nd);
1467 nd->inode = inode;
1468 }
1469 err = -ENOENT;
1470 if (!nd->inode)
1471 break;
1472 if (lookup_flags & LOOKUP_DIRECTORY) {
1473 err = -ENOTDIR;
1474 if (!nd->inode->i_op->lookup)
1475 break;
1476 }
1477 goto return_base;
1478lookup_parent:
1479 nd->last = this; 1479 nd->last = this;
1480 nd->last_type = LAST_NORM; 1480 nd->last_type = type;
1481 if (this.name[0] != '.')
1482 goto return_base;
1483 if (this.len == 1)
1484 nd->last_type = LAST_DOT;
1485 else if (this.len == 2 && this.name[1] == '.')
1486 nd->last_type = LAST_DOTDOT;
1487 else
1488 goto return_base;
1489return_reval:
1490 /*
1491 * We bypassed the ordinary revalidation routines.
1492 * We may need to check the cached dentry for staleness.
1493 */
1494 if (need_reval_dot(nd->path.dentry)) {
1495 if (nameidata_drop_rcu_last_maybe(nd))
1496 return -ECHILD;
1497 /* Note: we do not d_invalidate() */
1498 err = d_revalidate(nd->path.dentry, nd);
1499 if (!err)
1500 err = -ESTALE;
1501 if (err < 0)
1502 break;
1503 return 0;
1504 }
1505return_base:
1506 if (nameidata_drop_rcu_last_maybe(nd))
1507 return -ECHILD;
1508 return 0; 1481 return 0;
1509out_dput:
1510 if (!(nd->flags & LOOKUP_RCU))
1511 path_put_conditional(&next, nd);
1512 break;
1513 } 1482 }
1514 if (!(nd->flags & LOOKUP_RCU)) 1483 terminate_walk(nd);
1515 path_put(&nd->path);
1516return_err:
1517 return err; 1484 return err;
1518} 1485}
1519 1486
1520static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1487static int path_init(int dfd, const char *name, unsigned int flags,
1521{ 1488 struct nameidata *nd, struct file **fp)
1522 current->total_link_count = 0;
1523
1524 return link_path_walk(name, nd);
1525}
1526
1527static inline int path_walk_simple(const char *name, struct nameidata *nd)
1528{
1529 current->total_link_count = 0;
1530
1531 return link_path_walk(name, nd);
1532}
1533
1534static int path_walk(const char *name, struct nameidata *nd)
1535{
1536 struct path save = nd->path;
1537 int result;
1538
1539 current->total_link_count = 0;
1540
1541 /* make sure the stuff we saved doesn't go away */
1542 path_get(&save);
1543
1544 result = link_path_walk(name, nd);
1545 if (result == -ESTALE) {
1546 /* nd->path had been dropped */
1547 current->total_link_count = 0;
1548 nd->path = save;
1549 path_get(&nd->path);
1550 nd->flags |= LOOKUP_REVAL;
1551 result = link_path_walk(name, nd);
1552 }
1553
1554 path_put(&save);
1555
1556 return result;
1557}
1558
1559static void path_finish_rcu(struct nameidata *nd)
1560{
1561 if (nd->flags & LOOKUP_RCU) {
1562 /* RCU dangling. Cancel it. */
1563 nd->flags &= ~LOOKUP_RCU;
1564 nd->root.mnt = NULL;
1565 rcu_read_unlock();
1566 br_read_unlock(vfsmount_lock);
1567 }
1568 if (nd->file)
1569 fput(nd->file);
1570}
1571
1572static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1573{ 1489{
1574 int retval = 0; 1490 int retval = 0;
1575 int fput_needed; 1491 int fput_needed;
1576 struct file *file; 1492 struct file *file;
1577 1493
1578 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1494 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1579 nd->flags = flags | LOOKUP_RCU; 1495 nd->flags = flags | LOOKUP_JUMPED;
1580 nd->depth = 0; 1496 nd->depth = 0;
1497 if (flags & LOOKUP_ROOT) {
1498 struct inode *inode = nd->root.dentry->d_inode;
1499 if (*name) {
1500 if (!inode->i_op->lookup)
1501 return -ENOTDIR;
1502 retval = inode_permission(inode, MAY_EXEC);
1503 if (retval)
1504 return retval;
1505 }
1506 nd->path = nd->root;
1507 nd->inode = inode;
1508 if (flags & LOOKUP_RCU) {
1509 br_read_lock(vfsmount_lock);
1510 rcu_read_lock();
1511 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1512 } else {
1513 path_get(&nd->path);
1514 }
1515 return 0;
1516 }
1517
1581 nd->root.mnt = NULL; 1518 nd->root.mnt = NULL;
1582 nd->file = NULL;
1583 1519
1584 if (*name=='/') { 1520 if (*name=='/') {
1585 struct fs_struct *fs = current->fs; 1521 if (flags & LOOKUP_RCU) {
1586 unsigned seq; 1522 br_read_lock(vfsmount_lock);
1587 1523 rcu_read_lock();
1588 br_read_lock(vfsmount_lock); 1524 set_root_rcu(nd);
1589 rcu_read_lock(); 1525 } else {
1590 1526 set_root(nd);
1591 do { 1527 path_get(&nd->root);
1592 seq = read_seqcount_begin(&fs->seq); 1528 }
1593 nd->root = fs->root; 1529 nd->path = nd->root;
1594 nd->path = nd->root;
1595 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1596 } while (read_seqcount_retry(&fs->seq, seq));
1597
1598 } else if (dfd == AT_FDCWD) { 1530 } else if (dfd == AT_FDCWD) {
1599 struct fs_struct *fs = current->fs; 1531 if (flags & LOOKUP_RCU) {
1600 unsigned seq; 1532 struct fs_struct *fs = current->fs;
1601 1533 unsigned seq;
1602 br_read_lock(vfsmount_lock);
1603 rcu_read_lock();
1604 1534
1605 do { 1535 br_read_lock(vfsmount_lock);
1606 seq = read_seqcount_begin(&fs->seq); 1536 rcu_read_lock();
1607 nd->path = fs->pwd;
1608 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1609 } while (read_seqcount_retry(&fs->seq, seq));
1610 1537
1538 do {
1539 seq = read_seqcount_begin(&fs->seq);
1540 nd->path = fs->pwd;
1541 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1542 } while (read_seqcount_retry(&fs->seq, seq));
1543 } else {
1544 get_fs_pwd(current->fs, &nd->path);
1545 }
1611 } else { 1546 } else {
1612 struct dentry *dentry; 1547 struct dentry *dentry;
1613 1548
1614 file = fget_light(dfd, &fput_needed); 1549 file = fget_raw_light(dfd, &fput_needed);
1615 retval = -EBADF; 1550 retval = -EBADF;
1616 if (!file) 1551 if (!file)
1617 goto out_fail; 1552 goto out_fail;
1618 1553
1619 dentry = file->f_path.dentry; 1554 dentry = file->f_path.dentry;
1620 1555
1621 retval = -ENOTDIR; 1556 if (*name) {
1622 if (!S_ISDIR(dentry->d_inode->i_mode)) 1557 retval = -ENOTDIR;
1623 goto fput_fail; 1558 if (!S_ISDIR(dentry->d_inode->i_mode))
1559 goto fput_fail;
1624 1560
1625 retval = file_permission(file, MAY_EXEC); 1561 retval = file_permission(file, MAY_EXEC);
1626 if (retval) 1562 if (retval)
1627 goto fput_fail; 1563 goto fput_fail;
1564 }
1628 1565
1629 nd->path = file->f_path; 1566 nd->path = file->f_path;
1630 if (fput_needed) 1567 if (flags & LOOKUP_RCU) {
1631 nd->file = file; 1568 if (fput_needed)
1632 1569 *fp = file;
1633 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1570 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1634 br_read_lock(vfsmount_lock); 1571 br_read_lock(vfsmount_lock);
1635 rcu_read_lock(); 1572 rcu_read_lock();
1573 } else {
1574 path_get(&file->f_path);
1575 fput_light(file, fput_needed);
1576 }
1636 } 1577 }
1578
1637 nd->inode = nd->path.dentry->d_inode; 1579 nd->inode = nd->path.dentry->d_inode;
1638 return 0; 1580 return 0;
1639 1581
@@ -1643,60 +1585,23 @@ out_fail:
1643 return retval; 1585 return retval;
1644} 1586}
1645 1587
1646static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1588static inline int lookup_last(struct nameidata *nd, struct path *path)
1647{ 1589{
1648 int retval = 0; 1590 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1649 int fput_needed; 1591 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1650 struct file *file;
1651
1652 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1653 nd->flags = flags;
1654 nd->depth = 0;
1655 nd->root.mnt = NULL;
1656
1657 if (*name=='/') {
1658 set_root(nd);
1659 nd->path = nd->root;
1660 path_get(&nd->root);
1661 } else if (dfd == AT_FDCWD) {
1662 get_fs_pwd(current->fs, &nd->path);
1663 } else {
1664 struct dentry *dentry;
1665
1666 file = fget_light(dfd, &fput_needed);
1667 retval = -EBADF;
1668 if (!file)
1669 goto out_fail;
1670
1671 dentry = file->f_path.dentry;
1672
1673 retval = -ENOTDIR;
1674 if (!S_ISDIR(dentry->d_inode->i_mode))
1675 goto fput_fail;
1676 1592
1677 retval = file_permission(file, MAY_EXEC); 1593 nd->flags &= ~LOOKUP_PARENT;
1678 if (retval) 1594 return walk_component(nd, path, &nd->last, nd->last_type,
1679 goto fput_fail; 1595 nd->flags & LOOKUP_FOLLOW);
1680
1681 nd->path = file->f_path;
1682 path_get(&file->f_path);
1683
1684 fput_light(file, fput_needed);
1685 }
1686 nd->inode = nd->path.dentry->d_inode;
1687 return 0;
1688
1689fput_fail:
1690 fput_light(file, fput_needed);
1691out_fail:
1692 return retval;
1693} 1596}
1694 1597
1695/* Returns 0 and nd will be valid on success; Returns an error otherwise. */ 1598static int path_lookupat(int dfd, const char *name,
1696static int do_path_lookup(int dfd, const char *name, 1599static int path_lookupat(int dfd, const char *name,
1697 unsigned int flags, struct nameidata *nd) 1600 unsigned int flags, struct nameidata *nd)
1698{ 1601{
1699 int retval; 1602 struct file *base = NULL;
1603 struct path path;
1604 int err;
1700 1605
1701 /* 1606 /*
1702 * Path walking is largely split up into 2 different synchronisation 1607 * Path walking is largely split up into 2 different synchronisation
@@ -1712,44 +1617,75 @@ static int do_path_lookup(int dfd, const char *name,
1712 * be handled by restarting a traditional ref-walk (which will always 1617 * be handled by restarting a traditional ref-walk (which will always
1713 * be able to complete). 1618 * be able to complete).
1714 */ 1619 */
1715 retval = path_init_rcu(dfd, name, flags, nd); 1620 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1716 if (unlikely(retval)) 1621
1717 return retval; 1622 if (unlikely(err))
1718 retval = path_walk_rcu(name, nd); 1623 return err;
1719 path_finish_rcu(nd); 1624
1720 if (nd->root.mnt) { 1625 current->total_link_count = 0;
1721 path_put(&nd->root); 1626 err = link_path_walk(name, nd);
1722 nd->root.mnt = NULL; 1627
1628 if (!err && !(flags & LOOKUP_PARENT)) {
1629 err = lookup_last(nd, &path);
1630 while (err > 0) {
1631 void *cookie;
1632 struct path link = path;
1633 nd->flags |= LOOKUP_PARENT;
1634 err = follow_link(&link, nd, &cookie);
1635 if (!err)
1636 err = lookup_last(nd, &path);
1637 put_link(nd, &link, cookie);
1638 }
1723 } 1639 }
1724 1640
1725 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1641 if (nd->flags & LOOKUP_RCU) {
1726 /* slower, locked walk */ 1642 /* went all way through without dropping RCU */
1727 if (retval == -ESTALE) 1643 BUG_ON(err);
1728 flags |= LOOKUP_REVAL; 1644 if (nameidata_drop_rcu_last(nd))
1729 retval = path_init(dfd, name, flags, nd); 1645 err = -ECHILD;
1730 if (unlikely(retval)) 1646 }
1731 return retval; 1647
1732 retval = path_walk(name, nd); 1648 if (!err)
1733 if (nd->root.mnt) { 1649 err = handle_reval_path(nd);
1734 path_put(&nd->root); 1650
1735 nd->root.mnt = NULL; 1651 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1652 if (!nd->inode->i_op->lookup) {
1653 path_put(&nd->path);
1654 return -ENOTDIR;
1736 } 1655 }
1737 } 1656 }
1738 1657
1658 if (base)
1659 fput(base);
1660
1661 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1662 path_put(&nd->root);
1663 nd->root.mnt = NULL;
1664 }
1665 return err;
1666}
1667
1668static int do_path_lookup(int dfd, const char *name,
1669 unsigned int flags, struct nameidata *nd)
1670{
1671 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1672 if (unlikely(retval == -ECHILD))
1673 retval = path_lookupat(dfd, name, flags, nd);
1674 if (unlikely(retval == -ESTALE))
1675 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1676
1739 if (likely(!retval)) { 1677 if (likely(!retval)) {
1740 if (unlikely(!audit_dummy_context())) { 1678 if (unlikely(!audit_dummy_context())) {
1741 if (nd->path.dentry && nd->inode) 1679 if (nd->path.dentry && nd->inode)
1742 audit_inode(name, nd->path.dentry); 1680 audit_inode(name, nd->path.dentry);
1743 } 1681 }
1744 } 1682 }
1745
1746 return retval; 1683 return retval;
1747} 1684}
1748 1685
1749int path_lookup(const char *name, unsigned int flags, 1686int kern_path_parent(const char *name, struct nameidata *nd)
1750 struct nameidata *nd)
1751{ 1687{
1752 return do_path_lookup(AT_FDCWD, name, flags, nd); 1688 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1753} 1689}
1754 1690
1755int kern_path(const char *name, unsigned int flags, struct path *path) 1691int kern_path(const char *name, unsigned int flags, struct path *path)
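With the reworked do_path_lookup() above, every lookup is attempted up to three times: an RCU-walk first, a plain ref-walk if that bails out with -ECHILD, and a final ref-walk with LOOKUP_REVAL if the second pass comes back -ESTALE. The same cadence reappears later in do_filp_open(). The fragment below is only a standalone model of that retry shape with the walkers stubbed out; the stub names, constants and behaviour are illustrative, not kernel API.

#include <stdio.h>

#define DEMO_ECHILD 10		/* stand-ins for the kernel's -ECHILD / -ESTALE */
#define DEMO_ESTALE 116

/* Stubbed "walker": pretend the rcu pass bails out and the reval pass wins. */
static int walk(const char *name, int rcu, int reval)
{
	(void)name;
	if (rcu)
		return -DEMO_ECHILD;	/* e.g. a d_revalidate that can't run under RCU */
	if (!reval)
		return -DEMO_ESTALE;	/* e.g. a dentry that went stale mid-walk */
	return 0;
}

static int lookup(const char *name)
{
	int err = walk(name, 1, 0);		/* pass 1: rcu-walk */
	if (err == -DEMO_ECHILD)
		err = walk(name, 0, 0);		/* pass 2: ref-walk */
	if (err == -DEMO_ESTALE)
		err = walk(name, 0, 1);		/* pass 3: ref-walk + revalidation */
	return err;
}

int main(void)
{
	printf("lookup -> %d\n", lookup("/some/path"));	/* prints 0 */
	return 0;
}

The ordering matters: the cheap lockless pass covers the common fully-cached case, and the heavier passes only run when it cannot complete.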
@@ -1773,29 +1709,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1773 const char *name, unsigned int flags, 1709 const char *name, unsigned int flags,
1774 struct nameidata *nd) 1710 struct nameidata *nd)
1775{ 1711{
1776 int retval; 1712 nd->root.dentry = dentry;
1777 1713 nd->root.mnt = mnt;
1778 /* same as do_path_lookup */ 1714 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1779 nd->last_type = LAST_ROOT; 1715 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1780 nd->flags = flags;
1781 nd->depth = 0;
1782
1783 nd->path.dentry = dentry;
1784 nd->path.mnt = mnt;
1785 path_get(&nd->path);
1786 nd->root = nd->path;
1787 path_get(&nd->root);
1788 nd->inode = nd->path.dentry->d_inode;
1789
1790 retval = path_walk(name, nd);
1791 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1792 nd->inode))
1793 audit_inode(name, nd->path.dentry);
1794
1795 path_put(&nd->root);
1796 nd->root.mnt = NULL;
1797
1798 return retval;
1799} 1716}
1800 1717
1801static struct dentry *__lookup_hash(struct qstr *name, 1718static struct dentry *__lookup_hash(struct qstr *name,
@@ -1810,17 +1727,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1810 return ERR_PTR(err); 1727 return ERR_PTR(err);
1811 1728
1812 /* 1729 /*
1813 * See if the low-level filesystem might want
1814 * to use its own hash..
1815 */
1816 if (base->d_flags & DCACHE_OP_HASH) {
1817 err = base->d_op->d_hash(base, inode, name);
1818 dentry = ERR_PTR(err);
1819 if (err < 0)
1820 goto out;
1821 }
1822
1823 /*
1824 * Don't bother with __d_lookup: callers are for creat as 1730 * Don't bother with __d_lookup: callers are for creat as
1825 * well as unlink, so a lot of the time it would cost 1731 * well as unlink, so a lot of the time it would cost
1826 * a double lookup. 1732 * a double lookup.
@@ -1832,7 +1738,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1832 1738
1833 if (!dentry) 1739 if (!dentry)
1834 dentry = d_alloc_and_lookup(base, name, nd); 1740 dentry = d_alloc_and_lookup(base, name, nd);
1835out: 1741
1836 return dentry; 1742 return dentry;
1837} 1743}
1838 1744
@@ -1846,28 +1752,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1846 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1752 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1847} 1753}
1848 1754
1849static int __lookup_one_len(const char *name, struct qstr *this,
1850 struct dentry *base, int len)
1851{
1852 unsigned long hash;
1853 unsigned int c;
1854
1855 this->name = name;
1856 this->len = len;
1857 if (!len)
1858 return -EACCES;
1859
1860 hash = init_name_hash();
1861 while (len--) {
1862 c = *(const unsigned char *)name++;
1863 if (c == '/' || c == '\0')
1864 return -EACCES;
1865 hash = partial_name_hash(c, hash);
1866 }
1867 this->hash = end_name_hash(hash);
1868 return 0;
1869}
1870
1871/** 1755/**
1872 * lookup_one_len - filesystem helper to lookup single pathname component 1756 * lookup_one_len - filesystem helper to lookup single pathname component
1873 * @name: pathname component to lookup 1757 * @name: pathname component to lookup
@@ -1881,14 +1765,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1881 */ 1765 */
1882struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1766struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1883{ 1767{
1884 int err;
1885 struct qstr this; 1768 struct qstr this;
1769 unsigned long hash;
1770 unsigned int c;
1886 1771
1887 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1772 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1888 1773
1889 err = __lookup_one_len(name, &this, base, len); 1774 this.name = name;
1890 if (err) 1775 this.len = len;
1891 return ERR_PTR(err); 1776 if (!len)
1777 return ERR_PTR(-EACCES);
1778
1779 hash = init_name_hash();
1780 while (len--) {
1781 c = *(const unsigned char *)name++;
1782 if (c == '/' || c == '\0')
1783 return ERR_PTR(-EACCES);
1784 hash = partial_name_hash(c, hash);
1785 }
1786 this.hash = end_name_hash(hash);
1787 /*
1788 * See if the low-level filesystem might want
1789 * to use its own hash..
1790 */
1791 if (base->d_flags & DCACHE_OP_HASH) {
1792 int err = base->d_op->d_hash(base, base->d_inode, &this);
1793 if (err < 0)
1794 return ERR_PTR(err);
1795 }
1892 1796
1893 return __lookup_hash(&this, base, NULL); 1797 return __lookup_hash(&this, base, NULL);
1894} 1798}
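lookup_one_len() above now does the name hashing and the optional per-filesystem d_hash() call itself instead of going through the removed __lookup_one_len(). For in-kernel callers the contract is unchanged: hold the parent directory's i_mutex, pass a single component containing no '/' or NUL, and dput() whatever comes back. A rough sketch of a typical caller follows; the helper name is made up and this is not code from the patch.

/* Sketch only: an illustrative in-kernel caller, not part of this patch. */
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/string.h>

static struct dentry *example_find_child(struct dentry *parent, const char *name)
{
	struct dentry *child;

	/* lookup_one_len() warns unless the parent directory is locked. */
	mutex_lock(&parent->d_inode->i_mutex);
	child = lookup_one_len(name, parent, strlen(name));
	mutex_unlock(&parent->d_inode->i_mutex);

	/* ERR_PTR(-EACCES) for "", or for names containing '/' or '\0';
	 * a negative dentry (d_inode == NULL) means "no such entry yet".
	 * The caller owns the reference and must dput() it when done. */
	return child;
}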
@@ -1897,7 +1801,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1897 struct path *path) 1801 struct path *path)
1898{ 1802{
1899 struct nameidata nd; 1803 struct nameidata nd;
1900 char *tmp = getname(name); 1804 char *tmp = getname_flags(name, flags);
1901 int err = PTR_ERR(tmp); 1805 int err = PTR_ERR(tmp);
1902 if (!IS_ERR(tmp)) { 1806 if (!IS_ERR(tmp)) {
1903 1807
@@ -2077,12 +1981,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2077 return error; 1981 return error;
2078} 1982}
2079 1983
2080int may_open(struct path *path, int acc_mode, int flag) 1984static int may_open(struct path *path, int acc_mode, int flag)
2081{ 1985{
2082 struct dentry *dentry = path->dentry; 1986 struct dentry *dentry = path->dentry;
2083 struct inode *inode = dentry->d_inode; 1987 struct inode *inode = dentry->d_inode;
2084 int error; 1988 int error;
2085 1989
1990 /* O_PATH? */
1991 if (!acc_mode)
1992 return 0;
1993
2086 if (!inode) 1994 if (!inode)
2087 return -ENOENT; 1995 return -ENOENT;
2088 1996
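The early return added to may_open() above is what lets an O_PATH open succeed with an empty acc_mode: there is nothing to permission-check or truncate, and the caller only gets a location handle. A minimal userspace sketch of the intended use follows; O_PATH is defined by hand in case the libc headers predate it, and the numeric value is the commonly used generic one, so treat it as an assumption.

/* Sketch: open a directory with O_PATH and use it purely as an anchor
 * for openat(); reads and writes through the O_PATH fd are refused. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef O_PATH
#define O_PATH 010000000
#endif

int main(void)
{
	int dirfd, fd;

	dirfd = open("/etc", O_PATH | O_DIRECTORY);
	if (dirfd < 0) {
		perror("open(O_PATH)");
		return 1;
	}

	fd = openat(dirfd, "hostname", O_RDONLY);	/* dirfd works as an anchor */
	if (fd < 0)
		perror("openat");
	else
		close(fd);

	close(dirfd);
	return 0;
}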
@@ -2151,34 +2059,6 @@ static int handle_truncate(struct file *filp)
2151} 2059}
2152 2060
2153/* 2061/*
2154 * Be careful about ever adding any more callers of this
2155 * function. Its flags must be in the namei format, not
2156 * what get passed to sys_open().
2157 */
2158static int __open_namei_create(struct nameidata *nd, struct path *path,
2159 int open_flag, int mode)
2160{
2161 int error;
2162 struct dentry *dir = nd->path.dentry;
2163
2164 if (!IS_POSIXACL(dir->d_inode))
2165 mode &= ~current_umask();
2166 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2167 if (error)
2168 goto out_unlock;
2169 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2170out_unlock:
2171 mutex_unlock(&dir->d_inode->i_mutex);
2172 dput(nd->path.dentry);
2173 nd->path.dentry = path->dentry;
2174
2175 if (error)
2176 return error;
2177 /* Don't check for write permission, don't truncate */
2178 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2179}
2180
2181/*
2182 * Note that while the flag value (low two bits) for sys_open means: 2062 * Note that while the flag value (low two bits) for sys_open means:
2183 * 00 - read-only 2063 * 00 - read-only
2184 * 01 - write-only 2064 * 01 - write-only
@@ -2202,126 +2082,115 @@ static inline int open_to_namei_flags(int flag)
2202 return flag; 2082 return flag;
2203} 2083}
2204 2084
2205static int open_will_truncate(int flag, struct inode *inode)
2206{
2207 /*
2208 * We'll never write to the fs underlying
2209 * a device file.
2210 */
2211 if (special_file(inode->i_mode))
2212 return 0;
2213 return (flag & O_TRUNC);
2214}
2215
2216static struct file *finish_open(struct nameidata *nd,
2217 int open_flag, int acc_mode)
2218{
2219 struct file *filp;
2220 int will_truncate;
2221 int error;
2222
2223 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2224 if (will_truncate) {
2225 error = mnt_want_write(nd->path.mnt);
2226 if (error)
2227 goto exit;
2228 }
2229 error = may_open(&nd->path, acc_mode, open_flag);
2230 if (error) {
2231 if (will_truncate)
2232 mnt_drop_write(nd->path.mnt);
2233 goto exit;
2234 }
2235 filp = nameidata_to_filp(nd);
2236 if (!IS_ERR(filp)) {
2237 error = ima_file_check(filp, acc_mode);
2238 if (error) {
2239 fput(filp);
2240 filp = ERR_PTR(error);
2241 }
2242 }
2243 if (!IS_ERR(filp)) {
2244 if (will_truncate) {
2245 error = handle_truncate(filp);
2246 if (error) {
2247 fput(filp);
2248 filp = ERR_PTR(error);
2249 }
2250 }
2251 }
2252 /*
2253 * It is now safe to drop the mnt write
2254 * because the filp has had a write taken
2255 * on its behalf.
2256 */
2257 if (will_truncate)
2258 mnt_drop_write(nd->path.mnt);
2259 path_put(&nd->path);
2260 return filp;
2261
2262exit:
2263 path_put(&nd->path);
2264 return ERR_PTR(error);
2265}
2266
2267/* 2085/*
2268 * Handle O_CREAT case for do_filp_open 2086 * Handle the last step of open()
2269 */ 2087 */
2270static struct file *do_last(struct nameidata *nd, struct path *path, 2088static struct file *do_last(struct nameidata *nd, struct path *path,
2271 int open_flag, int acc_mode, 2089 const struct open_flags *op, const char *pathname)
2272 int mode, const char *pathname)
2273{ 2090{
2274 struct dentry *dir = nd->path.dentry; 2091 struct dentry *dir = nd->path.dentry;
2092 struct dentry *dentry;
2093 int open_flag = op->open_flag;
2094 int will_truncate = open_flag & O_TRUNC;
2095 int want_write = 0;
2096 int acc_mode = op->acc_mode;
2275 struct file *filp; 2097 struct file *filp;
2276 int error = -EISDIR; 2098 int error;
2099
2100 nd->flags &= ~LOOKUP_PARENT;
2101 nd->flags |= op->intent;
2277 2102
2278 switch (nd->last_type) { 2103 switch (nd->last_type) {
2279 case LAST_DOTDOT: 2104 case LAST_DOTDOT:
2280 follow_dotdot(nd);
2281 dir = nd->path.dentry;
2282 case LAST_DOT: 2105 case LAST_DOT:
2283 if (need_reval_dot(dir)) { 2106 error = handle_dots(nd, nd->last_type);
2284 int status = d_revalidate(nd->path.dentry, nd); 2107 if (error)
2285 if (!status) 2108 return ERR_PTR(error);
2286 status = -ESTALE;
2287 if (status < 0) {
2288 error = status;
2289 goto exit;
2290 }
2291 }
2292 /* fallthrough */ 2109 /* fallthrough */
2293 case LAST_ROOT: 2110 case LAST_ROOT:
2294 goto exit; 2111 if (nd->flags & LOOKUP_RCU) {
2112 if (nameidata_drop_rcu_last(nd))
2113 return ERR_PTR(-ECHILD);
2114 }
2115 error = handle_reval_path(nd);
2116 if (error)
2117 goto exit;
2118 audit_inode(pathname, nd->path.dentry);
2119 if (open_flag & O_CREAT) {
2120 error = -EISDIR;
2121 goto exit;
2122 }
2123 goto ok;
2295 case LAST_BIND: 2124 case LAST_BIND:
2125 /* can't be RCU mode here */
2126 error = handle_reval_path(nd);
2127 if (error)
2128 goto exit;
2296 audit_inode(pathname, dir); 2129 audit_inode(pathname, dir);
2297 goto ok; 2130 goto ok;
2298 } 2131 }
2299 2132
2133 if (!(open_flag & O_CREAT)) {
2134 int symlink_ok = 0;
2135 if (nd->last.name[nd->last.len])
2136 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2137 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2138 symlink_ok = 1;
2139 /* we _can_ be in RCU mode here */
2140 error = walk_component(nd, path, &nd->last, LAST_NORM,
2141 !symlink_ok);
2142 if (error < 0)
2143 return ERR_PTR(error);
2144 if (error) /* symlink */
2145 return NULL;
2146 /* sayonara */
2147 if (nd->flags & LOOKUP_RCU) {
2148 if (nameidata_drop_rcu_last(nd))
2149 return ERR_PTR(-ECHILD);
2150 }
2151
2152 error = -ENOTDIR;
2153 if (nd->flags & LOOKUP_DIRECTORY) {
2154 if (!nd->inode->i_op->lookup)
2155 goto exit;
2156 }
2157 audit_inode(pathname, nd->path.dentry);
2158 goto ok;
2159 }
2160
2161 /* create side of things */
2162
2163 if (nd->flags & LOOKUP_RCU) {
2164 if (nameidata_drop_rcu_last(nd))
2165 return ERR_PTR(-ECHILD);
2166 }
2167
2168 audit_inode(pathname, dir);
2169 error = -EISDIR;
2300 /* trailing slashes? */ 2170 /* trailing slashes? */
2301 if (nd->last.name[nd->last.len]) 2171 if (nd->last.name[nd->last.len])
2302 goto exit; 2172 goto exit;
2303 2173
2304 mutex_lock(&dir->d_inode->i_mutex); 2174 mutex_lock(&dir->d_inode->i_mutex);
2305 2175
2306 path->dentry = lookup_hash(nd); 2176 dentry = lookup_hash(nd);
2307 path->mnt = nd->path.mnt; 2177 error = PTR_ERR(dentry);
2308 2178 if (IS_ERR(dentry)) {
2309 error = PTR_ERR(path->dentry);
2310 if (IS_ERR(path->dentry)) {
2311 mutex_unlock(&dir->d_inode->i_mutex); 2179 mutex_unlock(&dir->d_inode->i_mutex);
2312 goto exit; 2180 goto exit;
2313 } 2181 }
2314 2182
2315 if (IS_ERR(nd->intent.open.file)) { 2183 path->dentry = dentry;
2316 error = PTR_ERR(nd->intent.open.file); 2184 path->mnt = nd->path.mnt;
2317 goto exit_mutex_unlock;
2318 }
2319 2185
2320 /* Negative dentry, just create the file */ 2186 /* Negative dentry, just create the file */
2321 if (!path->dentry->d_inode) { 2187 if (!dentry->d_inode) {
2188 int mode = op->mode;
2189 if (!IS_POSIXACL(dir->d_inode))
2190 mode &= ~current_umask();
2322 /* 2191 /*
2323 * This write is needed to ensure that a 2192 * This write is needed to ensure that a
2324 * ro->rw transition does not occur between 2193 * rw->ro transition does not occur between
2325 * the time when the file is created and when 2194 * the time when the file is created and when
2326 * a permanent write count is taken through 2195 * a permanent write count is taken through
2327 * the 'struct file' in nameidata_to_filp(). 2196 * the 'struct file' in nameidata_to_filp().
@@ -2329,22 +2198,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2329 error = mnt_want_write(nd->path.mnt); 2198 error = mnt_want_write(nd->path.mnt);
2330 if (error) 2199 if (error)
2331 goto exit_mutex_unlock; 2200 goto exit_mutex_unlock;
2332 error = __open_namei_create(nd, path, open_flag, mode); 2201 want_write = 1;
2333 if (error) { 2202 /* Don't check for write permission, don't truncate */
2334 mnt_drop_write(nd->path.mnt); 2203 open_flag &= ~O_TRUNC;
2335 goto exit; 2204 will_truncate = 0;
2336 } 2205 acc_mode = MAY_OPEN;
2337 filp = nameidata_to_filp(nd); 2206 error = security_path_mknod(&nd->path, dentry, mode, 0);
2338 mnt_drop_write(nd->path.mnt); 2207 if (error)
2339 path_put(&nd->path); 2208 goto exit_mutex_unlock;
2340 if (!IS_ERR(filp)) { 2209 error = vfs_create(dir->d_inode, dentry, mode, nd);
2341 error = ima_file_check(filp, acc_mode); 2210 if (error)
2342 if (error) { 2211 goto exit_mutex_unlock;
2343 fput(filp); 2212 mutex_unlock(&dir->d_inode->i_mutex);
2344 filp = ERR_PTR(error); 2213 dput(nd->path.dentry);
2345 } 2214 nd->path.dentry = dentry;
2346 } 2215 goto common;
2347 return filp;
2348 } 2216 }
2349 2217
2350 /* 2218 /*
@@ -2374,7 +2242,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2374 if (S_ISDIR(nd->inode->i_mode)) 2242 if (S_ISDIR(nd->inode->i_mode))
2375 goto exit; 2243 goto exit;
2376ok: 2244ok:
2377 filp = finish_open(nd, open_flag, acc_mode); 2245 if (!S_ISREG(nd->inode->i_mode))
2246 will_truncate = 0;
2247
2248 if (will_truncate) {
2249 error = mnt_want_write(nd->path.mnt);
2250 if (error)
2251 goto exit;
2252 want_write = 1;
2253 }
2254common:
2255 error = may_open(&nd->path, acc_mode, open_flag);
2256 if (error)
2257 goto exit;
2258 filp = nameidata_to_filp(nd);
2259 if (!IS_ERR(filp)) {
2260 error = ima_file_check(filp, op->acc_mode);
2261 if (error) {
2262 fput(filp);
2263 filp = ERR_PTR(error);
2264 }
2265 }
2266 if (!IS_ERR(filp)) {
2267 if (will_truncate) {
2268 error = handle_truncate(filp);
2269 if (error) {
2270 fput(filp);
2271 filp = ERR_PTR(error);
2272 }
2273 }
2274 }
2275out:
2276 if (want_write)
2277 mnt_drop_write(nd->path.mnt);
2278 path_put(&nd->path);
2378 return filp; 2279 return filp;
2379 2280
2380exit_mutex_unlock: 2281exit_mutex_unlock:
@@ -2382,197 +2283,103 @@ exit_mutex_unlock:
2382exit_dput: 2283exit_dput:
2383 path_put_conditional(path, nd); 2284 path_put_conditional(path, nd);
2384exit: 2285exit:
2385 path_put(&nd->path); 2286 filp = ERR_PTR(error);
2386 return ERR_PTR(error); 2287 goto out;
2387} 2288}
2388 2289
2389/* 2290static struct file *path_openat(int dfd, const char *pathname,
2390 * Note that the low bits of the passed in "open_flag" 2291 struct nameidata *nd, const struct open_flags *op, int flags)
2391 * are not the same as in the local variable "flag". See
2392 * open_to_namei_flags() for more details.
2393 */
2394struct file *do_filp_open(int dfd, const char *pathname,
2395 int open_flag, int mode, int acc_mode)
2396{ 2292{
2293 struct file *base = NULL;
2397 struct file *filp; 2294 struct file *filp;
2398 struct nameidata nd;
2399 int error;
2400 struct path path; 2295 struct path path;
2401 int count = 0; 2296 int error;
2402 int flag = open_to_namei_flags(open_flag);
2403 int flags;
2404
2405 if (!(open_flag & O_CREAT))
2406 mode = 0;
2407
2408 /* Must never be set by userspace */
2409 open_flag &= ~FMODE_NONOTIFY;
2410
2411 /*
2412 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2413 * check for O_DSYNC if they need any syncing at all we enforce it's
2414 * always set instead of having to deal with possibly weird behaviour
2415 * for malicious applications setting only __O_SYNC.
2416 */
2417 if (open_flag & __O_SYNC)
2418 open_flag |= O_DSYNC;
2419
2420 if (!acc_mode)
2421 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2422
2423 /* O_TRUNC implies we need access checks for write permissions */
2424 if (open_flag & O_TRUNC)
2425 acc_mode |= MAY_WRITE;
2426
2427 /* Allow the LSM permission hook to distinguish append
2428 access from general write access. */
2429 if (open_flag & O_APPEND)
2430 acc_mode |= MAY_APPEND;
2431
2432 flags = LOOKUP_OPEN;
2433 if (open_flag & O_CREAT) {
2434 flags |= LOOKUP_CREATE;
2435 if (open_flag & O_EXCL)
2436 flags |= LOOKUP_EXCL;
2437 }
2438 if (open_flag & O_DIRECTORY)
2439 flags |= LOOKUP_DIRECTORY;
2440 if (!(open_flag & O_NOFOLLOW))
2441 flags |= LOOKUP_FOLLOW;
2442 2297
2443 filp = get_empty_filp(); 2298 filp = get_empty_filp();
2444 if (!filp) 2299 if (!filp)
2445 return ERR_PTR(-ENFILE); 2300 return ERR_PTR(-ENFILE);
2446 2301
2447 filp->f_flags = open_flag; 2302 filp->f_flags = op->open_flag;
2448 nd.intent.open.file = filp; 2303 nd->intent.open.file = filp;
2449 nd.intent.open.flags = flag; 2304 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2450 nd.intent.open.create_mode = mode; 2305 nd->intent.open.create_mode = op->mode;
2451
2452 if (open_flag & O_CREAT)
2453 goto creat;
2454 2306
2455 /* !O_CREAT, simple open */ 2307 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2456 error = do_path_lookup(dfd, pathname, flags, &nd);
2457 if (unlikely(error)) 2308 if (unlikely(error))
2458 goto out_filp; 2309 goto out_filp;
2459 error = -ELOOP;
2460 if (!(nd.flags & LOOKUP_FOLLOW)) {
2461 if (nd.inode->i_op->follow_link)
2462 goto out_path;
2463 }
2464 error = -ENOTDIR;
2465 if (nd.flags & LOOKUP_DIRECTORY) {
2466 if (!nd.inode->i_op->lookup)
2467 goto out_path;
2468 }
2469 audit_inode(pathname, nd.path.dentry);
2470 filp = finish_open(&nd, open_flag, acc_mode);
2471 release_open_intent(&nd);
2472 return filp;
2473 2310
2474creat: 2311 current->total_link_count = 0;
2475 /* OK, have to create the file. Find the parent. */ 2312 error = link_path_walk(pathname, nd);
2476 error = path_init_rcu(dfd, pathname,
2477 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2478 if (error)
2479 goto out_filp;
2480 error = path_walk_rcu(pathname, &nd);
2481 path_finish_rcu(&nd);
2482 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2483 /* slower, locked walk */
2484 if (error == -ESTALE) {
2485reval:
2486 flags |= LOOKUP_REVAL;
2487 }
2488 error = path_init(dfd, pathname,
2489 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2490 if (error)
2491 goto out_filp;
2492
2493 error = path_walk_simple(pathname, &nd);
2494 }
2495 if (unlikely(error)) 2313 if (unlikely(error))
2496 goto out_filp; 2314 goto out_filp;
2497 if (unlikely(!audit_dummy_context()))
2498 audit_inode(pathname, nd.path.dentry);
2499 2315
2500 /* 2316 filp = do_last(nd, &path, op, pathname);
2501 * We have the parent and last component.
2502 */
2503 nd.flags = flags;
2504 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2505 while (unlikely(!filp)) { /* trailing symlink */ 2317 while (unlikely(!filp)) { /* trailing symlink */
2506 struct path link = path; 2318 struct path link = path;
2507 struct inode *linki = link.dentry->d_inode;
2508 void *cookie; 2319 void *cookie;
2509 error = -ELOOP; 2320 if (!(nd->flags & LOOKUP_FOLLOW)) {
2510 if (!(nd.flags & LOOKUP_FOLLOW)) 2321 path_put_conditional(&path, nd);
2511 goto exit_dput; 2322 path_put(&nd->path);
2512 if (count++ == 32) 2323 filp = ERR_PTR(-ELOOP);
2513 goto exit_dput; 2324 break;
2514 /*
2515 * This is subtle. Instead of calling do_follow_link() we do
2516 * the thing by hand. The reason is that this way we have zero
2517 * link_count and path_walk() (called from ->follow_link)
2518 * honoring LOOKUP_PARENT. After that we have the parent and
2519 * last component, i.e. we are in the same situation as after
2520 * the first path_walk(). Well, almost - if the last component
2521 * is normal we get its copy stored in nd->last.name and we will
2522 * have to putname() it when we are done. Procfs-like symlinks
2523 * just set LAST_BIND.
2524 */
2525 nd.flags |= LOOKUP_PARENT;
2526 error = security_inode_follow_link(link.dentry, &nd);
2527 if (error)
2528 goto exit_dput;
2529 error = __do_follow_link(&link, &nd, &cookie);
2530 if (unlikely(error)) {
2531 if (!IS_ERR(cookie) && linki->i_op->put_link)
2532 linki->i_op->put_link(link.dentry, &nd, cookie);
2533 /* nd.path had been dropped */
2534 nd.path = link;
2535 goto out_path;
2536 } 2325 }
2537 nd.flags &= ~LOOKUP_PARENT; 2326 nd->flags |= LOOKUP_PARENT;
2538 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2327 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2539 if (linki->i_op->put_link) 2328 error = follow_link(&link, nd, &cookie);
2540 linki->i_op->put_link(link.dentry, &nd, cookie); 2329 if (unlikely(error))
2541 path_put(&link); 2330 filp = ERR_PTR(error);
2331 else
2332 filp = do_last(nd, &path, op, pathname);
2333 put_link(nd, &link, cookie);
2542 } 2334 }
2543out: 2335out:
2544 if (nd.root.mnt) 2336 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2545 path_put(&nd.root); 2337 path_put(&nd->root);
2546 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2338 if (base)
2547 goto reval; 2339 fput(base);
2548 release_open_intent(&nd); 2340 release_open_intent(nd);
2549 return filp; 2341 return filp;
2550 2342
2551exit_dput:
2552 path_put_conditional(&path, &nd);
2553out_path:
2554 path_put(&nd.path);
2555out_filp: 2343out_filp:
2556 filp = ERR_PTR(error); 2344 filp = ERR_PTR(error);
2557 goto out; 2345 goto out;
2558} 2346}
2559 2347
2560/** 2348struct file *do_filp_open(int dfd, const char *pathname,
2561 * filp_open - open file and return file pointer 2349 const struct open_flags *op, int flags)
2562 * 2350{
2563 * @filename: path to open 2351 struct nameidata nd;
2564 * @flags: open flags as per the open(2) second argument 2352 struct file *filp;
2565 * @mode: mode for the new file if O_CREAT is set, else ignored 2353
2566 * 2354 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2567 * This is the helper to open a file from kernelspace if you really 2355 if (unlikely(filp == ERR_PTR(-ECHILD)))
2568 * have to. But in general you should not do this, so please move 2356 filp = path_openat(dfd, pathname, &nd, op, flags);
2569 * along, nothing to see here.. 2357 if (unlikely(filp == ERR_PTR(-ESTALE)))
2570 */ 2358 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2571struct file *filp_open(const char *filename, int flags, int mode) 2359 return filp;
2360}
2361
2362struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2363 const char *name, const struct open_flags *op, int flags)
2572{ 2364{
2573 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2365 struct nameidata nd;
2366 struct file *file;
2367
2368 nd.root.mnt = mnt;
2369 nd.root.dentry = dentry;
2370
2371 flags |= LOOKUP_ROOT;
2372
2373 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2374 return ERR_PTR(-ELOOP);
2375
2376 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2377 if (unlikely(file == ERR_PTR(-ECHILD)))
2378 file = path_openat(-1, name, &nd, op, flags);
2379 if (unlikely(file == ERR_PTR(-ESTALE)))
2380 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2381 return file;
2574} 2382}
2575EXPORT_SYMBOL(filp_open);
2576 2383
2577/** 2384/**
2578 * lookup_create - lookup a dentry, creating it if it doesn't exist 2385 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -3111,7 +2918,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3111 return error; 2918 return error;
3112 2919
3113 mutex_lock(&inode->i_mutex); 2920 mutex_lock(&inode->i_mutex);
3114 error = dir->i_op->link(old_dentry, dir, new_dentry); 2921 /* Make sure we don't allow creating hardlink to an unlinked file */
2922 if (inode->i_nlink == 0)
2923 error = -ENOENT;
2924 else
2925 error = dir->i_op->link(old_dentry, dir, new_dentry);
3115 mutex_unlock(&inode->i_mutex); 2926 mutex_unlock(&inode->i_mutex);
3116 if (!error) 2927 if (!error)
3117 fsnotify_link(dir, inode, new_dentry); 2928 fsnotify_link(dir, inode, new_dentry);
@@ -3133,15 +2944,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3133 struct dentry *new_dentry; 2944 struct dentry *new_dentry;
3134 struct nameidata nd; 2945 struct nameidata nd;
3135 struct path old_path; 2946 struct path old_path;
2947 int how = 0;
3136 int error; 2948 int error;
3137 char *to; 2949 char *to;
3138 2950
3139 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2951 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3140 return -EINVAL; 2952 return -EINVAL;
2953 /*
2954 * To use null names we require CAP_DAC_READ_SEARCH
2955 * This ensures that not everyone will be able to create a
2956 * hardlink using the passed file descriptor.
2957 */
2958 if (flags & AT_EMPTY_PATH) {
2959 if (!capable(CAP_DAC_READ_SEARCH))
2960 return -ENOENT;
2961 how = LOOKUP_EMPTY;
2962 }
2963
2964 if (flags & AT_SYMLINK_FOLLOW)
2965 how |= LOOKUP_FOLLOW;
3141 2966
3142 error = user_path_at(olddfd, oldname, 2967 error = user_path_at(olddfd, oldname, how, &old_path);
3143 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3144 &old_path);
3145 if (error) 2968 if (error)
3146 return error; 2969 return error;
3147 2970
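The AT_EMPTY_PATH branch above turns an empty oldname into a LOOKUP_EMPTY lookup, gated on CAP_DAC_READ_SEARCH, so a file can be given a name directly from an open descriptor. A hedged userspace sketch follows; AT_EMPTY_PATH is defined by hand for older headers and the paths are arbitrary. Note that the vfs_link() hunk earlier in this diff also makes the call fail with ENOENT when the source has already been unlinked, i.e. its i_nlink is zero.

/* Sketch: give an already-open file a (new) name via its descriptor. */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef AT_EMPTY_PATH
#define AT_EMPTY_PATH 0x1000
#endif

int main(void)
{
	int fd = open("/tmp/linkat-src", O_CREAT | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Fails with ENOENT without CAP_DAC_READ_SEARCH, per the hunk above. */
	if (linkat(fd, "", AT_FDCWD, "/tmp/linkat-dst", AT_EMPTY_PATH) < 0)
		perror("linkat");

	close(fd);
	return 0;
}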
@@ -3578,7 +3401,7 @@ EXPORT_SYMBOL(page_readlink);
3578EXPORT_SYMBOL(__page_symlink); 3401EXPORT_SYMBOL(__page_symlink);
3579EXPORT_SYMBOL(page_symlink); 3402EXPORT_SYMBOL(page_symlink);
3580EXPORT_SYMBOL(page_symlink_inode_operations); 3403EXPORT_SYMBOL(page_symlink_inode_operations);
3581EXPORT_SYMBOL(path_lookup); 3404EXPORT_SYMBOL(kern_path_parent);
3582EXPORT_SYMBOL(kern_path); 3405EXPORT_SYMBOL(kern_path);
3583EXPORT_SYMBOL(vfs_path_lookup); 3406EXPORT_SYMBOL(vfs_path_lookup);
3584EXPORT_SYMBOL(inode_permission); 3407EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..d7513485c1f3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -978,7 +978,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
978 int err = 0; 978 int err = 0;
979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
980 980
981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 981 if (mnt->mnt_sb->s_op->show_devname) {
982 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
983 if (err)
984 goto out;
985 } else {
986 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
987 }
982 seq_putc(m, ' '); 988 seq_putc(m, ' ');
983 seq_path(m, &mnt_path, " \t\n\\"); 989 seq_path(m, &mnt_path, " \t\n\\");
984 seq_putc(m, ' '); 990 seq_putc(m, ' ');
@@ -1002,6 +1008,18 @@ const struct seq_operations mounts_op = {
1002 .show = show_vfsmnt 1008 .show = show_vfsmnt
1003}; 1009};
1004 1010
1011static int uuid_is_nil(u8 *uuid)
1012{
1013 int i;
1014 u8 *cp = (u8 *)uuid;
1015
1016 for (i = 0; i < 16; i++) {
1017 if (*cp++)
1018 return 0;
1019 }
1020 return 1;
1021}
1022
1005static int show_mountinfo(struct seq_file *m, void *v) 1023static int show_mountinfo(struct seq_file *m, void *v)
1006{ 1024{
1007 struct proc_mounts *p = m->private; 1025 struct proc_mounts *p = m->private;
@@ -1013,7 +1031,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1013 1031
1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1032 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1033 MAJOR(sb->s_dev), MINOR(sb->s_dev));
1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1034 if (sb->s_op->show_path)
1035 err = sb->s_op->show_path(m, mnt);
1036 else
1037 seq_dentry(m, mnt->mnt_root, " \t\n\\");
1038 if (err)
1039 goto out;
1017 seq_putc(m, ' '); 1040 seq_putc(m, ' ');
1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1041 seq_path_root(m, &mnt_path, &root, " \t\n\\");
1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1042 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1040,11 +1063,20 @@ static int show_mountinfo(struct seq_file *m, void *v)
1040 if (IS_MNT_UNBINDABLE(mnt)) 1063 if (IS_MNT_UNBINDABLE(mnt))
1041 seq_puts(m, " unbindable"); 1064 seq_puts(m, " unbindable");
1042 1065
1066 if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
1067 /* print the uuid */
1068 seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
1069
1043 /* Filesystem specific data */ 1070 /* Filesystem specific data */
1044 seq_puts(m, " - "); 1071 seq_puts(m, " - ");
1045 show_type(m, sb); 1072 show_type(m, sb);
1046 seq_putc(m, ' '); 1073 seq_putc(m, ' ');
1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1074 if (sb->s_op->show_devname)
1075 err = sb->s_op->show_devname(m, mnt);
1076 else
1077 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1078 if (err)
1079 goto out;
1048 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 1080 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
1049 err = show_sb_opts(m, sb); 1081 err = show_sb_opts(m, sb);
1050 if (err) 1082 if (err)
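The optional " uuid:%pU" tag added above is emitted only when the superblock populates s_uuid, so most records in /proc/<pid>/mountinfo will not carry it. The small reader below just surfaces the lines that do; it is a sketch that relies only on the textual tag introduced by this hunk and prints nothing on kernels or filesystems without it.

/* Sketch: print mountinfo records that carry the optional uuid: tag. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/mountinfo", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, " uuid:"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}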
@@ -1070,11 +1102,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
1070 int err = 0; 1102 int err = 0;
1071 1103
1072 /* device */ 1104 /* device */
1073 if (mnt->mnt_devname) { 1105 if (mnt->mnt_sb->s_op->show_devname) {
1074 seq_puts(m, "device "); 1106 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1075 mangle(m, mnt->mnt_devname); 1107 } else {
1076 } else 1108 if (mnt->mnt_devname) {
1077 seq_puts(m, "no device"); 1109 seq_puts(m, "device ");
1110 mangle(m, mnt->mnt_devname);
1111 } else
1112 seq_puts(m, "no device");
1113 }
1078 1114
1079 /* mount point */ 1115 /* mount point */
1080 seq_puts(m, " mounted on "); 1116 seq_puts(m, " mounted on ");
@@ -1088,7 +1124,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
1088 /* optional statistics */ 1124 /* optional statistics */
1089 if (mnt->mnt_sb->s_op->show_stats) { 1125 if (mnt->mnt_sb->s_op->show_stats) {
1090 seq_putc(m, ' '); 1126 seq_putc(m, ' ');
1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1127 if (!err)
1128 err = mnt->mnt_sb->s_op->show_stats(m, mnt);
1092 } 1129 }
1093 1130
1094 seq_putc(m, '\n'); 1131 seq_putc(m, '\n');
@@ -1244,7 +1281,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1244 */ 1281 */
1245 br_write_lock(vfsmount_lock); 1282 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) { 1283 if (mnt_get_count(mnt) != 2) {
1247 br_write_lock(vfsmount_lock); 1284 br_write_unlock(vfsmount_lock);
1248 return -EBUSY; 1285 return -EBUSY;
1249 } 1286 }
1250 br_write_unlock(vfsmount_lock); 1287 br_write_unlock(vfsmount_lock);
@@ -1767,6 +1804,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1767 if (path->dentry != path->mnt->mnt_root) 1804 if (path->dentry != path->mnt->mnt_root)
1768 return -EINVAL; 1805 return -EINVAL;
1769 1806
1807 err = security_sb_remount(sb, data);
1808 if (err)
1809 return err;
1810
1770 down_write(&sb->s_umount); 1811 down_write(&sb->s_umount);
1771 if (flags & MS_BIND) 1812 if (flags & MS_BIND)
1772 err = change_mount_flags(path->mnt, flags); 1813 err = change_mount_flags(path->mnt, flags);
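The new security_sb_remount() call above gives an LSM the chance to reject the option string of a remount before s_umount is taken and the flags are applied. Nothing changes for an allowed request; a plain remount-read-only from userspace still looks like the sketch below (the mount point is arbitrary and the call needs CAP_SYS_ADMIN).

/* Sketch: remount /mnt read-only; an LSM hooked via security_sb_remount()
 * can now veto the new options before they take effect. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) < 0) {
		perror("mount(MS_REMOUNT)");
		return 1;
	}
	return 0;
}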
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe50..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e7..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1307/* 1327/*
1308 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1309 */ 1329 */
1310static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1311 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1312 const char *ip_addr, 1332 const char *ip_addr,
1313 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1314 int flags) 1334 int noresvport)
1315{ 1335{
1316 int error; 1336 int error;
1317 1337
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1325 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1326 1346
1327 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1328 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1329 if (error < 0) 1349 if (error < 0)
1330 goto error; 1350 goto error;
1331 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1378 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1379 1399
1380 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1381 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1382 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1383 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1384 goto error; 1405 goto error;
1385 } 1406 }
1386 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1387 server->flags); 1408 /*
1388 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1389 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1390 1416
1391 server->nfs_client = clp; 1417 server->nfs_client = clp;
1392 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1393 return 0; 1419 return 0;
1394
1395error_put:
1396 nfs_put_client(clp);
1397error: 1420error:
1398 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1399 return error; 1422 return error;
1400} 1423}
1401 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
1455 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1402 1466
1403/* 1467/*
1404 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1435 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1436 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1437 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1438 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1439 if (fattr == NULL) 1507 if (fattr == NULL)
1440 return -ENOMEM; 1508 return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1504 if (error < 0) 1572 if (error < 0)
1505 goto error; 1573 goto error;
1506 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1507 if (data->rsize) 1582 if (data->rsize)
1508 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1509 if (data->wsize) 1584 if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1921} 1996}
1922 1997
1923#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904d..abdf38d5971d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1169,11 +1169,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1169 iput(inode); 1169 iput(inode);
1170} 1170}
1171 1171
1172static void nfs_d_release(struct dentry *dentry)
1173{
1174 /* free cached devname value, if it survived that far */
1175 if (unlikely(dentry->d_fsdata)) {
1176 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1177 WARN_ON(1);
1178 else
1179 kfree(dentry->d_fsdata);
1180 }
1181}
1182
1172const struct dentry_operations nfs_dentry_operations = { 1183const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1184 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1185 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1186 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount, 1187 .d_automount = nfs_d_automount,
1188 .d_release = nfs_d_release,
1177}; 1189};
1178 1190
1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1191static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1248,6 +1260,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_delete = nfs_dentry_delete, 1260 .d_delete = nfs_dentry_delete,
1249 .d_iput = nfs_dentry_iput, 1261 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount, 1262 .d_automount = nfs_d_automount,
1263 .d_release = nfs_d_release,
1251}; 1264};
1252 1265
1253/* 1266/*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d1..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
649{ 650{
650 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
651 652
652 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
653 return;
654} 654}
655 655
656/* 656/*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
938 if (retval) 938 if (retval)
939 goto out; 939 goto out;
940 940
941 task_io_account_read(count);
942
941 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
942 if (retval > 0) 944 if (retval > 0)
943 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
999 if (retval) 1001 if (retval)
1000 goto out; 1002 goto out;
1001 1003
1004 task_io_account_write(count);
1005
1002 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
1003 1007
1004 if (retval > 0) 1008 if (retval > 0)
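The task_io_account_read()/task_io_account_write() calls added above make NFS O_DIRECT transfers show up in the per-task I/O accounting that buffered I/O already feeds. A small sketch that simply dumps those counters, assuming CONFIG_TASK_IO_ACCOUNTING and the usual /proc/self/io field names (read_bytes, write_bytes):

#include <stdio.h>

/* Dump the per-task I/O counters that the calls above now update for
 * NFS direct reads and writes. */
int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/io", "r");

	if (!f) {
		perror("fopen /proc/self/io");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}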
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..d85a534b15cd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
389 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 390start:
395 /* 391 /*
396 * Prevent starvation issues if someone is doing a consistency 392 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291f..1084792bc0fe 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
75/* 75/*
76 * get an NFS2/NFS3 root dentry from the root filehandle 76 * get an NFS2/NFS3 root dentry from the root filehandle
77 */ 77 */
78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) 78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
79 const char *devname)
79{ 80{
80 struct nfs_server *server = NFS_SB(sb); 81 struct nfs_server *server = NFS_SB(sb);
81 struct nfs_fsinfo fsinfo; 82 struct nfs_fsinfo fsinfo;
82 struct dentry *ret; 83 struct dentry *ret;
83 struct inode *inode; 84 struct inode *inode;
85 void *name = kstrdup(devname, GFP_KERNEL);
84 int error; 86 int error;
85 87
88 if (!name)
89 return ERR_PTR(-ENOMEM);
90
86 /* get the actual root for this mount */ 91 /* get the actual root for this mount */
87 fsinfo.fattr = nfs_alloc_fattr(); 92 fsinfo.fattr = nfs_alloc_fattr();
88 if (fsinfo.fattr == NULL) 93 if (fsinfo.fattr == NULL) {
94 kfree(name);
89 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 }
90 97
91 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 98 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
92 if (error < 0) { 99 if (error < 0) {
@@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 126 }
120 127
121 security_d_instantiate(ret, inode); 128 security_d_instantiate(ret, inode);
129 spin_lock(&ret->d_lock);
130 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
131 ret->d_fsdata = name;
132 name = NULL;
133 }
134 spin_unlock(&ret->d_lock);
122out: 135out:
136 if (name)
137 kfree(name);
123 nfs_free_fattr(fsinfo.fattr); 138 nfs_free_fattr(fsinfo.fattr);
124 return ret; 139 return ret;
125} 140}
@@ -169,27 +184,35 @@ out:
169/* 184/*
170 * get an NFS4 root dentry from the root filehandle 185 * get an NFS4 root dentry from the root filehandle
171 */ 186 */
172struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 187struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
188 const char *devname)
173{ 189{
174 struct nfs_server *server = NFS_SB(sb); 190 struct nfs_server *server = NFS_SB(sb);
175 struct nfs_fattr *fattr = NULL; 191 struct nfs_fattr *fattr = NULL;
176 struct dentry *ret; 192 struct dentry *ret;
177 struct inode *inode; 193 struct inode *inode;
194 void *name = kstrdup(devname, GFP_KERNEL);
178 int error; 195 int error;
179 196
180 dprintk("--> nfs4_get_root()\n"); 197 dprintk("--> nfs4_get_root()\n");
181 198
199 if (!name)
200 return ERR_PTR(-ENOMEM);
201
182 /* get the info about the server and filesystem */ 202 /* get the info about the server and filesystem */
183 error = nfs4_server_capabilities(server, mntfh); 203 error = nfs4_server_capabilities(server, mntfh);
184 if (error < 0) { 204 if (error < 0) {
185 dprintk("nfs_get_root: getcaps error = %d\n", 205 dprintk("nfs_get_root: getcaps error = %d\n",
186 -error); 206 -error);
207 kfree(name);
187 return ERR_PTR(error); 208 return ERR_PTR(error);
188 } 209 }
189 210
190 fattr = nfs_alloc_fattr(); 211 fattr = nfs_alloc_fattr();
191 if (fattr == NULL) 212 if (fattr == NULL) {
192 return ERR_PTR(-ENOMEM);; 213 kfree(name);
214 return ERR_PTR(-ENOMEM);
215 }
193 216
194 /* get the actual root for this mount */ 217 /* get the actual root for this mount */
195 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); 218 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -223,8 +246,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
223 } 246 }
224 247
225 security_d_instantiate(ret, inode); 248 security_d_instantiate(ret, inode);
226 249 spin_lock(&ret->d_lock);
250 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
251 ret->d_fsdata = name;
252 name = NULL;
253 }
254 spin_unlock(&ret->d_lock);
227out: 255out:
256 if (name)
257 kfree(name);
228 nfs_free_fattr(fattr); 258 nfs_free_fattr(fattr);
229 dprintk("<-- nfs4_get_root()\n"); 259 dprintk("<-- nfs4_get_root()\n");
230 return ret; 260 return ret;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
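The idmap changes above add a numeric fast path in both directions: an owner string with no '@' that parses as a plain number is used as the uid/gid directly, and when mapping is disabled (NFS_CAP_UIDGID_NOMAP) or the lookup fails, the id is sent back as its decimal string. A minimal userspace model of nfs_map_string_to_numeric(), with strtoul standing in for the kernel's strict_strtoul:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace model of nfs_map_string_to_numeric() above. */
static int map_string_to_numeric(const char *name, size_t namelen, unsigned int *res)
{
	char buf[16], *end;
	unsigned long val;

	/* "user@domain" style names, or anything too long, is not numeric */
	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
		return 0;
	memcpy(buf, name, namelen);
	buf[namelen] = '\0';
	val = strtoul(buf, &end, 0);
	if (*end != '\0')
		return 0;
	*res = (unsigned int)val;
	return 1;
}

int main(void)
{
	unsigned int id = 0;

	printf("\"65534\"             -> %d (id=%u)\n",
	       map_string_to_numeric("65534", 5, &id), id);
	printf("\"alice@example.org\" -> %d\n",
	       map_string_to_numeric("alice@example.org", 17, &id));
	return 0;
}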
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb4..01768e5e2c9b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
89 */ 90 */
90u64 nfs_compat_user_ino64(u64 fileid) 91u64 nfs_compat_user_ino64(u64 fileid)
91{ 92{
92 int ino; 93#ifdef CONFIG_COMPAT
94 compat_ulong_t ino;
95#else
96 unsigned long ino;
97#endif
93 98
94 if (enable_ino64) 99 if (enable_ino64)
95 return fileid; 100 return fileid;
@@ -1513,7 +1518,7 @@ static int nfsiod_start(void)
1513{ 1518{
1514 struct workqueue_struct *wq; 1519 struct workqueue_struct *wq;
1515 dprintk("RPC: creating workqueue nfsiod\n"); 1520 dprintk("RPC: creating workqueue nfsiod\n");
1516 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); 1521 wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
1517 if (wq == NULL) 1522 if (wq == NULL)
1518 return -ENOMEM; 1523 return -ENOMEM;
1519 nfsiod_workqueue = wq; 1524 nfsiod_workqueue = wq;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbdabc67..72e0bddf7a2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
148 struct nfs_fattr *); 148 struct nfs_fattr *);
149extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 149extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
150extern int nfs4_check_client_ready(struct nfs_client *clp); 150extern int nfs4_check_client_ready(struct nfs_client *clp);
151extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
152 const struct sockaddr *ds_addr,
153 int ds_addrlen, int ds_proto);
151#ifdef CONFIG_PROC_FS 154#ifdef CONFIG_PROC_FS
152extern int __init nfs_fs_proc_init(void); 155extern int __init nfs_fs_proc_init(void);
153extern void nfs_fs_proc_exit(void); 156extern void nfs_fs_proc_exit(void);
@@ -163,10 +166,10 @@ static inline void nfs_fs_proc_exit(void)
163 166
164/* nfs4namespace.c */ 167/* nfs4namespace.c */
165#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
166extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 169extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
167#else 170#else
168static inline 171static inline
169struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 172struct vfsmount *nfs_do_refmount(struct dentry *dentry)
170{ 173{
171 return ERR_PTR(-ENOENT); 174 return ERR_PTR(-ENOENT);
172} 175}
@@ -213,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
213extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
214#endif 217#endif
215 218
219extern int nfs4_init_ds_session(struct nfs_client *clp);
220
216/* proc.c */ 221/* proc.c */
217void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 222void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
223extern int nfs_init_client(struct nfs_client *clp,
224 const struct rpc_timeout *timeparms,
225 const char *ip_addr, rpc_authflavor_t authflavour,
226 int noresvport);
218 227
219/* dir.c */ 228/* dir.c */
220extern int nfs_access_cache_shrinker(struct shrinker *shrink, 229extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -247,24 +256,30 @@ extern void nfs_sb_active(struct super_block *sb);
247extern void nfs_sb_deactive(struct super_block *sb); 256extern void nfs_sb_deactive(struct super_block *sb);
248 257
249/* namespace.c */ 258/* namespace.c */
250extern char *nfs_path(const char *base, 259extern char *nfs_path(char **p, struct dentry *dentry,
251 const struct dentry *droot,
252 const struct dentry *dentry,
253 char *buffer, ssize_t buflen); 260 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path); 261extern struct vfsmount *nfs_d_automount(struct path *path);
255 262
256/* getroot.c */ 263/* getroot.c */
257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 264extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
265 const char *);
258#ifdef CONFIG_NFS_V4 266#ifdef CONFIG_NFS_V4
259extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 267extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
268 const char *);
260 269
261extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 270extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
262#endif 271#endif
263 272
264/* read.c */ 273/* read.c */
274extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
275 const struct rpc_call_ops *call_ops);
265extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 276extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
266 277
267/* write.c */ 278/* write.c */
279extern int nfs_initiate_write(struct nfs_write_data *data,
280 struct rpc_clnt *clnt,
281 const struct rpc_call_ops *call_ops,
282 int how);
268extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
269#ifdef CONFIG_MIGRATION 284#ifdef CONFIG_MIGRATION
270extern int nfs_migrate_page(struct address_space *, 285extern int nfs_migrate_page(struct address_space *,
@@ -274,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
274#endif 289#endif
275 290
276/* nfs4proc.c */ 291/* nfs4proc.c */
292extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
293extern int nfs4_init_client(struct nfs_client *clp,
294 const struct rpc_timeout *timeparms,
295 const char *ip_addr,
296 rpc_authflavor_t authflavour,
297 int noresvport);
298extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
277extern int _nfs4_call_sync(struct nfs_server *server, 299extern int _nfs4_call_sync(struct nfs_server *server,
278 struct rpc_message *msg, 300 struct rpc_message *msg,
279 struct nfs4_sequence_args *args, 301 struct nfs4_sequence_args *args,
@@ -288,12 +310,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
288/* 310/*
289 * Determine the device name as a string 311 * Determine the device name as a string
290 */ 312 */
291static inline char *nfs_devname(const struct vfsmount *mnt_parent, 313static inline char *nfs_devname(struct dentry *dentry,
292 const struct dentry *dentry,
293 char *buffer, ssize_t buflen) 314 char *buffer, ssize_t buflen)
294{ 315{
295 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, 316 char *dummy;
296 dentry, buffer, buflen); 317 return nfs_path(&dummy, dentry, buffer, buflen);
297} 318}
298 319
299/* 320/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca8..c0b8344db0c6 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -25,33 +25,30 @@ static LIST_HEAD(nfs_automount_list);
25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); 25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 26int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 27
28static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 28static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 const struct dentry *dentry,
30 struct nfs_fh *fh, 29 struct nfs_fh *fh,
31 struct nfs_fattr *fattr); 30 struct nfs_fattr *fattr);
32 31
33/* 32/*
34 * nfs_path - reconstruct the path given an arbitrary dentry 33 * nfs_path - reconstruct the path given an arbitrary dentry
35 * @base - arbitrary string to prepend to the path 34 * @base - used to return pointer to the end of devname part of path
36 * @droot - pointer to root dentry for mountpoint
37 * @dentry - pointer to dentry 35 * @dentry - pointer to dentry
38 * @buffer - result buffer 36 * @buffer - result buffer
39 * @buflen - length of buffer 37 * @buflen - length of buffer
40 * 38 *
41 * Helper function for constructing the path from the 39 * Helper function for constructing the server pathname
42 * root dentry to an arbitrary hashed dentry. 40 * by arbitrary hashed dentry.
43 * 41 *
44 * This is mainly for use in figuring out the path on the 42 * This is mainly for use in figuring out the path on the
45 * server side when automounting on top of an existing partition. 43 * server side when automounting on top of an existing partition
44 * and in generating /proc/mounts and friends.
46 */ 45 */
47char *nfs_path(const char *base, 46char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
48 const struct dentry *droot,
49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen)
51{ 47{
52 char *end; 48 char *end;
53 int namelen; 49 int namelen;
54 unsigned seq; 50 unsigned seq;
51 const char *base;
55 52
56rename_retry: 53rename_retry:
57 end = buffer+buflen; 54 end = buffer+buflen;
@@ -60,7 +57,10 @@ rename_retry:
60 57
61 seq = read_seqbegin(&rename_lock); 58 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock(); 59 rcu_read_lock();
63 while (!IS_ROOT(dentry) && dentry != droot) { 60 while (1) {
61 spin_lock(&dentry->d_lock);
62 if (IS_ROOT(dentry))
63 break;
64 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
65 buflen -= namelen + 1; 65 buflen -= namelen + 1;
66 if (buflen < 0) 66 if (buflen < 0)
@@ -68,27 +68,47 @@ rename_retry:
68 end -= namelen; 68 end -= namelen;
69 memcpy(end, dentry->d_name.name, namelen); 69 memcpy(end, dentry->d_name.name, namelen);
70 *--end = '/'; 70 *--end = '/';
71 spin_unlock(&dentry->d_lock);
71 dentry = dentry->d_parent; 72 dentry = dentry->d_parent;
72 } 73 }
73 rcu_read_unlock(); 74 if (read_seqretry(&rename_lock, seq)) {
74 if (read_seqretry(&rename_lock, seq)) 75 spin_unlock(&dentry->d_lock);
76 rcu_read_unlock();
75 goto rename_retry; 77 goto rename_retry;
78 }
76 if (*end != '/') { 79 if (*end != '/') {
77 if (--buflen < 0) 80 if (--buflen < 0) {
81 spin_unlock(&dentry->d_lock);
82 rcu_read_unlock();
78 goto Elong; 83 goto Elong;
84 }
79 *--end = '/'; 85 *--end = '/';
80 } 86 }
87 *p = end;
88 base = dentry->d_fsdata;
89 if (!base) {
90 spin_unlock(&dentry->d_lock);
91 rcu_read_unlock();
92 WARN_ON(1);
93 return end;
94 }
81 namelen = strlen(base); 95 namelen = strlen(base);
82 /* Strip off excess slashes in base string */ 96 /* Strip off excess slashes in base string */
83 while (namelen > 0 && base[namelen - 1] == '/') 97 while (namelen > 0 && base[namelen - 1] == '/')
84 namelen--; 98 namelen--;
85 buflen -= namelen; 99 buflen -= namelen;
86 if (buflen < 0) 100 if (buflen < 0) {
101 spin_lock(&dentry->d_lock);
102 rcu_read_unlock();
87 goto Elong; 103 goto Elong;
104 }
88 end -= namelen; 105 end -= namelen;
89 memcpy(end, base, namelen); 106 memcpy(end, base, namelen);
107 spin_unlock(&dentry->d_lock);
108 rcu_read_unlock();
90 return end; 109 return end;
91Elong_unlock: 110Elong_unlock:
 111 spin_unlock(&dentry->d_lock);
92 rcu_read_unlock(); 112 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq)) 113 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry; 114 goto rename_retry;
@@ -143,9 +163,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
143 } 163 }
144 164
145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 165 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
146 mnt = nfs_do_refmount(path->mnt, path->dentry); 166 mnt = nfs_do_refmount(path->dentry);
147 else 167 else
148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); 168 mnt = nfs_do_submount(path->dentry, fh, fattr);
149 if (IS_ERR(mnt)) 169 if (IS_ERR(mnt))
150 goto out; 170 goto out;
151 171
@@ -209,19 +229,17 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
209 229
210/** 230/**
211 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary 231 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
212 * @mnt_parent - mountpoint of parent directory
213 * @dentry - parent directory 232 * @dentry - parent directory
214 * @fh - filehandle for new root dentry 233 * @fh - filehandle for new root dentry
215 * @fattr - attributes for new root inode 234 * @fattr - attributes for new root inode
216 * 235 *
217 */ 236 */
218static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 237static struct vfsmount *nfs_do_submount(struct dentry *dentry,
219 const struct dentry *dentry,
220 struct nfs_fh *fh, 238 struct nfs_fh *fh,
221 struct nfs_fattr *fattr) 239 struct nfs_fattr *fattr)
222{ 240{
223 struct nfs_clone_mount mountdata = { 241 struct nfs_clone_mount mountdata = {
224 .sb = mnt_parent->mnt_sb, 242 .sb = dentry->d_sb,
225 .dentry = dentry, 243 .dentry = dentry,
226 .fh = fh, 244 .fh = fh,
227 .fattr = fattr, 245 .fattr = fattr,
@@ -237,11 +255,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
237 dentry->d_name.name); 255 dentry->d_name.name);
238 if (page == NULL) 256 if (page == NULL)
239 goto out; 257 goto out;
240 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 258 devname = nfs_devname(dentry, page, PAGE_SIZE);
241 mnt = (struct vfsmount *)devname; 259 mnt = (struct vfsmount *)devname;
242 if (IS_ERR(devname)) 260 if (IS_ERR(devname))
243 goto free_page; 261 goto free_page;
244 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); 262 mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
245free_page: 263free_page:
246 free_page((unsigned long)page); 264 free_page((unsigned long)page);
247out: 265out:
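nfs_path() now walks from the given dentry all the way up to the filesystem root, filling the result buffer right to left, and then prepends the devname string that nfs_get_root()/nfs4_get_root() cached in the root dentry's d_fsdata; *p is set to where the export path begins inside that string. A toy userspace model of the right-to-left assembly (the component list and devname are made-up inputs; locking and overflow checks are omitted):

#include <stdio.h>
#include <string.h>

/* Toy model of nfs_path(): copy path components from leaf to root into the
 * tail of the buffer, then prepend the cached devname, stripping any excess
 * trailing slashes from it. */
static char *build_path(const char *base, const char *const comps[], int n,
			char *buffer, int buflen)
{
	char *end = buffer + buflen;
	int i, namelen;

	*--end = '\0';
	for (i = n - 1; i >= 0; i--) {
		namelen = strlen(comps[i]);
		end -= namelen;
		memcpy(end, comps[i], namelen);
		*--end = '/';
	}
	namelen = strlen(base);
	while (namelen > 0 && base[namelen - 1] == '/')
		namelen--;
	end -= namelen;
	memcpy(end, base, namelen);
	return end;
}

int main(void)
{
	const char *comps[] = { "dir", "subdir", "file" };
	char buf[128];

	/* prints: server:/export/dir/subdir/file */
	puts(build_path("server:/export/", comps, 3, buf, sizeof(buf)));
	return 0;
}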
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..d0c80d8b3f96 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..c64be1cff080 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 252extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 254 int cache_reply, struct rpc_task *task);
255extern int nfs41_setup_sequence(struct nfs4_session *session,
256 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
257 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 258extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 259extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 260extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 262extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 263extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 264 struct nfs_fsinfo *fsinfo);
265
266static inline bool
267is_ds_only_client(struct nfs_client *clp)
268{
269 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
270 EXCHGID4_FLAG_USE_PNFS_DS;
271}
272
273static inline bool
274is_ds_client(struct nfs_client *clp)
275{
276 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
277}
262#else /* CONFIG_NFS_v4_1 */ 278#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 279static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 280{
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 292{
277 return 0; 293 return 0;
278} 294}
295
296static inline bool
297is_ds_only_client(struct nfs_client *clp)
298{
299 return false;
300}
301
302static inline bool
303is_ds_client(struct nfs_client *clp)
304{
305 return false;
306}
279#endif /* CONFIG_NFS_V4_1 */ 307#endif /* CONFIG_NFS_V4_1 */
280 308
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 309extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -298,6 +326,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
298#if defined(CONFIG_NFS_V4_1) 326#if defined(CONFIG_NFS_V4_1)
299struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 327struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
300struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 328struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
329extern void nfs4_schedule_session_recovery(struct nfs4_session *);
330#else
331static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
332{
333}
301#endif /* CONFIG_NFS_V4_1 */ 334#endif /* CONFIG_NFS_V4_1 */
302 335
303extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 336extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +340,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
307extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 340extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
308extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 341extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
309extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 342extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
310extern void nfs4_schedule_state_recovery(struct nfs_client *); 343extern void nfs4_schedule_lease_recovery(struct nfs_client *);
311extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
312extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 345extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
313extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
314extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 346extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
315extern void nfs41_handle_recall_slot(struct nfs_client *clp); 347extern void nfs41_handle_recall_slot(struct nfs_client *clp);
316extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 348extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
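is_ds_client() and is_ds_only_client() above look only at the pNFS role bits the server returned in EXCHANGE_ID: a client is usable as a data server whenever EXCHGID4_FLAG_USE_PNFS_DS is set, and is DS-only when that is the sole pNFS role bit. A small model of the two tests; the flag values are the RFC 5661 EXCHGID4 role bits as defined in include/linux/nfs4.h, restated here from memory, so treat them as assumptions:

#include <stdio.h>

#define USE_NON_PNFS	0x00010000	/* EXCHGID4_FLAG_USE_NON_PNFS (assumed) */
#define USE_PNFS_MDS	0x00020000	/* EXCHGID4_FLAG_USE_PNFS_MDS (assumed) */
#define USE_PNFS_DS	0x00040000	/* EXCHGID4_FLAG_USE_PNFS_DS (assumed) */
#define MASK_PNFS	0x00070000	/* EXCHGID4_FLAG_MASK_PNFS (assumed) */

static int is_ds_only(unsigned int exchange_flags)
{
	return (exchange_flags & MASK_PNFS) == USE_PNFS_DS;
}

static int is_ds(unsigned int exchange_flags)
{
	return (exchange_flags & USE_PNFS_DS) != 0;
}

int main(void)
{
	unsigned int ds_only = USE_PNFS_DS;
	unsigned int dual_role = USE_PNFS_MDS | USE_PNFS_DS;

	printf("DS-only client: is_ds=%d is_ds_only=%d\n",
	       is_ds(ds_only), is_ds_only(ds_only));
	printf("MDS+DS client:  is_ds=%d is_ds_only=%d\n",
	       is_ds(dual_role), is_ds_only(dual_role));
	return 0;
}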
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..428558464817 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45static loff_t
46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 loff_t offset)
45{ 48{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
47 nfs4_fl_free_deviceid_callback); 50 u64 tmp;
48 if (status) { 51
49 printk(KERN_WARNING "%s: deviceid cache could not be " 52 offset -= flseg->pattern_offset;
50 "initialized\n", __func__); 53 tmp = offset;
51 return status; 54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
52 } 74 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 75
54 __func__); 76 BUG();
77}
78
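filelayout_get_dense_offset() above compresses a file offset onto a single data server's file: with stripe unit S and stripe count N the stripe width is S*N, and the DS-local offset is (offset/width)*S plus the remainder within the stripe unit, after pattern_offset has been subtracted. A worked userspace example with made-up layout parameters (4 data servers, 64 KiB stripe units):

#include <stdio.h>
#include <stdint.h>

/* Userspace model of filelayout_get_dense_offset(): in a STRIPE_DENSE layout
 * each data server stores only its own stripe units back to back, so the file
 * offset must be mapped onto the DS-local file. */
static uint64_t dense_offset(uint64_t offset, uint32_t stripe_unit,
			     uint32_t stripe_count, uint64_t pattern_offset)
{
	uint64_t width = (uint64_t)stripe_unit * stripe_count;

	offset -= pattern_offset;
	return (offset / width) * stripe_unit + offset % stripe_unit;
}

int main(void)
{
	/* File offset 0x50000 is stripe unit #5; with 4 servers it is the
	 * second unit stored on its DS, i.e. DS-local offset 0x10000. */
	printf("0x%llx\n",
	       (unsigned long long)dense_offset(0x50000, 0x10000, 4, 0));
	return 0;
}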
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
151 }
152
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * Call ops for the async read/write cases
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * In the case of dense layouts, the offset needs to be reset to its
159 * original value.
160 */
161static void filelayout_read_prepare(struct rpc_task *task, void *data)
61{ 162{
62 dprintk("--> %s\n", __func__); 163 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
164
165 rdata->read_done_cb = filelayout_read_done_cb;
166
167 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
168 &rdata->args.seq_args, &rdata->res.seq_res,
169 0, task))
170 return;
171
172 rpc_call_start(task);
173}
174
175static void filelayout_read_call_done(struct rpc_task *task, void *data)
176{
177 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
178
179 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
180
181 /* Note this may cause RPC to be resent */
182 rdata->mds_ops->rpc_call_done(task, data);
183}
184
185static void filelayout_read_release(void *data)
186{
187 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
188
189 rdata->mds_ops->rpc_release(data);
190}
191
192static int filelayout_write_done_cb(struct rpc_task *task,
193 struct nfs_write_data *data)
194{
195 int reset = 0;
196
197 if (filelayout_async_handle_error(task, data->args.context->state,
198 data->ds_clp, &reset) == -EAGAIN) {
199 struct nfs_client *clp;
200
201 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
202 __func__, data->ds_clp, data->ds_clp->cl_session);
203 if (reset) {
204 filelayout_set_lo_fail(data->lseg);
205 nfs4_reset_write(task, data);
206 clp = NFS_SERVER(data->inode)->nfs_client;
207 } else
208 clp = data->ds_clp;
209 nfs_restart_rpc(task, clp);
210 return -EAGAIN;
211 }
63 212
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 213 return 0;
67} 214}
68 215
216static void filelayout_write_prepare(struct rpc_task *task, void *data)
217{
218 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
219
220 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
221 &wdata->args.seq_args, &wdata->res.seq_res,
222 0, task))
223 return;
224
225 rpc_call_start(task);
226}
227
228static void filelayout_write_call_done(struct rpc_task *task, void *data)
229{
230 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
231
232 /* Note this may cause RPC to be resent */
233 wdata->mds_ops->rpc_call_done(task, data);
234}
235
236static void filelayout_write_release(void *data)
237{
238 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
239
240 wdata->mds_ops->rpc_release(data);
241}
242
243struct rpc_call_ops filelayout_read_call_ops = {
244 .rpc_call_prepare = filelayout_read_prepare,
245 .rpc_call_done = filelayout_read_call_done,
246 .rpc_release = filelayout_read_release,
247};
248
249struct rpc_call_ops filelayout_write_call_ops = {
250 .rpc_call_prepare = filelayout_write_prepare,
251 .rpc_call_done = filelayout_write_call_done,
252 .rpc_release = filelayout_write_release,
253};
254
255static enum pnfs_try_status
256filelayout_read_pagelist(struct nfs_read_data *data)
257{
258 struct pnfs_layout_segment *lseg = data->lseg;
259 struct nfs4_pnfs_ds *ds;
260 loff_t offset = data->args.offset;
261 u32 j, idx;
262 struct nfs_fh *fh;
263 int status;
264
265 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
266 __func__, data->inode->i_ino,
267 data->args.pgbase, (size_t)data->args.count, offset);
268
269 /* Retrieve the correct rpc_client for the byte range */
270 j = nfs4_fl_calc_j_index(lseg, offset);
271 idx = nfs4_fl_calc_ds_index(lseg, j);
272 ds = nfs4_fl_prepare_ds(lseg, idx);
273 if (!ds) {
274 /* Either layout fh index faulty, or ds connect failed */
275 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
276 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
277 return PNFS_NOT_ATTEMPTED;
278 }
279 dprintk("%s USE DS:ip %x %hu\n", __func__,
280 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
281
282 /* No multipath support. Use first DS */
283 data->ds_clp = ds->ds_clp;
284 fh = nfs4_fl_select_ds_fh(lseg, j);
285 if (fh)
286 data->args.fh = fh;
287
288 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
289 data->mds_offset = offset;
290
291 /* Perform an asynchronous read to ds */
292 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
293 &filelayout_read_call_ops);
294 BUG_ON(status != 0);
295 return PNFS_ATTEMPTED;
296}
297
298/* Perform async writes. */
299static enum pnfs_try_status
300filelayout_write_pagelist(struct nfs_write_data *data, int sync)
301{
302 struct pnfs_layout_segment *lseg = data->lseg;
303 struct nfs4_pnfs_ds *ds;
304 loff_t offset = data->args.offset;
305 u32 j, idx;
306 struct nfs_fh *fh;
307 int status;
308
309 /* Retrieve the correct rpc_client for the byte range */
310 j = nfs4_fl_calc_j_index(lseg, offset);
311 idx = nfs4_fl_calc_ds_index(lseg, j);
312 ds = nfs4_fl_prepare_ds(lseg, idx);
313 if (!ds) {
314 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
315 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
316 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
317 return PNFS_NOT_ATTEMPTED;
318 }
319 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
320 data->inode->i_ino, sync, (size_t) data->args.count, offset,
321 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
322
323 /* We can't handle commit to ds yet */
324 if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
325 data->args.stable = NFS_FILE_SYNC;
326
327 data->write_done_cb = filelayout_write_done_cb;
328 data->ds_clp = ds->ds_clp;
329 fh = nfs4_fl_select_ds_fh(lseg, j);
330 if (fh)
331 data->args.fh = fh;
332 /*
333 * Get the file offset on the dserver. Set the write offset to
334 * this offset and save the original offset.
335 */
336 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
337 data->mds_offset = offset;
338
339 /* Perform an asynchronous write */
340 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
341 &filelayout_write_call_ops, sync);
342 BUG_ON(status != 0);
343 return PNFS_ATTEMPTED;
344}
345
69/* 346/*
70 * filelayout_check_layout() 347 * filelayout_check_layout()
71 * 348 *
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 369 goto out;
93 } 370 }
94 371
95 if (fl->stripe_unit % PAGE_SIZE) { 372 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 373 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 374 __func__, fl->stripe_unit);
98 goto out; 375 goto out;
99 } 376 }
100 377
101 /* find and reference the deviceid */ 378 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 379 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 380 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 381 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 382 if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 411 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 412 return status;
136out_put: 413out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 414 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 415 goto out;
139} 416}
140 417
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 520static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 521filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 522{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 523 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 524
249 dprintk("--> %s\n", __func__); 525 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 526 nfs4_fl_put_deviceid(fl->dsaddr);
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl); 527 _filelayout_free_lseg(fl);
253} 528}
254 529
530/*
531 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
532 *
533 * return 1 : coalesce page
534 * return 0 : don't coalesce page
535 */
536int
537filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
538 struct nfs_page *req)
539{
540 u64 p_stripe, r_stripe;
541 u32 stripe_unit;
542
543 if (!pgio->pg_lseg)
544 return 1;
545 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
546 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
547 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
548
549 do_div(p_stripe, stripe_unit);
550 do_div(r_stripe, stripe_unit);
551
552 return (p_stripe == r_stripe);
553}
554
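filelayout_pg_test() above lets the generic page I/O code coalesce two pages into one RPC only when both fall inside the same stripe unit, which guarantees a single READ or WRITE never straddles two data servers. A small model with an assumed 4 KiB page size and a 64 KiB stripe unit:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

/* Model of filelayout_pg_test(): pages coalesce only within a stripe unit. */
static int can_coalesce(uint64_t prev_index, uint64_t req_index, uint32_t stripe_unit)
{
	uint64_t p = (prev_index << PAGE_SHIFT) / stripe_unit;
	uint64_t r = (req_index << PAGE_SHIFT) / stripe_unit;

	return p == r;
}

int main(void)
{
	uint32_t stripe_unit = 64 * 1024;	/* 16 pages per stripe unit */

	printf("pages 14,15 -> %d\n", can_coalesce(14, 15, stripe_unit));	/* same unit */
	printf("pages 15,16 -> %d\n", can_coalesce(15, 16, stripe_unit));	/* crosses units */
	return 0;
}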
255static struct pnfs_layoutdriver_type filelayout_type = { 555static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 556 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 557 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 558 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 559 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 560 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 561 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 562 .read_pagelist = filelayout_read_pagelist,
563 .write_pagelist = filelayout_write_pagelist,
263}; 564};
264 565
265static int __init nfs4filelayout_init(void) 566static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..ee0c907742b5 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 89 generic_hdr);
84} 90}
85 91
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 92extern struct nfs_fh *
93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
94
87extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 96extern void print_deviceid(struct nfs4_deviceid *dev_id);
97u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
98u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
99struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
100 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 101extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 102nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
103extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 104struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 105get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 106
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8cc..68143c162e3b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
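The new device ID cache above hashes the 16 opaque device ID bytes with a multiply-by-37 rolling hash and folds the result into one of 32 hlist buckets (5 hash bits). A userspace model of the hash, assuming NFS4_DEVICEID4_SIZE is 16 bytes:

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS	5
#define HASH_MASK	((1u << HASH_BITS) - 1)
#define DEVICEID_SIZE	16	/* NFS4_DEVICEID4_SIZE (assumed) */

/* Userspace model of nfs4_fl_deviceid_hash() above. */
static uint32_t deviceid_hash(const unsigned char *id)
{
	uint32_t x = 0;
	int i;

	for (i = 0; i < DEVICEID_SIZE; i++)
		x = x * 37 + id[i];
	return x & HASH_MASK;
}

int main(void)
{
	unsigned char id[DEVICEID_SIZE] = { 0xde, 0xad, 0xbe, 0xef };	/* rest zero */

	printf("bucket %u of %u\n", deviceid_hash(id), HASH_MASK + 1);
	return 0;
}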
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
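nfs4_ds_connect() above builds a sockaddr_in directly from ds_ip_addr and ds_port, which are already stored in network byte order, then either reuses an existing pNFS-aware nfs_client or creates a new one whose lease time simply mirrors the MDS lease. The user-space sketch below only illustrates the address setup and a plain TCP connect; the real code goes through nfs4_set_ds_client() and the sunrpc transport, and the loopback target here is made up.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Build a sockaddr_in from an address and port that are already in network
 * byte order, as nfs4_ds_connect() does, and open a plain TCP connection. */
static int ds_tcp_connect(uint32_t ip_netorder, uint16_t port_netorder)
{
	struct sockaddr_in sin;
	int fd;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = ip_netorder;
	sin.sin_port = port_netorder;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (fd < 0)
		return -1;
	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	/* 127.0.0.1:2049, both values converted to network order. */
	int fd = ds_tcp_connect(htonl(INADDR_LOOPBACK), htons(2049));

	if (fd < 0) {
		perror("connect");
		return 1;
	}
	printf("connected\n");
	close(fd);
	return 0;
}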
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -219,6 +295,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
219 goto out_err; 295 goto out_err;
220 } 296 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL); 297 buf = kmalloc(rlen + 1, GFP_KERNEL);
298 if (!buf) {
299 dprintk("%s: Not enough memory\n", __func__);
300 goto out_err;
301 }
222 buf[rlen] = '\0'; 302 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen); 303 memcpy(buf, r_addr, rlen);
224 304
@@ -296,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
296 dsaddr->stripe_count = cnt; 376 dsaddr->stripe_count = cnt;
297 dsaddr->ds_num = num; 377 dsaddr->ds_num = num;
298 378
299 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 379 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
300 380
301 /* Go back and read stripe indices */ 381 /* Go back and read stripe indices */
302 p = indicesp; 382 p = indicesp;
@@ -346,28 +426,37 @@ out_err:
346} 426}
347 427
348/* 428/*
349 * Decode the opaque device specified in 'dev' 429 * Decode the opaque device specified in 'dev' and add it to the cache of
350 * and add it to the list of available devices. 430 * available devices.
351 * If the deviceid is already cached, nfs4_add_deviceid will return
352 * a pointer to the cached struct and throw away the new.
353 */ 431 */
354static struct nfs4_file_layout_dsaddr* 432static struct nfs4_file_layout_dsaddr *
355decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 433decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
356{ 434{
357 struct nfs4_file_layout_dsaddr *dsaddr; 435 struct nfs4_file_layout_dsaddr *d, *new;
358 struct pnfs_deviceid_node *d; 436 long hash;
359 437
360 dsaddr = decode_device(inode, dev); 438 new = decode_device(inode, dev);
361 if (!dsaddr) { 439 if (!new) {
362 printk(KERN_WARNING "%s: Could not decode or add device\n", 440 printk(KERN_WARNING "%s: Could not decode or add device\n",
363 __func__); 441 __func__);
364 return NULL; 442 return NULL;
365 } 443 }
366 444
367 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 445 spin_lock(&filelayout_deviceid_lock);
368 &dsaddr->deviceid); 446 d = nfs4_fl_find_get_deviceid(&new->deviceid);
447 if (d) {
448 spin_unlock(&filelayout_deviceid_lock);
449 nfs4_fl_free_deviceid(new);
450 return d;
451 }
452
453 INIT_HLIST_NODE(&new->node);
454 atomic_set(&new->ref, 1);
455 hash = nfs4_fl_deviceid_hash(&new->deviceid);
456 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
457 spin_unlock(&filelayout_deviceid_lock);
369 458
370 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 459 return new;
371} 460}
372 461
373/* 462/*
@@ -442,12 +531,123 @@ out_free:
442 return dsaddr; 531 return dsaddr;
443} 532}
444 533
534void
535nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
536{
537 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
538 hlist_del_rcu(&dsaddr->node);
539 spin_unlock(&filelayout_deviceid_lock);
540
541 synchronize_rcu();
542 nfs4_fl_free_deviceid(dsaddr);
543 }
544}
545
445struct nfs4_file_layout_dsaddr * 546struct nfs4_file_layout_dsaddr *
446nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 547nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
548{
549 struct nfs4_file_layout_dsaddr *d;
550 struct hlist_node *n;
551 long hash = nfs4_fl_deviceid_hash(id);
552
553
554 rcu_read_lock();
555 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
556 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
557 if (!atomic_inc_not_zero(&d->ref))
558 goto fail;
559 rcu_read_unlock();
560 return d;
561 }
562 }
563fail:
564 rcu_read_unlock();
565 return NULL;
566}
567
568/*
569 * Want res = (offset - layout->pattern_offset) / layout->stripe_unit
570 * Then: ((res + fsi) % dsaddr->stripe_count)
571 */
572u32
573nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
574{
575 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
576 u64 tmp;
577
578 tmp = offset - flseg->pattern_offset;
579 do_div(tmp, flseg->stripe_unit);
580 tmp += flseg->first_stripe_index;
581 return do_div(tmp, flseg->dsaddr->stripe_count);
582}
583
584u32
585nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
447{ 586{
448 struct pnfs_deviceid_node *d; 587 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
588}
449 589
450 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 590struct nfs_fh *
451 return (d == NULL) ? NULL : 591nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
452 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 592{
593 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
594 u32 i;
595
596 if (flseg->stripe_type == STRIPE_SPARSE) {
597 if (flseg->num_fh == 1)
598 i = 0;
599 else if (flseg->num_fh == 0)
600 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
601 return NULL;
602 else
603 i = nfs4_fl_calc_ds_index(lseg, j);
604 } else
605 i = j;
606 return flseg->fh_array[i];
607}
608
609static void
610filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
611 int err, u32 ds_addr)
612{
613 u32 *p = (u32 *)&dsaddr->deviceid;
614
615 printk(KERN_ERR "NFS: data server %x connection error %d."
616 " Deviceid [%x%x%x%x] marked out of use.\n",
617 ds_addr, err, p[0], p[1], p[2], p[3]);
618
619 spin_lock(&filelayout_deviceid_lock);
620 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
621 spin_unlock(&filelayout_deviceid_lock);
622}
623
624struct nfs4_pnfs_ds *
625nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
626{
627 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
628 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
629
630 if (ds == NULL) {
631 printk(KERN_ERR "%s: No data server for offset index %d\n",
632 __func__, ds_idx);
633 return NULL;
634 }
635
636 if (!ds->ds_clp) {
637 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
638 int err;
639
640 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
641 /* Already tried to connect, don't try again */
642 dprintk("%s Deviceid marked out of use\n", __func__);
643 return NULL;
644 }
645 err = nfs4_ds_connect(s, ds);
646 if (err) {
647 filelayout_mark_devid_negative(dsaddr, err,
648 ntohl(ds->ds_ip_addr));
649 return NULL;
650 }
651 }
652 return ds;
453} 653}
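The index helpers added above turn a file offset into a stripe index j and then into a data-server slot: j = ((offset - pattern_offset) / stripe_unit + first_stripe_index) % stripe_count, and the device's stripe_indices[] array maps j to an entry in ds_list[]. Here is a small worked example with made-up layout values; the kernel uses do_div() because the dividend is 64-bit.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Invented layout parameters, for illustration only. */
	uint64_t offset = 3 * 1024 * 1024;	/* byte offset into the file */
	uint64_t pattern_offset = 0;
	uint64_t stripe_unit = 65536;		/* 64 KiB stripe unit */
	uint32_t first_stripe_index = 1;
	uint32_t stripe_count = 4;
	uint32_t stripe_indices[4] = { 2, 0, 3, 1 };	/* j -> ds_list index */

	uint64_t su = (offset - pattern_offset) / stripe_unit;
	uint32_t j = (uint32_t)((su + first_stripe_index) % stripe_count);

	printf("stripe unit %llu maps to j=%u, ds_list index %u\n",
	       (unsigned long long)su, j, stripe_indices[j]);
	return 0;
}

With these numbers the 3 MiB offset falls in stripe unit 48, so j = (48 + 1) % 4 = 1 and the request would go to the data server at ds_list[stripe_indices[1]], i.e. ds_list[0].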
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd2..bb80c49b6533 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
54/* 54/*
55 * Determine the mount path as a string 55 * Determine the mount path as a string
56 */ 56 */
57static char *nfs4_path(const struct vfsmount *mnt_parent, 57static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
58 const struct dentry *dentry,
59 char *buffer, ssize_t buflen)
60{ 58{
61 const char *srvpath; 59 char *limit;
62 60 char *path = nfs_path(&limit, dentry, buffer, buflen);
63 srvpath = strchr(mnt_parent->mnt_devname, ':'); 61 if (!IS_ERR(path)) {
64 if (srvpath) 62 char *colon = strchr(path, ':');
65 srvpath++; 63 if (colon && colon < limit)
66 else 64 path = colon + 1;
67 srvpath = mnt_parent->mnt_devname; 65 }
68 66 return path;
69 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
70} 67}
71 68
72/* 69/*
73 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we 70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
74 * believe to be the server path to this dentry 71 * believe to be the server path to this dentry
75 */ 72 */
76static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, 73static int nfs4_validate_fspath(struct dentry *dentry,
77 const struct dentry *dentry,
78 const struct nfs4_fs_locations *locations, 74 const struct nfs4_fs_locations *locations,
79 char *page, char *page2) 75 char *page, char *page2)
80{ 76{
81 const char *path, *fs_path; 77 const char *path, *fs_path;
82 78
83 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); 79 path = nfs4_path(dentry, page, PAGE_SIZE);
84 if (IS_ERR(path)) 80 if (IS_ERR(path))
85 return PTR_ERR(path); 81 return PTR_ERR(path);
86 82
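The rewritten nfs4_path() no longer needs the parent vfsmount: nfs_path() builds the devname-style string ("server:/export/...") from the dentry, and anything after the first ':' that falls before the returned limit is taken as the server-side export path. A user-space sketch of just that prefix-stripping step, ignoring the limit check and error handling:

#include <stdio.h>
#include <string.h>

/* Given a devname-style string such as "server:/export/home", return the
 * part after the first ':'; fall back to the whole string if there is none. */
static const char *export_path(const char *devname)
{
	const char *colon = strchr(devname, ':');

	return colon ? colon + 1 : devname;
}

int main(void)
{
	printf("%s\n", export_path("server.example.com:/export/home"));
	printf("%s\n", export_path("/plain/path"));
	return 0;
}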
@@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
165 161
166/** 162/**
167 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 163 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
168 * @mnt_parent - mountpoint of parent directory
169 * @dentry - parent directory 164 * @dentry - parent directory
170 * @locations - array of NFSv4 server location information 165 * @locations - array of NFSv4 server location information
171 * 166 *
172 */ 167 */
173static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 168static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
174 const struct dentry *dentry,
175 const struct nfs4_fs_locations *locations) 169 const struct nfs4_fs_locations *locations)
176{ 170{
177 struct vfsmount *mnt = ERR_PTR(-ENOENT); 171 struct vfsmount *mnt = ERR_PTR(-ENOENT);
178 struct nfs_clone_mount mountdata = { 172 struct nfs_clone_mount mountdata = {
179 .sb = mnt_parent->mnt_sb, 173 .sb = dentry->d_sb,
180 .dentry = dentry, 174 .dentry = dentry,
181 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 175 .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
182 }; 176 };
183 char *page = NULL, *page2 = NULL; 177 char *page = NULL, *page2 = NULL;
184 int loc, error; 178 int loc, error;
@@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
198 goto out; 192 goto out;
199 193
200 /* Ensure fs path is a prefix of current dentry path */ 194 /* Ensure fs path is a prefix of current dentry path */
201 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); 195 error = nfs4_validate_fspath(dentry, locations, page, page2);
202 if (error < 0) { 196 if (error < 0) {
203 mnt = ERR_PTR(error); 197 mnt = ERR_PTR(error);
204 goto out; 198 goto out;
@@ -225,11 +219,10 @@ out:
225 219
226/* 220/*
227 * nfs_do_refmount - handle crossing a referral on server 221 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
229 * @dentry - dentry of referral 222 * @dentry - dentry of referral
230 * 223 *
231 */ 224 */
232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 225struct vfsmount *nfs_do_refmount(struct dentry *dentry)
233{ 226{
234 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 227 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
235 struct dentry *parent; 228 struct dentry *parent;
@@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
262 fs_locations->fs_path.ncomponents <= 0) 255 fs_locations->fs_path.ncomponents <= 0)
263 goto out_free; 256 goto out_free;
264 257
265 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); 258 mnt = nfs_follow_referral(dentry, fs_locations);
266out_free: 259out_free:
267 __free_page(page); 260 __free_page(page);
268 kfree(fs_locations); 261 kfree(fs_locations);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..1d84e7088af9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -85,6 +85,9 @@ static int nfs4_map_errors(int err)
85 switch (err) { 85 switch (err) {
86 case -NFS4ERR_RESOURCE: 86 case -NFS4ERR_RESOURCE:
87 return -EREMOTEIO; 87 return -EREMOTEIO;
88 case -NFS4ERR_BADOWNER:
89 case -NFS4ERR_BADNAME:
90 return -EINVAL;
88 default: 91 default:
89 dprintk("%s could not handle NFSv4 error %d\n", 92 dprintk("%s could not handle NFSv4 error %d\n",
90 __func__, -err); 93 __func__, -err);
@@ -241,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
241/* This is the error handling routine for processes that are allowed 244/* This is the error handling routine for processes that are allowed
242 * to sleep. 245 * to sleep.
243 */ 246 */
244static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 247static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
245{ 248{
246 struct nfs_client *clp = server->nfs_client; 249 struct nfs_client *clp = server->nfs_client;
247 struct nfs4_state *state = exception->state; 250 struct nfs4_state *state = exception->state;
@@ -256,12 +259,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
256 case -NFS4ERR_OPENMODE: 259 case -NFS4ERR_OPENMODE:
257 if (state == NULL) 260 if (state == NULL)
258 break; 261 break;
259 nfs4_state_mark_reclaim_nograce(clp, state); 262 nfs4_schedule_stateid_recovery(server, state);
260 goto do_state_recovery; 263 goto wait_on_recovery;
261 case -NFS4ERR_STALE_STATEID: 264 case -NFS4ERR_STALE_STATEID:
262 case -NFS4ERR_STALE_CLIENTID: 265 case -NFS4ERR_STALE_CLIENTID:
263 case -NFS4ERR_EXPIRED: 266 case -NFS4ERR_EXPIRED:
264 goto do_state_recovery; 267 nfs4_schedule_lease_recovery(clp);
268 goto wait_on_recovery;
265#if defined(CONFIG_NFS_V4_1) 269#if defined(CONFIG_NFS_V4_1)
266 case -NFS4ERR_BADSESSION: 270 case -NFS4ERR_BADSESSION:
267 case -NFS4ERR_BADSLOT: 271 case -NFS4ERR_BADSLOT:
@@ -272,7 +276,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
272 case -NFS4ERR_SEQ_MISORDERED: 276 case -NFS4ERR_SEQ_MISORDERED:
273 dprintk("%s ERROR: %d Reset session\n", __func__, 277 dprintk("%s ERROR: %d Reset session\n", __func__,
274 errorcode); 278 errorcode);
275 nfs4_schedule_state_recovery(clp); 279 nfs4_schedule_session_recovery(clp->cl_session);
276 exception->retry = 1; 280 exception->retry = 1;
277 break; 281 break;
278#endif /* defined(CONFIG_NFS_V4_1) */ 282#endif /* defined(CONFIG_NFS_V4_1) */
@@ -292,11 +296,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
292 break; 296 break;
293 case -NFS4ERR_OLD_STATEID: 297 case -NFS4ERR_OLD_STATEID:
294 exception->retry = 1; 298 exception->retry = 1;
299 break;
300 case -NFS4ERR_BADOWNER:
301 /* The following works around a Linux server bug! */
302 case -NFS4ERR_BADNAME:
303 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
304 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
305 exception->retry = 1;
306 printk(KERN_WARNING "NFS: v4 server %s "
307 "does not accept raw "
308 "uid/gids. "
309 "Reenabling the idmapper.\n",
310 server->nfs_client->cl_hostname);
311 }
295 } 312 }
296 /* We failed to handle the error */ 313 /* We failed to handle the error */
297 return nfs4_map_errors(ret); 314 return nfs4_map_errors(ret);
298do_state_recovery: 315wait_on_recovery:
299 nfs4_schedule_state_recovery(clp);
300 ret = nfs4_wait_clnt_recover(clp); 316 ret = nfs4_wait_clnt_recover(clp);
301 if (ret == 0) 317 if (ret == 0)
302 exception->retry = 1; 318 exception->retry = 1;
@@ -435,8 +451,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
435 clp = res->sr_session->clp; 451 clp = res->sr_session->clp;
436 do_renew_lease(clp, timestamp); 452 do_renew_lease(clp, timestamp);
437 /* Check sequence flags */ 453 /* Check sequence flags */
438 if (atomic_read(&clp->cl_count) > 1) 454 if (res->sr_status_flags != 0)
439 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 455 nfs4_schedule_lease_recovery(clp);
440 break; 456 break;
441 case -NFS4ERR_DELAY: 457 case -NFS4ERR_DELAY:
442 /* The server detected a resend of the RPC call and 458 /* The server detected a resend of the RPC call and
@@ -505,7 +521,7 @@ out:
505 return ret_id; 521 return ret_id;
506} 522}
507 523
508static int nfs41_setup_sequence(struct nfs4_session *session, 524int nfs41_setup_sequence(struct nfs4_session *session,
509 struct nfs4_sequence_args *args, 525 struct nfs4_sequence_args *args,
510 struct nfs4_sequence_res *res, 526 struct nfs4_sequence_res *res,
511 int cache_reply, 527 int cache_reply,
@@ -571,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
571 res->sr_status = 1; 587 res->sr_status = 1;
572 return 0; 588 return 0;
573} 589}
590EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
574 591
575int nfs4_setup_sequence(const struct nfs_server *server, 592int nfs4_setup_sequence(const struct nfs_server *server,
576 struct nfs4_sequence_args *args, 593 struct nfs4_sequence_args *args,
@@ -1255,14 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1255 case -NFS4ERR_BAD_HIGH_SLOT: 1272 case -NFS4ERR_BAD_HIGH_SLOT:
1256 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1273 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1257 case -NFS4ERR_DEADSESSION: 1274 case -NFS4ERR_DEADSESSION:
1258 nfs4_schedule_state_recovery( 1275 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
1259 server->nfs_client);
1260 goto out; 1276 goto out;
1261 case -NFS4ERR_STALE_CLIENTID: 1277 case -NFS4ERR_STALE_CLIENTID:
1262 case -NFS4ERR_STALE_STATEID: 1278 case -NFS4ERR_STALE_STATEID:
1263 case -NFS4ERR_EXPIRED: 1279 case -NFS4ERR_EXPIRED:
1264 /* Don't recall a delegation if it was lost */ 1280 /* Don't recall a delegation if it was lost */
1265 nfs4_schedule_state_recovery(server->nfs_client); 1281 nfs4_schedule_lease_recovery(server->nfs_client);
1266 goto out; 1282 goto out;
1267 case -ERESTARTSYS: 1283 case -ERESTARTSYS:
1268 /* 1284 /*
@@ -1271,7 +1287,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1271 */ 1287 */
1272 case -NFS4ERR_ADMIN_REVOKED: 1288 case -NFS4ERR_ADMIN_REVOKED:
1273 case -NFS4ERR_BAD_STATEID: 1289 case -NFS4ERR_BAD_STATEID:
1274 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1290 nfs4_schedule_stateid_recovery(server, state);
1275 case -EKEYEXPIRED: 1291 case -EKEYEXPIRED:
1276 /* 1292 /*
1277 * User RPCSEC_GSS context has expired. 1293 * User RPCSEC_GSS context has expired.
@@ -1574,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1574 return 0; 1590 return 0;
1575} 1591}
1576 1592
1577static int nfs4_recover_expired_lease(struct nfs_server *server) 1593static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1578{ 1594{
1579 struct nfs_client *clp = server->nfs_client;
1580 unsigned int loop; 1595 unsigned int loop;
1581 int ret; 1596 int ret;
1582 1597
@@ -1587,12 +1602,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1587 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1602 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1588 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1603 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1589 break; 1604 break;
1590 nfs4_schedule_state_recovery(clp); 1605 nfs4_schedule_state_manager(clp);
1591 ret = -EIO; 1606 ret = -EIO;
1592 } 1607 }
1593 return ret; 1608 return ret;
1594} 1609}
1595 1610
1611static int nfs4_recover_expired_lease(struct nfs_server *server)
1612{
1613 return nfs4_client_recover_expired_lease(server->nfs_client);
1614}
1615
1596/* 1616/*
1597 * OPEN_EXPIRED: 1617 * OPEN_EXPIRED:
1598 * reclaim state on the server after a network partition. 1618 * reclaim state on the server after a network partition.
@@ -3070,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3070 return err; 3090 return err;
3071} 3091}
3072 3092
3073static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3093static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3074{ 3094{
3075 struct nfs_server *server = NFS_SERVER(data->inode); 3095 struct nfs_server *server = NFS_SERVER(data->inode);
3076 3096
3077 dprintk("--> %s\n", __func__);
3078
3079 if (!nfs4_sequence_done(task, &data->res.seq_res))
3080 return -EAGAIN;
3081
3082 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3097 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3083 nfs_restart_rpc(task, server->nfs_client); 3098 nfs_restart_rpc(task, server->nfs_client);
3084 return -EAGAIN; 3099 return -EAGAIN;
@@ -3090,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3090 return 0; 3105 return 0;
3091} 3106}
3092 3107
3108static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3109{
3110
3111 dprintk("--> %s\n", __func__);
3112
3113 if (!nfs4_sequence_done(task, &data->res.seq_res))
3114 return -EAGAIN;
3115
3116 return data->read_done_cb(task, data);
3117}
3118
3093static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3119static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3094{ 3120{
3095 data->timestamp = jiffies; 3121 data->timestamp = jiffies;
3122 data->read_done_cb = nfs4_read_done_cb;
3096 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3123 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3097} 3124}
3098 3125
3099static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3126/* Reset the nfs_read_data to send the read to the MDS. */
3127void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3128{
3129 dprintk("%s Reset task for i/o through\n", __func__);
3130 put_lseg(data->lseg);
3131 data->lseg = NULL;
3132 /* offsets will differ in the dense stripe case */
3133 data->args.offset = data->mds_offset;
3134 data->ds_clp = NULL;
3135 data->args.fh = NFS_FH(data->inode);
3136 data->read_done_cb = nfs4_read_done_cb;
3137 task->tk_ops = data->mds_ops;
3138 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3139}
3140EXPORT_SYMBOL_GPL(nfs4_reset_read);
3141
3142static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3100{ 3143{
3101 struct inode *inode = data->inode; 3144 struct inode *inode = data->inode;
3102 3145
3103 if (!nfs4_sequence_done(task, &data->res.seq_res))
3104 return -EAGAIN;
3105
3106 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3146 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3107 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3147 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3108 return -EAGAIN; 3148 return -EAGAIN;
@@ -3114,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3114 return 0; 3154 return 0;
3115} 3155}
3116 3156
3157static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3158{
3159 if (!nfs4_sequence_done(task, &data->res.seq_res))
3160 return -EAGAIN;
3161 return data->write_done_cb(task, data);
3162}
3163
3164/* Reset the nfs_write_data to send the write to the MDS. */
3165void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3166{
3167 dprintk("%s Reset task for i/o through\n", __func__);
3168 put_lseg(data->lseg);
3169 data->lseg = NULL;
3170 data->ds_clp = NULL;
3171 data->write_done_cb = nfs4_write_done_cb;
3172 data->args.fh = NFS_FH(data->inode);
3173 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3174 data->args.offset = data->mds_offset;
3175 data->res.fattr = &data->fattr;
3176 task->tk_ops = data->mds_ops;
3177 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3178}
3179EXPORT_SYMBOL_GPL(nfs4_reset_write);
3180
3117static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3181static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3118{ 3182{
3119 struct nfs_server *server = NFS_SERVER(data->inode); 3183 struct nfs_server *server = NFS_SERVER(data->inode);
3120 3184
3121 data->args.bitmask = server->cache_consistency_bitmask; 3185 if (data->lseg) {
3186 data->args.bitmask = NULL;
3187 data->res.fattr = NULL;
3188 } else
3189 data->args.bitmask = server->cache_consistency_bitmask;
3190 if (!data->write_done_cb)
3191 data->write_done_cb = nfs4_write_done_cb;
3122 data->res.server = server; 3192 data->res.server = server;
3123 data->timestamp = jiffies; 3193 data->timestamp = jiffies;
3124 3194
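nfs4_reset_read() and nfs4_reset_write() above implement the pNFS fallback path: when I/O through a data server fails, the request is stripped of its layout state (put_lseg(), ds_clp cleared), pointed back at the MDS file handle and the saved mds_offset, and re-driven through the MDS rpc client. The toy sketch below only shows that reset-to-fallback shape; every type and field name in it is invented for illustration.

#include <stdio.h>

struct toy_io_req {
	const char *target;	/* "DS" or "MDS" */
	long offset;		/* offset currently in use */
	long mds_offset;	/* offset saved for the MDS fallback */
	void *lseg;		/* layout segment, only valid for DS i/o */
};

/* Drop the layout-specific state and restore the values saved for the
 * metadata server before the request is retried. */
static void reset_to_mds(struct toy_io_req *req)
{
	req->lseg = NULL;		/* put_lseg() in the kernel */
	req->offset = req->mds_offset;	/* dense layouts use different offsets */
	req->target = "MDS";
}

int main(void)
{
	struct toy_io_req req = {
		.target = "DS",
		.offset = 4096,
		.mds_offset = 131072,
	};

	req.lseg = &req;		/* pretend we hold a layout segment */
	reset_to_mds(&req);
	printf("resend via %s at offset %ld\n", req.target, req.offset);
	return 0;
}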
@@ -3178,7 +3248,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3178 if (task->tk_status < 0) { 3248 if (task->tk_status < 0) {
3179 /* Unless we're shutting down, schedule state recovery! */ 3249 /* Unless we're shutting down, schedule state recovery! */
3180 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3250 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
3181 nfs4_schedule_state_recovery(clp); 3251 nfs4_schedule_lease_recovery(clp);
3182 return; 3252 return;
3183 } 3253 }
3184 do_renew_lease(clp, timestamp); 3254 do_renew_lease(clp, timestamp);
@@ -3252,6 +3322,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
3252 } 3322 }
3253} 3323}
3254 3324
3325static int buf_to_pages_noslab(const void *buf, size_t buflen,
3326 struct page **pages, unsigned int *pgbase)
3327{
3328 struct page *newpage, **spages;
3329 int rc = 0;
3330 size_t len;
3331 spages = pages;
3332
3333 do {
3334 len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
3335 newpage = alloc_page(GFP_KERNEL);
3336
3337 if (newpage == NULL)
3338 goto unwind;
3339 memcpy(page_address(newpage), buf, len);
3340 buf += len;
3341 buflen -= len;
3342 *pages++ = newpage;
3343 rc++;
3344 } while (buflen != 0);
3345
3346 return rc;
3347
3348unwind:
3349 for(; rc > 0; rc--)
3350 __free_page(spages[rc-1]);
3351 return -ENOMEM;
3352}
3353
3255struct nfs4_cached_acl { 3354struct nfs4_cached_acl {
3256 int cached; 3355 int cached;
3257 size_t len; 3356 size_t len;
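buf_to_pages_noslab() copies the caller's ACL buffer into freshly allocated pages instead of mapping it in place, since the buffer may sit in slab memory that must not be handed to the network stack; if an allocation fails it frees every page it already allocated. Below is a user-space analogue of that chunk-and-unwind pattern, using malloc() and an assumed 4 KiB page size.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

/* Copy an arbitrary buffer into page-sized chunks, unwinding everything
 * already allocated on failure.  Returns the chunk count, or -1. */
static int buf_to_chunks(const char *buf, size_t buflen, char **chunks)
{
	int n = 0;

	while (buflen != 0) {
		size_t len = buflen < TOY_PAGE_SIZE ? buflen : TOY_PAGE_SIZE;
		char *chunk = malloc(TOY_PAGE_SIZE);

		if (chunk == NULL)
			goto unwind;
		memcpy(chunk, buf, len);
		buf += len;
		buflen -= len;
		chunks[n++] = chunk;
	}
	return n;

unwind:
	while (n > 0)
		free(chunks[--n]);
	return -1;
}

int main(void)
{
	char data[10000];
	char *chunks[8];
	int n;

	memset(data, 'x', sizeof(data));
	n = buf_to_chunks(data, sizeof(data), chunks);
	printf("split into %d chunk(s)\n", n);
	while (n > 0)
		free(chunks[--n]);
	return 0;
}

The caller in __nfs4_proc_set_acl() then releases each page after the RPC has been transmitted, leaving only whatever reference the network stack still holds.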
@@ -3420,13 +3519,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3420 .rpc_argp = &arg, 3519 .rpc_argp = &arg,
3421 .rpc_resp = &res, 3520 .rpc_resp = &res,
3422 }; 3521 };
3423 int ret; 3522 int ret, i;
3424 3523
3425 if (!nfs4_server_supports_acls(server)) 3524 if (!nfs4_server_supports_acls(server))
3426 return -EOPNOTSUPP; 3525 return -EOPNOTSUPP;
3526 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3527 if (i < 0)
3528 return i;
3427 nfs_inode_return_delegation(inode); 3529 nfs_inode_return_delegation(inode);
3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3530 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3531
3532 /*
3533 * Free each page after tx, so the only ref left is
3534 * held by the network stack
3535 */
3536 for (; i > 0; i--)
3537 put_page(pages[i-1]);
3538
3430 /* 3539 /*
3431 * Acl update can result in inode attribute update. 3540 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid. 3541 * so mark the attribute cache invalid.
@@ -3464,12 +3573,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3464 case -NFS4ERR_OPENMODE: 3573 case -NFS4ERR_OPENMODE:
3465 if (state == NULL) 3574 if (state == NULL)
3466 break; 3575 break;
3467 nfs4_state_mark_reclaim_nograce(clp, state); 3576 nfs4_schedule_stateid_recovery(server, state);
3468 goto do_state_recovery; 3577 goto wait_on_recovery;
3469 case -NFS4ERR_STALE_STATEID: 3578 case -NFS4ERR_STALE_STATEID:
3470 case -NFS4ERR_STALE_CLIENTID: 3579 case -NFS4ERR_STALE_CLIENTID:
3471 case -NFS4ERR_EXPIRED: 3580 case -NFS4ERR_EXPIRED:
3472 goto do_state_recovery; 3581 nfs4_schedule_lease_recovery(clp);
3582 goto wait_on_recovery;
3473#if defined(CONFIG_NFS_V4_1) 3583#if defined(CONFIG_NFS_V4_1)
3474 case -NFS4ERR_BADSESSION: 3584 case -NFS4ERR_BADSESSION:
3475 case -NFS4ERR_BADSLOT: 3585 case -NFS4ERR_BADSLOT:
@@ -3480,7 +3590,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3480 case -NFS4ERR_SEQ_MISORDERED: 3590 case -NFS4ERR_SEQ_MISORDERED:
3481 dprintk("%s ERROR %d, Reset session\n", __func__, 3591 dprintk("%s ERROR %d, Reset session\n", __func__,
3482 task->tk_status); 3592 task->tk_status);
3483 nfs4_schedule_state_recovery(clp); 3593 nfs4_schedule_session_recovery(clp->cl_session);
3484 task->tk_status = 0; 3594 task->tk_status = 0;
3485 return -EAGAIN; 3595 return -EAGAIN;
3486#endif /* CONFIG_NFS_V4_1 */ 3596#endif /* CONFIG_NFS_V4_1 */
@@ -3497,9 +3607,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3497 } 3607 }
3498 task->tk_status = nfs4_map_errors(task->tk_status); 3608 task->tk_status = nfs4_map_errors(task->tk_status);
3499 return 0; 3609 return 0;
3500do_state_recovery: 3610wait_on_recovery:
3501 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3611 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3502 nfs4_schedule_state_recovery(clp);
3503 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 3612 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3504 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3613 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3505 task->tk_status = 0; 3614 task->tk_status = 0;
@@ -4110,7 +4219,7 @@ static void nfs4_lock_release(void *calldata)
4110 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 4219 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
4111 data->arg.lock_seqid); 4220 data->arg.lock_seqid);
4112 if (!IS_ERR(task)) 4221 if (!IS_ERR(task))
4113 rpc_put_task(task); 4222 rpc_put_task_async(task);
4114 dprintk("%s: cancelling lock!\n", __func__); 4223 dprintk("%s: cancelling lock!\n", __func__);
4115 } else 4224 } else
4116 nfs_free_seqid(data->arg.lock_seqid); 4225 nfs_free_seqid(data->arg.lock_seqid);
@@ -4134,23 +4243,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4134 4243
4135static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4244static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4136{ 4245{
4137 struct nfs_client *clp = server->nfs_client;
4138 struct nfs4_state *state = lsp->ls_state;
4139
4140 switch (error) { 4246 switch (error) {
4141 case -NFS4ERR_ADMIN_REVOKED: 4247 case -NFS4ERR_ADMIN_REVOKED:
4142 case -NFS4ERR_BAD_STATEID: 4248 case -NFS4ERR_BAD_STATEID:
4143 case -NFS4ERR_EXPIRED: 4249 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4144 if (new_lock_owner != 0 || 4250 if (new_lock_owner != 0 ||
4145 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4251 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4146 nfs4_state_mark_reclaim_nograce(clp, state); 4252 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4147 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4148 break; 4253 break;
4149 case -NFS4ERR_STALE_STATEID: 4254 case -NFS4ERR_STALE_STATEID:
4150 if (new_lock_owner != 0 ||
4151 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4152 nfs4_state_mark_reclaim_reboot(clp, state);
4153 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4255 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4256 case -NFS4ERR_EXPIRED:
4257 nfs4_schedule_lease_recovery(server->nfs_client);
4154 }; 4258 };
4155} 4259}
4156 4260
@@ -4366,12 +4470,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4366 case -NFS4ERR_EXPIRED: 4470 case -NFS4ERR_EXPIRED:
4367 case -NFS4ERR_STALE_CLIENTID: 4471 case -NFS4ERR_STALE_CLIENTID:
4368 case -NFS4ERR_STALE_STATEID: 4472 case -NFS4ERR_STALE_STATEID:
4473 nfs4_schedule_lease_recovery(server->nfs_client);
4474 goto out;
4369 case -NFS4ERR_BADSESSION: 4475 case -NFS4ERR_BADSESSION:
4370 case -NFS4ERR_BADSLOT: 4476 case -NFS4ERR_BADSLOT:
4371 case -NFS4ERR_BAD_HIGH_SLOT: 4477 case -NFS4ERR_BAD_HIGH_SLOT:
4372 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 4478 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4373 case -NFS4ERR_DEADSESSION: 4479 case -NFS4ERR_DEADSESSION:
4374 nfs4_schedule_state_recovery(server->nfs_client); 4480 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
4375 goto out; 4481 goto out;
4376 case -ERESTARTSYS: 4482 case -ERESTARTSYS:
4377 /* 4483 /*
@@ -4381,7 +4487,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4381 case -NFS4ERR_ADMIN_REVOKED: 4487 case -NFS4ERR_ADMIN_REVOKED:
4382 case -NFS4ERR_BAD_STATEID: 4488 case -NFS4ERR_BAD_STATEID:
4383 case -NFS4ERR_OPENMODE: 4489 case -NFS4ERR_OPENMODE:
4384 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4490 nfs4_schedule_stateid_recovery(server, state);
4385 err = 0; 4491 err = 0;
4386 goto out; 4492 goto out;
4387 case -EKEYEXPIRED: 4493 case -EKEYEXPIRED:
@@ -4988,10 +5094,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
4988 int status; 5094 int status;
4989 unsigned *ptr; 5095 unsigned *ptr;
4990 struct nfs4_session *session = clp->cl_session; 5096 struct nfs4_session *session = clp->cl_session;
5097 long timeout = 0;
5098 int err;
4991 5099
4992 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 5100 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
4993 5101
4994 status = _nfs4_proc_create_session(clp); 5102 do {
5103 status = _nfs4_proc_create_session(clp);
5104 if (status == -NFS4ERR_DELAY) {
5105 err = nfs4_delay(clp->cl_rpcclient, &timeout);
5106 if (err)
5107 status = err;
5108 }
5109 } while (status == -NFS4ERR_DELAY);
5110
4995 if (status) 5111 if (status)
4996 goto out; 5112 goto out;
4997 5113
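nfs4_proc_create_session() now loops while the server answers NFS4ERR_DELAY, sleeping between attempts via nfs4_delay(), so session creation survives a server that is still coming up. The user-space sketch below mimics that retry-with-backoff shape; the error constant, attempt count and delay values are invented, and the kernel's actual delay policy lives in nfs4_delay().

#include <stdio.h>
#include <unistd.h>

#define ERR_DELAY (-10008)	/* stand-in for -NFS4ERR_DELAY */

/* Pretend server: answers "try again later" a couple of times, then succeeds. */
static int flaky_create_session(int *attempts_left)
{
	if (--(*attempts_left) > 0)
		return ERR_DELAY;
	return 0;
}

int main(void)
{
	int attempts_left = 3;
	useconds_t delay = 100000;	/* 100 ms, doubled on every retry */
	int status;

	do {
		status = flaky_create_session(&attempts_left);
		if (status == ERR_DELAY) {
			usleep(delay);
			if (delay < 1600000)
				delay *= 2;
		}
	} while (status == ERR_DELAY);

	printf("create_session status = %d\n", status);
	return 0;
}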
@@ -5073,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
5073 return ret; 5189 return ret;
5074} 5190}
5075 5191
5192int nfs4_init_ds_session(struct nfs_client *clp)
5193{
5194 struct nfs4_session *session = clp->cl_session;
5195 int ret;
5196
5197 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5198 return 0;
5199
5200 ret = nfs4_client_recover_expired_lease(clp);
5201 if (!ret)
5202 /* Test for the DS role */
5203 if (!is_ds_client(clp))
5204 ret = -ENODEV;
5205 if (!ret)
5206 ret = nfs4_check_client_ready(clp);
5207 return ret;
5208
5209}
5210EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5211
5212
5076/* 5213/*
5077 * Renew the cl_session lease. 5214 * Renew the cl_session lease.
5078 */ 5215 */
@@ -5100,7 +5237,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5100 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5237 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5101 return -EAGAIN; 5238 return -EAGAIN;
5102 default: 5239 default:
5103 nfs4_schedule_state_recovery(clp); 5240 nfs4_schedule_lease_recovery(clp);
5104 } 5241 }
5105 return 0; 5242 return 0;
5106} 5243}
@@ -5187,7 +5324,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
5187 if (IS_ERR(task)) 5324 if (IS_ERR(task))
5188 ret = PTR_ERR(task); 5325 ret = PTR_ERR(task);
5189 else 5326 else
5190 rpc_put_task(task); 5327 rpc_put_task_async(task);
5191 dprintk("<-- %s status=%d\n", __func__, ret); 5328 dprintk("<-- %s status=%d\n", __func__, ret);
5192 return ret; 5329 return ret;
5193} 5330}
@@ -5203,8 +5340,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5203 goto out; 5340 goto out;
5204 } 5341 }
5205 ret = rpc_wait_for_completion_task(task); 5342 ret = rpc_wait_for_completion_task(task);
5206 if (!ret) 5343 if (!ret) {
5344 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
5345
5346 if (task->tk_status == 0)
5347 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
5207 ret = task->tk_status; 5348 ret = task->tk_status;
5349 }
5208 rpc_put_task(task); 5350 rpc_put_task(task);
5209out: 5351out:
5210 dprintk("<-- %s status=%d\n", __func__, ret); 5352 dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5241,7 +5383,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5241 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5383 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5242 return -EAGAIN; 5384 return -EAGAIN;
5243 default: 5385 default:
5244 nfs4_schedule_state_recovery(clp); 5386 nfs4_schedule_lease_recovery(clp);
5245 } 5387 }
5246 return 0; 5388 return 0;
5247} 5389}
@@ -5309,6 +5451,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5309 status = PTR_ERR(task); 5451 status = PTR_ERR(task);
5310 goto out; 5452 goto out;
5311 } 5453 }
5454 status = nfs4_wait_for_completion_rpc_task(task);
5455 if (status == 0)
5456 status = task->tk_status;
5312 rpc_put_task(task); 5457 rpc_put_task(task);
5313 return 0; 5458 return 0;
5314out: 5459out:
@@ -5595,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5595 .clear_acl_cache = nfs4_zap_acl_attr, 5740 .clear_acl_cache = nfs4_zap_acl_attr,
5596 .close_context = nfs4_close_context, 5741 .close_context = nfs4_close_context,
5597 .open_context = nfs4_atomic_open, 5742 .open_context = nfs4_atomic_open,
5743 .init_client = nfs4_init_client,
5598}; 5744};
5599 5745
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5746static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04c..ab1bf5bb021f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -1007,9 +1012,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1007} 1012}
1008 1013
1009/* 1014/*
1010 * Schedule a state recovery attempt 1015 * Schedule a lease recovery attempt
1011 */ 1016 */
1012void nfs4_schedule_state_recovery(struct nfs_client *clp) 1017void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1013{ 1018{
1014 if (!clp) 1019 if (!clp)
1015 return; 1020 return;
@@ -1018,7 +1023,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
1018 nfs4_schedule_state_manager(clp); 1023 nfs4_schedule_state_manager(clp);
1019} 1024}
1020 1025
1021int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1026static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1022{ 1027{
1023 1028
1024 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1029 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1037,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
1032 return 1; 1037 return 1;
1033} 1038}
1034 1039
1035int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1040static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1036{ 1041{
1037 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1042 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1038 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1043 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1041,6 +1046,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
1041 return 1; 1046 return 1;
1042} 1047}
1043 1048
1049void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
1050{
1051 struct nfs_client *clp = server->nfs_client;
1052
1053 nfs4_state_mark_reclaim_nograce(clp, state);
1054 nfs4_schedule_state_manager(clp);
1055}
1056
1044static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1057static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1045{ 1058{
1046 struct inode *inode = state->inode; 1059 struct inode *inode = state->inode;
@@ -1436,10 +1449,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1436} 1449}
1437 1450
1438#ifdef CONFIG_NFS_V4_1 1451#ifdef CONFIG_NFS_V4_1
1452void nfs4_schedule_session_recovery(struct nfs4_session *session)
1453{
1454 nfs4_schedule_lease_recovery(session->clp);
1455}
1456EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1457
1439void nfs41_handle_recall_slot(struct nfs_client *clp) 1458void nfs41_handle_recall_slot(struct nfs_client *clp)
1440{ 1459{
1441 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1460 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1442 nfs4_schedule_state_recovery(clp); 1461 nfs4_schedule_state_manager(clp);
1443} 1462}
1444 1463
1445static void nfs4_reset_all_state(struct nfs_client *clp) 1464static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1466,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
1447 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1466 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1448 clp->cl_boot_time = CURRENT_TIME; 1467 clp->cl_boot_time = CURRENT_TIME;
1449 nfs4_state_start_reclaim_nograce(clp); 1468 nfs4_state_start_reclaim_nograce(clp);
1450 nfs4_schedule_state_recovery(clp); 1469 nfs4_schedule_state_manager(clp);
1451 } 1470 }
1452} 1471}
1453 1472
@@ -1455,7 +1474,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
1455{ 1474{
1456 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1475 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1457 nfs4_state_start_reclaim_reboot(clp); 1476 nfs4_state_start_reclaim_reboot(clp);
1458 nfs4_schedule_state_recovery(clp); 1477 nfs4_schedule_state_manager(clp);
1459 } 1478 }
1460} 1479}
1461 1480
@@ -1475,7 +1494,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1475{ 1494{
1476 nfs_expire_all_delegations(clp); 1495 nfs_expire_all_delegations(clp);
1477 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) 1496 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1478 nfs4_schedule_state_recovery(clp); 1497 nfs4_schedule_state_manager(clp);
1479} 1498}
1480 1499
1481void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1500void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee9..0cf560f77884 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 845 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 847 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 850 iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 857 }
858 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 859 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 862 iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1384 hdr->replen += decode_putrootfh_maxsz;
1385} 1385}
1386 1386
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1388{
1389 nfs4_stateid stateid; 1389 nfs4_stateid stateid;
1390 __be32 *p; 1390 __be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1392 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1393 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1395 if (zero_seqid)
1396 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1397 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1398 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1399 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1406 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1407 *p = cpu_to_be32(OP_READ);
1406 1408
1407 encode_stateid(xdr, args->context, args->lock_context); 1409 encode_stateid(xdr, args->context, args->lock_context,
1410 hdr->minorversion);
1408 1411
1409 p = reserve_space(xdr, 12); 1412 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1413 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1595 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1596 *p = cpu_to_be32(OP_WRITE);
1594 1597
1595 encode_stateid(xdr, args->context, args->lock_context); 1598 encode_stateid(xdr, args->context, args->lock_context,
1599 hdr->minorversion);
1596 1600
1597 p = reserve_space(xdr, 16); 1601 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1602 p = xdr_encode_hyper(p, args->offset);
@@ -1660,7 +1664,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1660 1664
1661 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1665 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1662 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1666 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1663 p = xdr_encode_hyper(p, clp->cl_ex_clid); 1667 p = xdr_encode_hyper(p, clp->cl_clientid);
1664 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1668 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1665 *p++ = cpu_to_be32(args->flags); /*flags */ 1669 *p++ = cpu_to_be32(args->flags); /*flags */
1666 1670
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2275 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2276 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2277 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2278 if (args->bitmask)
2279 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2280 encode_nops(&hdr);
2276} 2281}
2277 2282
@@ -3382,7 +3387,7 @@ out_overflow:
3382} 3387}
3383 3388
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3389static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3390 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3391{
3387 uint32_t len; 3392 uint32_t len;
3388 __be32 *p; 3393 __be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3407 if (!may_sleep) {
3403 /* do nothing */ 3408 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3409 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3410 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3411 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3412 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3413 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
3420} 3425}
3421 3426
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3427static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3428 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3429{
3425 uint32_t len; 3430 uint32_t len;
3426 __be32 *p; 3431 __be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3445 if (!may_sleep) {
3441 /* do nothing */ 3446 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3447 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3448 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3449 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3450 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3451 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 3944 goto xdr_error;
3940 fattr->valid |= status; 3945 fattr->valid |= status;
3941 3946
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 3947 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 3948 if (status < 0)
3945 goto xdr_error; 3949 goto xdr_error;
3946 fattr->valid |= status; 3950 fattr->valid |= status;
3947 3951
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 3952 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 3953 if (status < 0)
3951 goto xdr_error; 3954 goto xdr_error;
3952 fattr->valid |= status; 3955 fattr->valid |= status;
@@ -4694,7 +4697,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4694 p = xdr_inline_decode(xdr, 8); 4697 p = xdr_inline_decode(xdr, 8);
4695 if (unlikely(!p)) 4698 if (unlikely(!p))
4696 goto out_overflow; 4699 goto out_overflow;
4697 xdr_decode_hyper(p, &clp->cl_ex_clid); 4700 xdr_decode_hyper(p, &clp->cl_clientid);
4698 p = xdr_inline_decode(xdr, 12); 4701 p = xdr_inline_decode(xdr, 12);
4699 if (unlikely(!p)) 4702 if (unlikely(!p))
4700 goto out_overflow; 4703 goto out_overflow;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5693 status = decode_write(xdr, res);
5691 if (status) 5694 if (status)
5692 goto out; 5695 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5696 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5697 decode_getfattr(xdr, res->fattr, res->server,
5698 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5699 if (!status)
5696 status = res->count; 5700 status = res->count;
5697out: 5701out:
@@ -6167,8 +6171,6 @@ static struct {
6167 { NFS4ERR_DQUOT, -EDQUOT }, 6171 { NFS4ERR_DQUOT, -EDQUOT },
6168 { NFS4ERR_STALE, -ESTALE }, 6172 { NFS4ERR_STALE, -ESTALE },
6169 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6173 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6170 { NFS4ERR_BADOWNER, -EINVAL },
6171 { NFS4ERR_BADNAME, -EINVAL },
6172 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6174 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6173 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6175 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6174 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6176 { NFS4ERR_TOOSMALL, -ETOOSMALL },
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
86/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp"
91
89/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
90static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
91 94
92/* Text-based mount options passed to super.c */ 95/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = ""; 96static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
94 97
95/* Address of NFS server */ 98/* Address of NFS server */
96static __be32 servaddr __initdata = htonl(INADDR_NONE); 99static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
160} 163}
161 164
162static int __init root_nfs_cat(char *dest, const char *src, 165static int __init root_nfs_cat(char *dest, const char *src,
163 const size_t destlen) 166 const size_t destlen)
164{ 167{
168 size_t len = strlen(dest);
169
170 if (len && dest[len - 1] != ',')
171 if (strlcat(dest, ",", destlen) > destlen)
172 return -1;
173
165 if (strlcat(dest, src, destlen) > destlen) 174 if (strlcat(dest, src, destlen) > destlen)
166 return -1; 175 return -1;
167 return 0; 176 return 0;
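root_nfs_cat() now inserts the separating comma itself whenever the destination is non-empty and does not already end in one, which lets nfs_root_options start out as NFS_DEF_OPTIONS ("udp") and still grow cleanly as options are appended. The user-space sketch below reproduces that append-with-separator behaviour, using strlen()/strcat() with explicit length checks in place of the kernel's strlcat().

#include <stdio.h>
#include <string.h>

/* Append a mount option to a fixed-size, comma-separated option string,
 * adding the comma only when needed.  Returns 0 on success, -1 if the
 * result would not fit in destlen bytes. */
static int opt_cat(char *dest, const char *src, size_t destlen)
{
	size_t len = strlen(dest);

	if (len && dest[len - 1] != ',') {
		if (len + 1 >= destlen)
			return -1;
		dest[len++] = ',';
		dest[len] = '\0';
	}
	if (len + strlen(src) >= destlen)
		return -1;
	strcat(dest, src);
	return 0;
}

int main(void)
{
	char options[64] = "udp";	/* mirrors NFS_DEF_OPTIONS */

	opt_cat(options, "vers=3", sizeof(options));
	opt_cat(options, "nolock,addr=192.0.2.1", sizeof(options));
	printf("%s\n", options);	/* udp,vers=3,nolock,addr=192.0.2.1 */
	return 0;
}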
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
194 if (root_nfs_cat(nfs_root_options, incoming, 203 if (root_nfs_cat(nfs_root_options, incoming,
195 sizeof(nfs_root_options))) 204 sizeof(nfs_root_options)))
196 return -1; 205 return -1;
197
198 /*
199 * Possibly prepare for more options to be appended
200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
206
207 return 0; 206 return 0;
208} 207}
209 208
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
217 */ 216 */
218static int __init root_nfs_data(char *cmdline) 217static int __init root_nfs_data(char *cmdline)
219{ 218{
220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; 219 char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
221 int len, retval = -1; 220 int len, retval = -1;
222 char *tmp = NULL; 221 char *tmp = NULL;
223 const size_t tmplen = sizeof(nfs_export_path); 222 const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
244 * Append mandatory options for nfsroot so they override 243 * Append mandatory options for nfsroot so they override
245 * what has come before 244 * what has come before
246 */ 245 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", 246 snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
248 &servaddr); 247 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option, 248 if (root_nfs_cat(nfs_root_options, mand_options,
250 sizeof(nfs_root_options))) 249 sizeof(nfs_root_options)))
251 goto out_optionstoolong; 250 goto out_optionstoolong;
252 251
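The nfsroot.c change above folds the comma handling into root_nfs_cat() and seeds the option string with NFS_DEF_OPTIONS, so every later append lands behind a separator. A minimal userspace sketch of that joining behaviour follows; xstrlcat() and opt_cat() are stand-ins written for the example, not kernel functions.

#include <stdio.h>
#include <string.h>

/* Minimal strlcat work-alike so the sketch builds on libcs without one. */
static size_t xstrlcat(char *dst, const char *src, size_t size)
{
	size_t dlen = strlen(dst);

	if (dlen < size)
		snprintf(dst + dlen, size - dlen, "%s", src);
	return dlen + strlen(src);	/* length we tried to create */
}

/* Append src, inserting "," first unless dst is empty or already ends
 * with one; return -1 on overflow, mirroring root_nfs_cat() above. */
static int opt_cat(char *dst, const char *src, size_t destlen)
{
	size_t len = strlen(dst);

	if (len && dst[len - 1] != ',')
		if (xstrlcat(dst, ",", destlen) > destlen)
			return -1;
	if (xstrlcat(dst, src, destlen) > destlen)
		return -1;
	return 0;
}

int main(void)
{
	char opts[64] = "udp";		/* seeded like NFS_DEF_OPTIONS */

	opt_cat(opts, "vers=3", sizeof(opts));
	opt_cat(opts, "nolock,addr=192.0.2.1", sizeof(opts));
	printf("%s\n", opts);	/* udp,vers=3,nolock,addr=192.0.2.1 */
	return 0;
}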
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..23e794410669 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
226 desc->pg_doio = doio; 227 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 228 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 229 desc->pg_error = 0;
230 desc->pg_lseg = NULL;
229} 231}
230 232
231/** 233/**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 242 * Return 'true' if this is the case, else return 'false'.
241 */ 243 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 244static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 245 struct nfs_page *req,
246 struct nfs_pageio_descriptor *pgio)
244{ 247{
245 if (req->wb_context->cred != prev->wb_context->cred) 248 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 249 return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 257 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 258 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 259 return 0;
260 /*
261 * Non-whole file layouts need to check that req is inside of
262 * pgio->pg_lseg.
263 */
264 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
265 return 0;
257 return 1; 266 return 1;
258} 267}
259 268
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 295 if (newlen > desc->pg_bsize)
287 return 0; 296 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 297 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 298 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 299 return 0;
291 } else 300 } else
292 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 312{
304 if (!list_empty(&desc->pg_list)) { 313 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 314 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 315 if (error < 0)
312 desc->pg_error = error; 316 desc->pg_error = error;
313 else 317 else
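The pagelist.c change threads the descriptor into nfs_can_coalesce_requests() so an optional pg_test hook can veto a merge. The sketch below shows the shape of that gate in plain userspace C; the structures and the 64 KiB stripe rule are assumptions invented for the example.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct req {
	size_t offset;		/* byte offset of this request */
	size_t bytes;		/* length of this request */
};

struct pgio_desc {
	size_t count;		/* bytes coalesced so far */
	/* layout-driver hook; NULL means "no extra restriction" */
	bool (*pg_test)(struct pgio_desc *d, struct req *prev, struct req *cur);
};

static bool can_coalesce(struct pgio_desc *d, struct req *prev, struct req *cur)
{
	if (prev->offset + prev->bytes != cur->offset)
		return false;			/* must be contiguous */
	if (d->pg_test && !d->pg_test(d, prev, cur))
		return false;			/* layout driver vetoed it */
	return true;
}

/* Example hook: only coalesce while staying under a 64 KiB stripe. */
static bool stripe_test(struct pgio_desc *d, struct req *prev, struct req *cur)
{
	(void)prev;
	return d->count + cur->bytes <= 64 * 1024;
}

int main(void)
{
	struct pgio_desc d = { .count = 60 * 1024, .pg_test = stripe_test };
	struct req a = { .offset = 0, .bytes = 60 * 1024 };
	struct req b = { .offset = 60 * 1024, .bytes = 8 * 1024 };

	printf("coalesce: %s\n", can_coalesce(&d, &a, &b) ? "yes" : "no");
	return 0;
}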
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0a..f38813a0a295 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0. 228{
236 */ 229 struct inode *inode = lseg->pls_layout->plh_inode;
237static int 230
238put_lseg_locked(struct pnfs_layout_segment *lseg, 231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
239 struct list_head *tmp_list) 232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
240{ 243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
265 262
266static bool 263static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 278 * list. It will now be removed when all
282 * outstanding io is finished. 279 * outstanding io is finished.
283 */ 280 */
284 rv = put_lseg_locked(lseg, tmp_list); 281 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
282 atomic_read(&lseg->pls_refcount));
283 if (atomic_dec_and_test(&lseg->pls_refcount)) {
284 put_lseg_common(lseg);
285 list_add(&lseg->pls_list, tmp_list);
286 rv = 1;
287 }
285 } 288 }
286 return rv; 289 return rv;
287} 290}
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 302
300 dprintk("%s:Begin lo %p\n", __func__, lo); 303 dprintk("%s:Begin lo %p\n", __func__, lo);
301 304
305 if (list_empty(&lo->plh_segs)) {
306 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
307 put_layout_hdr_locked(lo);
308 return 0;
309 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 310 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 311 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 312 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 320 return invalid - removed;
313} 321}
314 322
323/* note free_me must contain lsegs from a single layout_hdr */
315void 324void
316pnfs_free_lseg_list(struct list_head *free_me) 325pnfs_free_lseg_list(struct list_head *free_me)
317{ 326{
318 struct pnfs_layout_segment *lseg, *tmp; 327 struct pnfs_layout_segment *lseg, *tmp;
328 struct pnfs_layout_hdr *lo;
329
330 if (list_empty(free_me))
331 return;
319 332
333 lo = list_first_entry(free_me, struct pnfs_layout_segment,
334 pls_list)->pls_layout;
335
336 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
337 struct nfs_client *clp;
338
339 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
340 spin_lock(&clp->cl_lock);
341 list_del_init(&lo->plh_layouts);
342 spin_unlock(&clp->cl_lock);
343 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 344 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 345 list_del(&lseg->pls_list);
322 free_lseg(lseg); 346 free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 356 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 357 lo = nfsi->layout;
334 if (lo) { 358 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 359 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 360 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 361 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 362 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 363 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 425 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 426 return true;
405 return lo->plh_block_lgets || 427 return lo->plh_block_lgets ||
428 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 429 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 430 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 431 (atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 697 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 698 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 699 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 700 ret = get_lseg(lseg);
678 break; 701 break;
679 } 702 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 703 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 722 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 723 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 724 struct pnfs_layout_segment *lseg = NULL;
725 bool first = false;
702 726
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 727 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 728 return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 739 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 740 goto out_unlock;
717 } 741 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 742
723 /* if LAYOUTGET already failed once we don't try again */ 743 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 744 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 745 goto out_unlock;
726 746
747 /* Check to see if the layout for the given range already exists */
748 lseg = pnfs_find_lseg(lo, iomode);
749 if (lseg)
750 goto out_unlock;
751
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 752 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 753 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 754 atomic_inc(&lo->plh_outstanding);
730 755
731 get_layout_hdr(lo); 756 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 757 if (list_empty(&lo->plh_segs))
758 first = true;
759 spin_unlock(&ino->i_lock);
760 if (first) {
733 /* The lo must be on the clp list if there is any 761 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 762 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 763 */
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 766 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 767 spin_unlock(&clp->cl_lock);
740 } 768 }
741 spin_unlock(&ino->i_lock);
742 769
743 lseg = send_layoutget(lo, ctx, iomode); 770 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 771 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 772 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 773 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 774 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 775 }
754 atomic_dec(&lo->plh_outstanding); 776 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 777 put_layout_hdr(lo);
756out: 778out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 779 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 780 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 781 return lseg;
760out_unlock: 782out_unlock:
761 spin_unlock(&ino->i_lock); 783 spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 830 }
809 init_lseg(lo, lseg); 831 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 832 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 833 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 834 pnfs_insert_layout(lo, lseg);
813 835
814 if (res->return_on_close) { 836 if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
829 goto out; 851 goto out;
830} 852}
831 853
832/* 854static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 855 struct nfs_page *prev,
834 * Add layout type to the lookup key to expand to support multiple types. 856 struct nfs_page *req)
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{ 857{
840 struct pnfs_deviceid_cache *c; 858 if (pgio->pg_count == prev->wb_bytes) {
841 	 859 		/* This is the first coalesce call for a series of nfs_pages */
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 860 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
843 if (!c) 861 prev->wb_context,
844 return -ENOMEM; 862 IOMODE_READ);
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 863 }
859 spin_unlock(&clp->cl_lock); 864 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
860 return 0;
861} 865}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 866
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg
866 * last layout segment reference frees deviceid
867 */
868void 867void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c, 868pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
870 struct pnfs_deviceid_node *devid)
871{ 869{
872 struct nfs4_deviceid *id = &devid->de_id; 870 struct pnfs_layoutdriver_type *ld;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 871
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 872 ld = NFS_SERVER(inode)->pnfs_curr_ld;
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 873 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
879 return; 874}
880 875
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 876static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 877 struct nfs_page *prev,
883 hlist_del_rcu(&d->de_node); 878 struct nfs_page *req)
884 spin_unlock(&c->dc_lock); 879{
885 synchronize_rcu(); 880 if (pgio->pg_count == prev->wb_bytes) {
886 			c->dc_free_callback(devid); 881 		/* This is the first coalesce call for a series of nfs_pages */
887 return; 882 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
888 } 883 prev->wb_context,
889 spin_unlock(&c->dc_lock); 884 IOMODE_RW);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 885 }
915fail: 886 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
916 rcu_read_unlock(); 887}
917 return NULL; 888
889void
890pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
891{
892 struct pnfs_layoutdriver_type *ld;
893
894 ld = NFS_SERVER(inode)->pnfs_curr_ld;
895 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
896}
897
898enum pnfs_try_status
899pnfs_try_to_write_data(struct nfs_write_data *wdata,
900 const struct rpc_call_ops *call_ops, int how)
901{
902 struct inode *inode = wdata->inode;
903 enum pnfs_try_status trypnfs;
904 struct nfs_server *nfss = NFS_SERVER(inode);
905
906 wdata->mds_ops = call_ops;
907
908 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
909 inode->i_ino, wdata->args.count, wdata->args.offset, how);
910
911 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
912 if (trypnfs == PNFS_NOT_ATTEMPTED) {
913 put_lseg(wdata->lseg);
914 wdata->lseg = NULL;
915 } else
916 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
917
918 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
919 return trypnfs;
918} 920}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920 921
921/* 922/*
922 * Add a deviceid to the cache. 923 * Call the appropriate parallel I/O subsystem read function.
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
924 */ 924 */
925struct pnfs_deviceid_node * 925enum pnfs_try_status
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 926pnfs_try_to_read_data(struct nfs_read_data *rdata,
927{ 927 const struct rpc_call_ops *call_ops)
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 928{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 929 struct inode *inode = rdata->inode;
930 struct nfs_server *nfss = NFS_SERVER(inode);
931 enum pnfs_try_status trypnfs;
953 932
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); 933 rdata->mds_ops = call_ops;
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 934
956 int i; 935 dprintk("%s: Reading ino:%lu %u@%llu\n",
957 /* Verify cache is empty */ 936 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 937
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 938 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
960 clp->cl_devid_cache = NULL; 939 if (trypnfs == PNFS_NOT_ATTEMPTED) {
961 spin_unlock(&clp->cl_lock); 940 put_lseg(rdata->lseg);
962 kfree(local); 941 rdata->lseg = NULL;
942 } else {
943 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
963 } 944 }
945 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
946 return trypnfs;
964} 947}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
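put_lseg() above switches to atomic_dec_and_lock() so the common put stays lock-free and only the final reference takes the inode lock, unlinks the segment, and frees it after dropping the lock. A self-contained userspace sketch of that pattern, using C11 atomics and a pthread mutex instead of the kernel primitives, follows; dec_and_lock() here is a local stand-in, not the kernel helper.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct lseg {
	atomic_int refcount;
	struct lseg *next;		/* singly-linked list under list_lock */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct lseg *seg_list;

/* Returns 1 with list_lock held iff the count dropped to zero. */
static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* fast path: not the last reference, just decrement lock-free */
	while (old > 1)
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return 0;
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) == 1)
		return 1;		/* we dropped the last reference */
	pthread_mutex_unlock(lock);
	return 0;
}

static void put_seg(struct lseg *seg)
{
	if (!dec_and_lock(&seg->refcount, &list_lock))
		return;
	/* unlink while holding the lock, free after releasing it */
	for (struct lseg **p = &seg_list; *p; p = &(*p)->next)
		if (*p == seg) {
			*p = seg->next;
			break;
		}
	pthread_mutex_unlock(&list_lock);
	free(seg);
}

int main(void)
{
	struct lseg *s = calloc(1, sizeof(*s));

	atomic_store(&s->refcount, 2);
	s->next = seg_list;
	seg_list = s;

	put_seg(s);			/* drops to 1, lock never taken */
	put_seg(s);			/* last put: unlink and free */
	printf("list empty: %s\n", seg_list == NULL ? "yes" : "no");
	return 0;
}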
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..6380b9405bcd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
44}; 46};
45 47
48enum pnfs_try_status {
49 PNFS_ATTEMPTED = 0,
50 PNFS_NOT_ATTEMPTED = 1,
51};
52
46#ifdef CONFIG_NFS_V4_1 53#ifdef CONFIG_NFS_V4_1
47 54
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 55#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 68 const u32 id;
62 const char *name; 69 const char *name;
63 struct module *owner; 70 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 71 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 72 void (*free_lseg) (struct pnfs_layout_segment *lseg);
73
74 /* test for nfs page cache coalescing */
75 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
76
77 /*
78 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
79 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
80 */
81 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
82 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 83};
69 84
70struct pnfs_layout_hdr { 85struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
90 unsigned int pglen; 105 unsigned int pglen;
91}; 106};
92 107
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 108extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 109extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 110
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 115
147/* pnfs.c */ 116/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 117void get_layout_hdr(struct pnfs_layout_hdr *lo);
118void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 119struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 120pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 121 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 122void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 123void unset_pnfs_layoutdriver(struct nfs_server *);
124enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
125 const struct rpc_call_ops *, int);
126enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
127 const struct rpc_call_ops *);
128void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
129void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 130int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 131void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 132void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 154}
179 155
156static inline struct pnfs_layout_segment *
157get_lseg(struct pnfs_layout_segment *lseg)
158{
159 if (lseg) {
160 atomic_inc(&lseg->pls_refcount);
161 smp_mb__after_atomic_inc();
162 }
163 return lseg;
164}
165
180/* Return true if a layout driver is being used for this mountpoint */ 166/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 167static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 168{
@@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 180}
195 181
196static inline struct pnfs_layout_segment * 182static inline struct pnfs_layout_segment *
183get_lseg(struct pnfs_layout_segment *lseg)
184{
185 return NULL;
186}
187
188static inline void put_lseg(struct pnfs_layout_segment *lseg)
189{
190}
191
192static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 193pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 194 enum pnfs_iomode access_type)
199{ 195{
200 return NULL; 196 return NULL;
201} 197}
202 198
199static inline enum pnfs_try_status
200pnfs_try_to_read_data(struct nfs_read_data *data,
201 const struct rpc_call_ops *call_ops)
202{
203 return PNFS_NOT_ATTEMPTED;
204}
205
206static inline enum pnfs_try_status
207pnfs_try_to_write_data(struct nfs_write_data *data,
208 const struct rpc_call_ops *call_ops, int how)
209{
210 return PNFS_NOT_ATTEMPTED;
211}
212
203static inline bool 213static inline bool
204pnfs_roc(struct inode *ino) 214pnfs_roc(struct inode *ino)
205{ 215{
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 240{
231} 241}
232 242
243static inline void
244pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
245{
246 pgio->pg_test = NULL;
247}
248
249static inline void
250pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
251{
252 pgio->pg_test = NULL;
253}
254
233#endif /* CONFIG_NFS_V4_1 */ 255#endif /* CONFIG_NFS_V4_1 */
234 256
235#endif /* FS_NFS_PNFS_H */ 257#endif /* FS_NFS_PNFS_H */
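The read_pagelist/write_pagelist methods added to struct pnfs_layoutdriver_type above follow a try-or-decline contract: PNFS_ATTEMPTED means the driver owns the I/O, PNFS_NOT_ATTEMPTED means the caller must fall back to the regular NFS path. A minimal sketch of that dispatch shape follows, with purely illustrative names.

#include <stdio.h>

enum try_status { ATTEMPTED = 0, NOT_ATTEMPTED = 1 };

struct io_driver {
	/* returns NOT_ATTEMPTED to decline, after which the caller must
	 * issue the request through the normal path instead */
	enum try_status (*read)(const char *what);
};

static enum try_status fancy_read(const char *what)
{
	printf("driver read: %s\n", what);
	return ATTEMPTED;
}

static void normal_read(const char *what)
{
	printf("normal read: %s\n", what);
}

static void do_read(struct io_driver *drv, const char *what)
{
	if (!drv || !drv->read || drv->read(what) == NOT_ATTEMPTED)
		normal_read(what);	/* fall back, exactly once */
}

int main(void)
{
	struct io_driver drv = { .read = fancy_read };

	do_read(&drv, "block 0");	/* goes through the driver */
	do_read(NULL, "block 1");	/* no driver: normal path */
	return 0;
}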
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..b8ec170f2a0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
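The read.c rework above changes the per-strategy callbacks (nfs_pagein_one/_multi) to take only the nfs_pageio_descriptor, which already carries the inode, request list, byte count and flags. The userspace sketch below shows that descriptor-only callback shape; the types and names are invented for the example.

#include <stddef.h>
#include <stdio.h>

struct io_desc {
	const char *name;	/* stands in for the inode */
	size_t count;		/* bytes queued on the descriptor */
	int flags;
	int (*doio)(struct io_desc *desc);	/* strategy callback */
};

static int read_one(struct io_desc *desc)
{
	printf("%s: one RPC for %zu bytes\n", desc->name, desc->count);
	return 0;
}

static int read_multi(struct io_desc *desc)
{
	printf("%s: splitting %zu bytes into rsize chunks\n",
	       desc->name, desc->count);
	return 0;
}

static int flush(struct io_desc *desc)
{
	/* the caller no longer unpacks fields for the callback */
	return desc->doio ? desc->doio(desc) : 0;
}

int main(void)
{
	struct io_desc small = { "file-a", 4096, 0, read_one };
	struct io_desc big = { "file-b", 1 << 20, 0, read_multi };

	flush(&small);
	flush(&big);
	return 0;
}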
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770f..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = {
263static void nfs_umount_begin(struct super_block *); 263static void nfs_umount_begin(struct super_block *);
264static int nfs_statfs(struct dentry *, struct kstatfs *); 264static int nfs_statfs(struct dentry *, struct kstatfs *);
265static int nfs_show_options(struct seq_file *, struct vfsmount *); 265static int nfs_show_options(struct seq_file *, struct vfsmount *);
266static int nfs_show_devname(struct seq_file *, struct vfsmount *);
267static int nfs_show_path(struct seq_file *, struct vfsmount *);
266static int nfs_show_stats(struct seq_file *, struct vfsmount *); 268static int nfs_show_stats(struct seq_file *, struct vfsmount *);
267static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 269static struct dentry *nfs_fs_mount(struct file_system_type *,
270 int, const char *, void *);
268static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, 271static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
269 int flags, const char *dev_name, void *raw_data); 272 int flags, const char *dev_name, void *raw_data);
270static void nfs_put_super(struct super_block *); 273static void nfs_put_super(struct super_block *);
@@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
274static struct file_system_type nfs_fs_type = { 277static struct file_system_type nfs_fs_type = {
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .name = "nfs", 279 .name = "nfs",
277 .get_sb = nfs_get_sb, 280 .mount = nfs_fs_mount,
278 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
279 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
280}; 283};
@@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = {
296 .evict_inode = nfs_evict_inode, 299 .evict_inode = nfs_evict_inode,
297 .umount_begin = nfs_umount_begin, 300 .umount_begin = nfs_umount_begin,
298 .show_options = nfs_show_options, 301 .show_options = nfs_show_options,
302 .show_devname = nfs_show_devname,
303 .show_path = nfs_show_path,
299 .show_stats = nfs_show_stats, 304 .show_stats = nfs_show_stats,
300 .remount_fs = nfs_remount, 305 .remount_fs = nfs_remount,
301}; 306};
@@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = {
303#ifdef CONFIG_NFS_V4 308#ifdef CONFIG_NFS_V4
304static int nfs4_validate_text_mount_data(void *options, 309static int nfs4_validate_text_mount_data(void *options,
305 struct nfs_parsed_mount_data *args, const char *dev_name); 310 struct nfs_parsed_mount_data *args, const char *dev_name);
306static int nfs4_try_mount(int flags, const char *dev_name, 311static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
307 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 312 struct nfs_parsed_mount_data *data);
308static int nfs4_get_sb(struct file_system_type *fs_type, 313static struct dentry *nfs4_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 314 int flags, const char *dev_name, void *raw_data);
310static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, 315static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
311 int flags, const char *dev_name, void *raw_data); 316 int flags, const char *dev_name, void *raw_data);
312static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, 317static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
313 int flags, const char *dev_name, void *raw_data); 318 int flags, const char *dev_name, void *raw_data);
314static int nfs4_referral_get_sb(struct file_system_type *fs_type, 319static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
315 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 320 int flags, const char *dev_name, void *raw_data);
316static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, 321static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
317 int flags, const char *dev_name, void *raw_data); 322 int flags, const char *dev_name, void *raw_data);
318static void nfs4_kill_super(struct super_block *sb); 323static void nfs4_kill_super(struct super_block *sb);
@@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
320static struct file_system_type nfs4_fs_type = { 325static struct file_system_type nfs4_fs_type = {
321 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
322 .name = "nfs4", 327 .name = "nfs4",
323 .get_sb = nfs4_get_sb, 328 .mount = nfs4_mount,
324 .kill_sb = nfs4_kill_super, 329 .kill_sb = nfs4_kill_super,
325 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 330 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
326}; 331};
@@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
352struct file_system_type nfs4_referral_fs_type = { 357struct file_system_type nfs4_referral_fs_type = {
353 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
354 .name = "nfs4", 359 .name = "nfs4",
355 .get_sb = nfs4_referral_get_sb, 360 .mount = nfs4_referral_mount,
356 .kill_sb = nfs4_kill_super, 361 .kill_sb = nfs4_kill_super,
357 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 362 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
358}; 363};
@@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = {
366 .evict_inode = nfs4_evict_inode, 371 .evict_inode = nfs4_evict_inode,
367 .umount_begin = nfs_umount_begin, 372 .umount_begin = nfs_umount_begin,
368 .show_options = nfs_show_options, 373 .show_options = nfs_show_options,
374 .show_devname = nfs_show_devname,
375 .show_path = nfs_show_path,
369 .show_stats = nfs_show_stats, 376 .show_stats = nfs_show_stats,
370 .remount_fs = nfs_remount, 377 .remount_fs = nfs_remount,
371}; 378};
@@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
726 return 0; 733 return 0;
727} 734}
728 735
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{
738 char *page = (char *) __get_free_page(GFP_KERNEL);
739 char *devname, *dummy;
740 int err = 0;
741 if (!page)
742 return -ENOMEM;
743 devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
744 if (IS_ERR(devname))
745 err = PTR_ERR(devname);
746 else
747 seq_escape(m, devname, " \t\n\\");
748 free_page((unsigned long)page);
749 return err;
750}
751
752static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
753{
754 seq_puts(m, "/");
755 return 0;
756}
757
729/* 758/*
730 * Present statistical information for this VFS mountpoint 759 * Present statistical information for this VFS mountpoint
731 */ 760 */
@@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
979 return 1; 1008 return 1;
980} 1009}
981 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 	return !*option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
982/* 1032/*
983 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
984 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1127 * options that take numeric values 1177 * options that take numeric values
1128 */ 1178 */
1129 case Opt_port: 1179 case Opt_port:
1130 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1131 if (string == NULL) 1181 option > USHRT_MAX)
1132 goto out_nomem;
1133 rc = strict_strtoul(string, 10, &option);
1134 kfree(string);
1135 if (rc != 0 || option > USHRT_MAX)
1136 goto out_invalid_value; 1182 goto out_invalid_value;
1137 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1138 break; 1184 break;
1139 case Opt_rsize: 1185 case Opt_rsize:
1140 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1141 if (string == NULL)
1142 goto out_nomem;
1143 rc = strict_strtoul(string, 10, &option);
1144 kfree(string);
1145 if (rc != 0)
1146 goto out_invalid_value; 1187 goto out_invalid_value;
1147 mnt->rsize = option; 1188 mnt->rsize = option;
1148 break; 1189 break;
1149 case Opt_wsize: 1190 case Opt_wsize:
1150 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1151 if (string == NULL)
1152 goto out_nomem;
1153 rc = strict_strtoul(string, 10, &option);
1154 kfree(string);
1155 if (rc != 0)
1156 goto out_invalid_value; 1192 goto out_invalid_value;
1157 mnt->wsize = option; 1193 mnt->wsize = option;
1158 break; 1194 break;
1159 case Opt_bsize: 1195 case Opt_bsize:
1160 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1161 if (string == NULL)
1162 goto out_nomem;
1163 rc = strict_strtoul(string, 10, &option);
1164 kfree(string);
1165 if (rc != 0)
1166 goto out_invalid_value; 1197 goto out_invalid_value;
1167 mnt->bsize = option; 1198 mnt->bsize = option;
1168 break; 1199 break;
1169 case Opt_timeo: 1200 case Opt_timeo:
1170 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1171 if (string == NULL)
1172 goto out_nomem;
1173 rc = strict_strtoul(string, 10, &option);
1174 kfree(string);
1175 if (rc != 0 || option == 0)
1176 goto out_invalid_value; 1202 goto out_invalid_value;
1177 mnt->timeo = option; 1203 mnt->timeo = option;
1178 break; 1204 break;
1179 case Opt_retrans: 1205 case Opt_retrans:
1180 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1181 if (string == NULL)
1182 goto out_nomem;
1183 rc = strict_strtoul(string, 10, &option);
1184 kfree(string);
1185 if (rc != 0 || option == 0)
1186 goto out_invalid_value; 1207 goto out_invalid_value;
1187 mnt->retrans = option; 1208 mnt->retrans = option;
1188 break; 1209 break;
1189 case Opt_acregmin: 1210 case Opt_acregmin:
1190 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1191 if (string == NULL)
1192 goto out_nomem;
1193 rc = strict_strtoul(string, 10, &option);
1194 kfree(string);
1195 if (rc != 0)
1196 goto out_invalid_value; 1212 goto out_invalid_value;
1197 mnt->acregmin = option; 1213 mnt->acregmin = option;
1198 break; 1214 break;
1199 case Opt_acregmax: 1215 case Opt_acregmax:
1200 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1201 if (string == NULL)
1202 goto out_nomem;
1203 rc = strict_strtoul(string, 10, &option);
1204 kfree(string);
1205 if (rc != 0)
1206 goto out_invalid_value; 1217 goto out_invalid_value;
1207 mnt->acregmax = option; 1218 mnt->acregmax = option;
1208 break; 1219 break;
1209 case Opt_acdirmin: 1220 case Opt_acdirmin:
1210 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1211 if (string == NULL)
1212 goto out_nomem;
1213 rc = strict_strtoul(string, 10, &option);
1214 kfree(string);
1215 if (rc != 0)
1216 goto out_invalid_value; 1222 goto out_invalid_value;
1217 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1218 break; 1224 break;
1219 case Opt_acdirmax: 1225 case Opt_acdirmax:
1220 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1221 if (string == NULL)
1222 goto out_nomem;
1223 rc = strict_strtoul(string, 10, &option);
1224 kfree(string);
1225 if (rc != 0)
1226 goto out_invalid_value; 1227 goto out_invalid_value;
1227 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1228 break; 1229 break;
1229 case Opt_actimeo: 1230 case Opt_actimeo:
1230 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1231 if (string == NULL)
1232 goto out_nomem;
1233 rc = strict_strtoul(string, 10, &option);
1234 kfree(string);
1235 if (rc != 0)
1236 goto out_invalid_value; 1232 goto out_invalid_value;
1237 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1238 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1239 break; 1235 break;
1240 case Opt_namelen: 1236 case Opt_namelen:
1241 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1242 if (string == NULL)
1243 goto out_nomem;
1244 rc = strict_strtoul(string, 10, &option);
1245 kfree(string);
1246 if (rc != 0)
1247 goto out_invalid_value; 1238 goto out_invalid_value;
1248 mnt->namlen = option; 1239 mnt->namlen = option;
1249 break; 1240 break;
1250 case Opt_mountport: 1241 case Opt_mountport:
1251 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1252 if (string == NULL) 1243 option > USHRT_MAX)
1253 goto out_nomem;
1254 rc = strict_strtoul(string, 10, &option);
1255 kfree(string);
1256 if (rc != 0 || option > USHRT_MAX)
1257 goto out_invalid_value; 1244 goto out_invalid_value;
1258 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1259 break; 1246 break;
1260 case Opt_mountvers: 1247 case Opt_mountvers:
1261 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1262 if (string == NULL)
1263 goto out_nomem;
1264 rc = strict_strtoul(string, 10, &option);
1265 kfree(string);
1266 if (rc != 0 ||
1267 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1268 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1269 goto out_invalid_value; 1251 goto out_invalid_value;
1270 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1271 break; 1253 break;
1272 case Opt_nfsvers: 1254 case Opt_nfsvers:
1273 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1274 if (string == NULL)
1275 goto out_nomem;
1276 rc = strict_strtoul(string, 10, &option);
1277 kfree(string);
1278 if (rc != 0)
1279 goto out_invalid_value; 1256 goto out_invalid_value;
1280 switch (option) { 1257 switch (option) {
1281 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1295 } 1272 }
1296 break; 1273 break;
1297 case Opt_minorversion: 1274 case Opt_minorversion:
1298 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1299 if (string == NULL)
1300 goto out_nomem;
1301 rc = strict_strtoul(string, 10, &option);
1302 kfree(string);
1303 if (rc != 0)
1304 goto out_invalid_value; 1276 goto out_invalid_value;
1305 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1306 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1336 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1337 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1338 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1339 kfree(string);
1340 break; 1311 break;
1341 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1342 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1343 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1344 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1345 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1346 kfree(string);
1347 break; 1317 break;
1348 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1349 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1350 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1351 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1352 xprt_load_transport(string); 1322 xprt_load_transport(string);
1353 kfree(string);
1354 break; 1323 break;
1355 default: 1324 default:
1356 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1358 kfree(string); 1327 kfree(string);
1359 return 0; 1328 return 0;
1360 } 1329 }
1330 kfree(string);
1361 break; 1331 break;
1362 case Opt_mountproto: 1332 case Opt_mountproto:
1363 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1400 goto out_invalid_address; 1370 goto out_invalid_address;
1401 break; 1371 break;
1402 case Opt_clientaddr: 1372 case Opt_clientaddr:
1403 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1404 if (string == NULL)
1405 goto out_nomem; 1374 goto out_nomem;
1406 kfree(mnt->client_address);
1407 mnt->client_address = string;
1408 break; 1375 break;
1409 case Opt_mounthost: 1376 case Opt_mounthost:
1410 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1411 if (string == NULL) 1378 &mnt->mount_server.hostname))
1412 goto out_nomem; 1379 goto out_nomem;
1413 kfree(mnt->mount_server.hostname);
1414 mnt->mount_server.hostname = string;
1415 break; 1380 break;
1416 case Opt_mountaddr: 1381 case Opt_mountaddr:
1417 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1451 }; 1416 };
1452 break; 1417 break;
1453 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1454 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1455 if (string == NULL)
1456 goto out_nomem; 1420 goto out_nomem;
1457 kfree(mnt->fscache_uniq);
1458 mnt->fscache_uniq = string;
1459 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1460 break; 1422 break;
1461 case Opt_local_lock: 1423 case Opt_local_lock:
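The repeated match_strdup()/strict_strtoul()/kfree() pattern above is collapsed into two small helpers, nfs_get_option_str() and nfs_get_option_ul(). Their bodies are not part of these hunks; the sketch below is inferred from how the call sites use them (a non-zero return means failure, and the string helper frees any previously parsed value), so treat it as an approximation rather than the patch's exact code.

static int nfs_get_option_str(substring_t args[], char **option)
{
        kfree(*option);                 /* drop a value from an earlier occurrence of the option */
        *option = match_strdup(args);
        return !*option;                /* non-zero -> caller jumps to out_nomem */
}

static int nfs_get_option_ul(substring_t args[], unsigned long *option)
{
        char *string;
        int rc;

        string = match_strdup(args);
        if (string == NULL)
                return -ENOMEM;
        rc = strict_strtoul(string, 10, option);
        kfree(string);
        return rc;                      /* non-zero -> caller jumps to out_invalid_value */
}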
@@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1665 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1666} 1628}
1667 1629
1668static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1669 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1670 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1671{ 1642{
1672 size_t len; 1643 size_t len;
1673 char *colon, *comma; 1644 char *end;
1674
1675 colon = strchr(dev_name, ':');
1676 if (colon == NULL)
1677 goto out_bad_devname;
1678
1679 len = colon - dev_name;
1680 if (len > maxnamlen)
1681 goto out_hostname;
1682 1645
 1683 /* N.B. caller will free nfs_server.hostname in all cases */ 1646 /* Is the host name protected with square brackets? */
1684 *hostname = kstrndup(dev_name, len, GFP_KERNEL); 1647 if (*dev_name == '[') {
1685 if (!*hostname) 1648 end = strchr(++dev_name, ']');
1686 goto out_nomem; 1649 if (end == NULL || end[1] != ':')
1687
1688 /* kill possible hostname list: not supported */
1689 comma = strchr(*hostname, ',');
1690 if (comma != NULL) {
1691 if (comma == *hostname)
1692 goto out_bad_devname; 1650 goto out_bad_devname;
1693 *comma = '\0';
1694 }
1695 1651
1696 colon++; 1652 len = end - dev_name;
1697 len = strlen(colon); 1653 end++;
1698 if (len > maxpathlen) 1654 } else {
1699 goto out_path; 1655 char *comma;
1700 *export_path = kstrndup(colon, len, GFP_KERNEL);
1701 if (!*export_path)
1702 goto out_nomem;
1703
1704 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1705 return 0;
1706
1707out_bad_devname:
1708 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1709 return -EINVAL;
1710
1711out_nomem:
1712 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1713 return -ENOMEM;
1714
1715out_hostname:
1716 dfprintk(MOUNT, "NFS: server hostname too long\n");
1717 return -ENAMETOOLONG;
1718
1719out_path:
1720 dfprintk(MOUNT, "NFS: export pathname too long\n");
1721 return -ENAMETOOLONG;
1722}
1723
1724/*
1725 * Hostname has square brackets around it because it contains one or
1726 * more colons. We look for the first closing square bracket, and a
1727 * colon must follow it.
1728 */
1729static int nfs_parse_protected_hostname(const char *dev_name,
1730 char **hostname, size_t maxnamlen,
1731 char **export_path, size_t maxpathlen)
1732{
1733 size_t len;
1734 char *start, *end;
1735 1656
1736 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1737 1661
1738 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1739 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1740 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1741 if (*(end + 1) != ':') 1665 *comma = 0;
1742 goto out_bad_devname; 1666 }
1743 1667
1744 len = end - start;
1745 if (len > maxnamlen) 1668 if (len > maxnamlen)
1746 goto out_hostname; 1669 goto out_hostname;
1747 1670
1748 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1749 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1750 if (*hostname == NULL) 1673 if (*hostname == NULL)
1751 goto out_nomem; 1674 goto out_nomem;
1752 1675 len = strlen(++end);
1753 end += 2;
1754 len = strlen(end);
1755 if (len > maxpathlen) 1676 if (len > maxpathlen)
1756 goto out_path; 1677 goto out_path;
1757 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1758 if (!*export_path) 1679 if (!*export_path)
1759 goto out_nomem; 1680 goto out_nomem;
1760 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1761 return 0; 1683 return 0;
1762 1684
1763out_bad_devname: 1685out_bad_devname:
@@ -1778,29 +1700,6 @@ out_path:
1778} 1700}
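The consolidated nfs_parse_devname() above now handles both device-string forms in one pass: a leading '[' protects colons inside the hostname (the IPv6 case), otherwise the leftmost ':' separates host from export path. A small userspace illustration of the intended split rules (hypothetical helper, no error handling, not part of the patch):

#include <stdio.h>
#include <string.h>

/* Mirrors the split rules: '[' protects colons in the host part,
 * otherwise the leftmost ':' divides "hostname:export_path". */
static void split_devname(const char *dev, char *host, char *path)
{
        const char *end;

        if (*dev == '[') {                              /* "[fe80::1]:/export" */
                end = strchr(++dev, ']');
                snprintf(host, end - dev + 1, "%s", dev);
                strcpy(path, end + 2);                  /* skip "]:" */
        } else {                                        /* "server:/export" */
                end = strchr(dev, ':');
                snprintf(host, end - dev + 1, "%s", dev);
                strcpy(path, end + 1);
        }
}

int main(void)
{
        char host[64], path[64];

        split_devname("[fe80::1]:/export", host, path);
        printf("%s %s\n", host, path);                  /* fe80::1 /export */
        split_devname("server.example.com:/export", host, path);
        printf("%s %s\n", host, path);                  /* server.example.com /export */
        return 0;
}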
1779 1701
1780/* 1702/*
1781 * Split "dev_name" into "hostname:export_path".
1782 *
1783 * The leftmost colon demarks the split between the server's hostname
1784 * and the export path. If the hostname starts with a left square
1785 * bracket, then it may contain colons.
1786 *
1787 * Note: caller frees hostname and export path, even on error.
1788 */
1789static int nfs_parse_devname(const char *dev_name,
1790 char **hostname, size_t maxnamlen,
1791 char **export_path, size_t maxpathlen)
1792{
1793 if (*dev_name == '[')
1794 return nfs_parse_protected_hostname(dev_name,
1795 hostname, maxnamlen,
1796 export_path, maxpathlen);
1797
1798 return nfs_parse_simple_hostname(dev_name,
1799 hostname, maxnamlen,
1800 export_path, maxpathlen);
1801}
1802
1803/*
1804 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1805 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1806 * 1705 *
@@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server)
2267 return bdi_register_dev(&server->backing_dev_info, server->s_dev); 2166 return bdi_register_dev(&server->backing_dev_info, server->s_dev);
2268} 2167}
2269 2168
2270static int nfs_get_sb(struct file_system_type *fs_type, 2169static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2271 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2170 int flags, const char *dev_name, void *raw_data)
2272{ 2171{
2273 struct nfs_server *server = NULL; 2172 struct nfs_server *server = NULL;
2274 struct super_block *s; 2173 struct super_block *s;
2275 struct nfs_parsed_mount_data *data; 2174 struct nfs_parsed_mount_data *data;
2276 struct nfs_fh *mntfh; 2175 struct nfs_fh *mntfh;
2277 struct dentry *mntroot; 2176 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2278 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2177 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2279 struct nfs_sb_mountdata sb_mntdata = { 2178 struct nfs_sb_mountdata sb_mntdata = {
2280 .mntflags = flags, 2179 .mntflags = flags,
2281 }; 2180 };
2282 int error = -ENOMEM; 2181 int error;
2283 2182
2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2183 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2285 mntfh = nfs_alloc_fhandle(); 2184 mntfh = nfs_alloc_fhandle();
@@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2290 2189
2291 /* Validate the mount data */ 2190 /* Validate the mount data */
2292 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2191 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
2293 if (error < 0) 2192 if (error < 0) {
2193 mntroot = ERR_PTR(error);
2294 goto out; 2194 goto out;
2195 }
2295 2196
2296#ifdef CONFIG_NFS_V4 2197#ifdef CONFIG_NFS_V4
2297 if (data->version == 4) { 2198 if (data->version == 4) {
2298 error = nfs4_try_mount(flags, dev_name, data, mnt); 2199 mntroot = nfs4_try_mount(flags, dev_name, data);
2299 kfree(data->client_address); 2200 kfree(data->client_address);
2300 kfree(data->nfs_server.export_path); 2201 kfree(data->nfs_server.export_path);
2301 goto out; 2202 goto out;
@@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2305 /* Get a volume representation */ 2206 /* Get a volume representation */
2306 server = nfs_create_server(data, mntfh); 2207 server = nfs_create_server(data, mntfh);
2307 if (IS_ERR(server)) { 2208 if (IS_ERR(server)) {
2308 error = PTR_ERR(server); 2209 mntroot = ERR_CAST(server);
2309 goto out; 2210 goto out;
2310 } 2211 }
2311 sb_mntdata.server = server; 2212 sb_mntdata.server = server;
@@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2316 /* Get a superblock - note that we may end up sharing one that already exists */ 2217 /* Get a superblock - note that we may end up sharing one that already exists */
2317 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2218 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2318 if (IS_ERR(s)) { 2219 if (IS_ERR(s)) {
2319 error = PTR_ERR(s); 2220 mntroot = ERR_CAST(s);
2320 goto out_err_nosb; 2221 goto out_err_nosb;
2321 } 2222 }
2322 2223
@@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2325 server = NULL; 2226 server = NULL;
2326 } else { 2227 } else {
2327 error = nfs_bdi_register(server); 2228 error = nfs_bdi_register(server);
2328 if (error) 2229 if (error) {
2230 mntroot = ERR_PTR(error);
2329 goto error_splat_bdi; 2231 goto error_splat_bdi;
2232 }
2330 } 2233 }
2331 2234
2332 if (!s->s_root) { 2235 if (!s->s_root) {
@@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2336 s, data ? data->fscache_uniq : NULL, NULL); 2239 s, data ? data->fscache_uniq : NULL, NULL);
2337 } 2240 }
2338 2241
2339 mntroot = nfs_get_root(s, mntfh); 2242 mntroot = nfs_get_root(s, mntfh, dev_name);
2340 if (IS_ERR(mntroot)) { 2243 if (IS_ERR(mntroot))
2341 error = PTR_ERR(mntroot);
2342 goto error_splat_super; 2244 goto error_splat_super;
2343 }
2344 2245
2345 error = security_sb_set_mnt_opts(s, &data->lsm_opts); 2246 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2346 if (error) 2247 if (error)
2347 goto error_splat_root; 2248 goto error_splat_root;
2348 2249
2349 s->s_flags |= MS_ACTIVE; 2250 s->s_flags |= MS_ACTIVE;
2350 mnt->mnt_sb = s;
2351 mnt->mnt_root = mntroot;
2352 error = 0;
2353 2251
2354out: 2252out:
2355 kfree(data->nfs_server.hostname); 2253 kfree(data->nfs_server.hostname);
@@ -2359,7 +2257,7 @@ out:
2359out_free_fh: 2257out_free_fh:
2360 nfs_free_fhandle(mntfh); 2258 nfs_free_fhandle(mntfh);
2361 kfree(data); 2259 kfree(data);
2362 return error; 2260 return mntroot;
2363 2261
2364out_err_nosb: 2262out_err_nosb:
2365 nfs_free_server(server); 2263 nfs_free_server(server);
@@ -2367,6 +2265,7 @@ out_err_nosb:
2367 2265
2368error_splat_root: 2266error_splat_root:
2369 dput(mntroot); 2267 dput(mntroot);
2268 mntroot = ERR_PTR(error);
2370error_splat_super: 2269error_splat_super:
2371 if (server && !s->s_root) 2270 if (server && !s->s_root)
2372 bdi_unregister(&server->backing_dev_info); 2271 bdi_unregister(&server->backing_dev_info);
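The nfs_fs_mount() hunks above are part of the tree-wide conversion from the old ->get_sb() superblock interface to the newer ->mount() one: instead of filling in a caller-supplied vfsmount and returning an int, the filesystem now returns the root dentry (or an ERR_PTR-encoded error) and the VFS finishes setting up the vfsmount itself. For reference, the two file_system_type hooks differ only in signature; this comparison is a reminder, not part of the hunk:

struct file_system_type {
        /* ... */
        /* old interface, being removed tree-wide: */
        int (*get_sb)(struct file_system_type *, int flags,
                      const char *dev_name, void *data, struct vfsmount *mnt);
        /* new interface implemented by nfs_fs_mount() above: */
        struct dentry *(*mount)(struct file_system_type *, int flags,
                                const char *dev_name, void *data);
        /* ... */
};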
@@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2450 nfs_fscache_get_super_cookie(s, NULL, data); 2349 nfs_fscache_get_super_cookie(s, NULL, data);
2451 } 2350 }
2452 2351
2453 mntroot = nfs_get_root(s, data->fh); 2352 mntroot = nfs_get_root(s, data->fh, dev_name);
2454 if (IS_ERR(mntroot)) { 2353 if (IS_ERR(mntroot)) {
2455 error = PTR_ERR(mntroot); 2354 error = PTR_ERR(mntroot);
2456 goto error_splat_super; 2355 goto error_splat_super;
@@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2718 s, data ? data->fscache_uniq : NULL, NULL); 2617 s, data ? data->fscache_uniq : NULL, NULL);
2719 } 2618 }
2720 2619
2721 mntroot = nfs4_get_root(s, mntfh); 2620 mntroot = nfs4_get_root(s, mntfh, dev_name);
2722 if (IS_ERR(mntroot)) { 2621 if (IS_ERR(mntroot)) {
2723 error = PTR_ERR(mntroot); 2622 error = PTR_ERR(mntroot);
2724 goto error_splat_super; 2623 goto error_splat_super;
@@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2771 return root_mnt; 2670 return root_mnt;
2772} 2671}
2773 2672
2774static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2775{
2776 char *page = (char *) __get_free_page(GFP_KERNEL);
2777 char *devname, *tmp;
2778
2779 if (page == NULL)
2780 return;
2781 devname = nfs_path(path->mnt->mnt_devname,
2782 path->mnt->mnt_root, path->dentry,
2783 page, PAGE_SIZE);
2784 if (IS_ERR(devname))
2785 goto out_freepage;
2786 tmp = kstrdup(devname, GFP_KERNEL);
2787 if (tmp == NULL)
2788 goto out_freepage;
2789 kfree(mnt->mnt_devname);
2790 mnt->mnt_devname = tmp;
2791out_freepage:
2792 free_page((unsigned long)page);
2793}
2794
2795struct nfs_referral_count { 2673struct nfs_referral_count {
2796 struct list_head list; 2674 struct list_head list;
2797 const struct task_struct *task; 2675 const struct task_struct *task;
@@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void)
2858 kfree(p); 2736 kfree(p);
2859} 2737}
2860 2738
2861static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2739static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2862 const char *export_path, struct vfsmount *mnt_target) 2740 const char *export_path)
2863{ 2741{
2864 struct nameidata *nd = NULL; 2742 struct nameidata *nd = NULL;
2865 struct mnt_namespace *ns_private; 2743 struct mnt_namespace *ns_private;
2866 struct super_block *s; 2744 struct super_block *s;
2745 struct dentry *dentry;
2867 int ret; 2746 int ret;
2868 2747
2869 nd = kmalloc(sizeof(*nd), GFP_KERNEL); 2748 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2870 if (nd == NULL) 2749 if (nd == NULL)
2871 return -ENOMEM; 2750 return ERR_PTR(-ENOMEM);
2872 2751
2873 ns_private = create_mnt_ns(root_mnt); 2752 ns_private = create_mnt_ns(root_mnt);
2874 ret = PTR_ERR(ns_private); 2753 ret = PTR_ERR(ns_private);
@@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2890 2769
2891 s = nd->path.mnt->mnt_sb; 2770 s = nd->path.mnt->mnt_sb;
2892 atomic_inc(&s->s_active); 2771 atomic_inc(&s->s_active);
2893 mnt_target->mnt_sb = s; 2772 dentry = dget(nd->path.dentry);
2894 mnt_target->mnt_root = dget(nd->path.dentry);
2895
2896 /* Correct the device pathname */
2897 nfs_fix_devname(&nd->path, mnt_target);
2898 2773
2899 path_put(&nd->path); 2774 path_put(&nd->path);
2900 kfree(nd); 2775 kfree(nd);
2901 down_write(&s->s_umount); 2776 down_write(&s->s_umount);
2902 return 0; 2777 return dentry;
2903out_put_mnt_ns: 2778out_put_mnt_ns:
2904 put_mnt_ns(ns_private); 2779 put_mnt_ns(ns_private);
2905out_mntput: 2780out_mntput:
2906 mntput(root_mnt); 2781 mntput(root_mnt);
2907out_err: 2782out_err:
2908 kfree(nd); 2783 kfree(nd);
2909 return ret; 2784 return ERR_PTR(ret);
2910} 2785}
2911 2786
2912static int nfs4_try_mount(int flags, const char *dev_name, 2787static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
2913 struct nfs_parsed_mount_data *data, 2788 struct nfs_parsed_mount_data *data)
2914 struct vfsmount *mnt)
2915{ 2789{
2916 char *export_path; 2790 char *export_path;
2917 struct vfsmount *root_mnt; 2791 struct vfsmount *root_mnt;
2918 int error; 2792 struct dentry *res;
2919 2793
2920 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 2794 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2921 2795
@@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
2925 data->nfs_server.hostname); 2799 data->nfs_server.hostname);
2926 data->nfs_server.export_path = export_path; 2800 data->nfs_server.export_path = export_path;
2927 2801
2928 error = PTR_ERR(root_mnt); 2802 res = ERR_CAST(root_mnt);
2929 if (IS_ERR(root_mnt)) 2803 if (!IS_ERR(root_mnt))
2930 goto out; 2804 res = nfs_follow_remote_path(root_mnt, export_path);
2931
2932 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2933 2805
2934out: 2806 dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
2935 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, 2807 IS_ERR(res) ? PTR_ERR(res) : 0,
2936 error != 0 ? " [error]" : ""); 2808 IS_ERR(res) ? " [error]" : "");
2937 return error; 2809 return res;
2938} 2810}
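All of these return-type conversions lean on the kernel's pointer-encoded error convention: a small negative errno is stuffed into the pointer value itself, so a single return value can carry either a valid dentry or an error. Simplified versions of the helpers used above (see include/linux/err.h for the real, unlikely()-annotated ones):

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error)         { return (void *)error; }
static inline long PTR_ERR(const void *ptr)     { return (long)ptr; }
static inline long IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
/* re-type an error pointer without losing the encoded errno */
static inline void *ERR_CAST(const void *ptr)   { return (void *)ptr; }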
2939 2811
2940/* 2812/*
2941 * Get the superblock for an NFS4 mountpoint 2813 * Get the superblock for an NFS4 mountpoint
2942 */ 2814 */
2943static int nfs4_get_sb(struct file_system_type *fs_type, 2815static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2944 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2816 int flags, const char *dev_name, void *raw_data)
2945{ 2817{
2946 struct nfs_parsed_mount_data *data; 2818 struct nfs_parsed_mount_data *data;
2947 int error = -ENOMEM; 2819 int error = -ENOMEM;
2820 struct dentry *res = ERR_PTR(-ENOMEM);
2948 2821
2949 data = nfs_alloc_parsed_mount_data(4); 2822 data = nfs_alloc_parsed_mount_data(4);
2950 if (data == NULL) 2823 if (data == NULL)
@@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2952 2825
2953 /* Validate the mount data */ 2826 /* Validate the mount data */
2954 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2827 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2955 if (error < 0) 2828 if (error < 0) {
2829 res = ERR_PTR(error);
2956 goto out; 2830 goto out;
2831 }
2957 2832
2958 error = nfs4_try_mount(flags, dev_name, data, mnt); 2833 res = nfs4_try_mount(flags, dev_name, data);
2834 if (IS_ERR(res))
2835 error = PTR_ERR(res);
2959 2836
2960out: 2837out:
2961 kfree(data->client_address); 2838 kfree(data->client_address);
@@ -2964,9 +2841,9 @@ out:
2964 kfree(data->fscache_uniq); 2841 kfree(data->fscache_uniq);
2965out_free_data: 2842out_free_data:
2966 kfree(data); 2843 kfree(data);
2967 dprintk("<-- nfs4_get_sb() = %d%s\n", error, 2844 dprintk("<-- nfs4_mount() = %d%s\n", error,
2968 error != 0 ? " [error]" : ""); 2845 error != 0 ? " [error]" : "");
2969 return error; 2846 return res;
2970} 2847}
2971 2848
2972static void nfs4_kill_super(struct super_block *sb) 2849static void nfs4_kill_super(struct super_block *sb)
@@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
3033 nfs_fscache_get_super_cookie(s, NULL, data); 2910 nfs_fscache_get_super_cookie(s, NULL, data);
3034 } 2911 }
3035 2912
3036 mntroot = nfs4_get_root(s, data->fh); 2913 mntroot = nfs4_get_root(s, data->fh, dev_name);
3037 if (IS_ERR(mntroot)) { 2914 if (IS_ERR(mntroot)) {
3038 error = PTR_ERR(mntroot); 2915 error = PTR_ERR(mntroot);
3039 goto error_splat_super; 2916 goto error_splat_super;
@@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3120 nfs_fscache_get_super_cookie(s, NULL, data); 2997 nfs_fscache_get_super_cookie(s, NULL, data);
3121 } 2998 }
3122 2999
3123 mntroot = nfs4_get_root(s, mntfh); 3000 mntroot = nfs4_get_root(s, mntfh, dev_name);
3124 if (IS_ERR(mntroot)) { 3001 if (IS_ERR(mntroot)) {
3125 error = PTR_ERR(mntroot); 3002 error = PTR_ERR(mntroot);
3126 goto error_splat_super; 3003 goto error_splat_super;
@@ -3160,16 +3037,15 @@ error_splat_bdi:
3160/* 3037/*
3161 * Create an NFS4 server record on referral traversal 3038 * Create an NFS4 server record on referral traversal
3162 */ 3039 */
3163static int nfs4_referral_get_sb(struct file_system_type *fs_type, 3040static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
3164 int flags, const char *dev_name, void *raw_data, 3041 int flags, const char *dev_name, void *raw_data)
3165 struct vfsmount *mnt)
3166{ 3042{
3167 struct nfs_clone_mount *data = raw_data; 3043 struct nfs_clone_mount *data = raw_data;
3168 char *export_path; 3044 char *export_path;
3169 struct vfsmount *root_mnt; 3045 struct vfsmount *root_mnt;
3170 int error; 3046 struct dentry *res;
3171 3047
3172 dprintk("--> nfs4_referral_get_sb()\n"); 3048 dprintk("--> nfs4_referral_mount()\n");
3173 3049
3174 export_path = data->mnt_path; 3050 export_path = data->mnt_path;
3175 data->mnt_path = "/"; 3051 data->mnt_path = "/";
@@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
3178 flags, data, data->hostname); 3054 flags, data, data->hostname);
3179 data->mnt_path = export_path; 3055 data->mnt_path = export_path;
3180 3056
3181 error = PTR_ERR(root_mnt); 3057 res = ERR_CAST(root_mnt);
3182 if (IS_ERR(root_mnt)) 3058 if (!IS_ERR(root_mnt))
3183 goto out; 3059 res = nfs_follow_remote_path(root_mnt, export_path);
3184 3060 dprintk("<-- nfs4_referral_mount() = %ld%s\n",
3185 error = nfs_follow_remote_path(root_mnt, export_path, mnt); 3061 IS_ERR(res) ? PTR_ERR(res) : 0,
3186out: 3062 IS_ERR(res) ? " [error]" : "");
3187 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, 3063 return res;
3188 error != 0 ? " [error]" : "");
3189 return error;
3190} 3064}
3191 3065
3192#endif /* CONFIG_NFS_V4 */ 3066#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..8d6864c2a5fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret = 0;
151 void *devname_garbage = NULL;
151 152
152 /* 153 /*
153 * Hey, we raced with lookup... See if we need to transfer 154 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
157 spin_lock(&alias->d_lock); 158 spin_lock(&alias->d_lock);
158 if (alias->d_inode != NULL && 159 if (alias->d_inode != NULL &&
159 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata;
160 alias->d_fsdata = data; 162 alias->d_fsdata = data;
161 alias->d_flags |= DCACHE_NFSFS_RENAMED; 163 alias->d_flags |= DCACHE_NFSFS_RENAMED;
162 ret = 1; 164 ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
164 spin_unlock(&alias->d_lock); 166 spin_unlock(&alias->d_lock);
165 nfs_dec_sillycount(dir); 167 nfs_dec_sillycount(dir);
166 dput(alias); 168 dput(alias);
169 /*
170 * If we'd displaced old cached devname, free it. At that
171 * point dentry is definitely not a root, so we won't need
172 * that anymore.
173 */
174 if (devname_garbage)
175 kfree(devname_garbage);
167 return ret; 176 return ret;
168 } 177 }
169 data->dir = igrab(dir); 178 data->dir = igrab(dir);
@@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
180 task_setup_data.rpc_client = NFS_CLIENT(dir); 189 task_setup_data.rpc_client = NFS_CLIENT(dir);
181 task = rpc_run_task(&task_setup_data); 190 task = rpc_run_task(&task_setup_data);
182 if (!IS_ERR(task)) 191 if (!IS_ERR(task))
183 rpc_put_task(task); 192 rpc_put_task_async(task);
184 return 1; 193 return 1;
185} 194}
186 195
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
252{ 261{
253 struct nfs_unlinkdata *data; 262 struct nfs_unlinkdata *data;
254 int status = -ENOMEM; 263 int status = -ENOMEM;
264 void *devname_garbage = NULL;
255 265
256 data = kzalloc(sizeof(*data), GFP_KERNEL); 266 data = kzalloc(sizeof(*data), GFP_KERNEL);
257 if (data == NULL) 267 if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
269 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 279 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
270 goto out_unlock; 280 goto out_unlock;
271 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 281 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
282 devname_garbage = dentry->d_fsdata;
272 dentry->d_fsdata = data; 283 dentry->d_fsdata = data;
273 spin_unlock(&dentry->d_lock); 284 spin_unlock(&dentry->d_lock);
285 /*
286 * If we'd displaced old cached devname, free it. At that
287 * point dentry is definitely not a root, so we won't need
288 * that anymore.
289 */
290 if (devname_garbage)
291 kfree(devname_garbage);
274 return 0; 292 return 0;
275out_unlock: 293out_unlock:
276 spin_unlock(&dentry->d_lock); 294 spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 317 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
300 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 318 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
301 data = dentry->d_fsdata; 319 data = dentry->d_fsdata;
320 dentry->d_fsdata = NULL;
302 } 321 }
303 spin_unlock(&dentry->d_lock); 322 spin_unlock(&dentry->d_lock);
304 323
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
315 struct nfs_unlinkdata *data = dentry->d_fsdata; 334 struct nfs_unlinkdata *data = dentry->d_fsdata;
316 335
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 336 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
337 dentry->d_fsdata = NULL;
318 spin_unlock(&dentry->d_lock); 338 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data); 339 nfs_free_unlinkdata(data);
320 return; 340 return;
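Both unlink.c hunks follow the same shape: dentry->d_fsdata can now hold either a cached devname string or sillyrename bookkeeping, so whenever the unlink code installs its own pointer it captures whatever was there while holding d_lock and frees it only after the lock is dropped, keeping the critical section short. Generic form of the pattern (illustrative names only):

        spin_lock(&dentry->d_lock);
        old = dentry->d_fsdata;         /* may be a cached devname */
        dentry->d_fsdata = new;         /* install the unlink/sillyrename data */
        spin_unlock(&dentry->d_lock);
        kfree(old);                     /* free the displaced pointer outside the lock */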
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046cb..47a3ad63e0d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 97
97static void nfs_writedata_release(struct nfs_write_data *wdata) 98static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 99{
100 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 101 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 102 nfs_writedata_free(wdata);
101} 103}
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 783 return RPC_PRIORITY_NORMAL;
782} 784}
783 785
784/* 786int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 787 struct rpc_clnt *clnt,
786 */ 788 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 789 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 790{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 791 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 792 int priority = flush_task_priority(how);
795 struct rpc_task *task; 793 struct rpc_task *task;
796 struct rpc_message msg = { 794 struct rpc_message msg = {
797 .rpc_argp = &data->args, 795 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 796 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 797 .rpc_cred = data->cred,
800 }; 798 };
801 struct rpc_task_setup task_setup_data = { 799 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 800 .rpc_client = clnt,
803 .task = &data->task, 801 .task = &data->task,
804 .rpc_message = &msg, 802 .rpc_message = &msg,
805 .callback_ops = call_ops, 803 .callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 808 };
811 int ret = 0; 809 int ret = 0;
812 810
811 /* Set up the initial task struct. */
812 NFS_PROTO(inode)->write_setup(data, &msg);
813
814 dprintk("NFS: %5u initiated write call "
815 "(req %s/%lld, %u bytes @ offset %llu)\n",
816 data->task.tk_pid,
817 inode->i_sb->s_id,
818 (long long)NFS_FILEID(inode),
819 data->args.count,
820 (unsigned long long)data->args.offset);
821
822 task = rpc_run_task(&task_setup_data);
823 if (IS_ERR(task)) {
824 ret = PTR_ERR(task);
825 goto out;
826 }
827 if (how & FLUSH_SYNC) {
828 ret = rpc_wait_for_completion_task(task);
829 if (ret == 0)
830 ret = task->tk_status;
831 }
832 rpc_put_task(task);
833out:
834 return ret;
835}
836EXPORT_SYMBOL_GPL(nfs_initiate_write);
837
838/*
839 * Set up the argument/result storage required for the RPC call.
840 */
841static int nfs_write_rpcsetup(struct nfs_page *req,
842 struct nfs_write_data *data,
843 const struct rpc_call_ops *call_ops,
844 unsigned int count, unsigned int offset,
845 struct pnfs_layout_segment *lseg,
846 int how)
847{
848 struct inode *inode = req->wb_context->path.dentry->d_inode;
849
813 /* Set up the RPC argument and reply structs 850 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 851 * NB: take care not to mess about with data->commit et al. */
815 852
816 data->req = req; 853 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 854 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 855 data->cred = req->wb_context->cred;
856 data->lseg = get_lseg(lseg);
819 857
820 data->args.fh = NFS_FH(inode); 858 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 859 data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 874 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 875 nfs_fattr_init(&data->fattr);
838 876
839 /* Set up the initial task struct. */ 877 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 878 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 879 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 880
850 task = rpc_run_task(&task_setup_data); 881 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 882}
864 883
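The net effect of this hunk is a two-stage write path: nfs_write_rpcsetup() only builds the argument/result structures and gives pNFS a chance to drive the I/O, while the new, exported nfs_initiate_write() owns the actual RPC dispatch and can therefore be reused by a pNFS layout driver with a data-server rpc_clnt instead of NFS_CLIENT(inode). Condensed restatement of the resulting flow (pnfs_try_to_write_data() lives in the pNFS code outside this file; its role here is inferred from the call site):

static int nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data,
                              const struct rpc_call_ops *call_ops,
                              unsigned int count, unsigned int offset,
                              struct pnfs_layout_segment *lseg, int how)
{
        /* ... fill data->args and data->res, take a reference on lseg ... */
        if (data->lseg &&
            (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
                return 0;       /* a layout driver issued the WRITE to a data server */

        /* fall back to the regular client pointing at the server */
        return nfs_initiate_write(data, NFS_CLIENT(data->inode), call_ops, how);
}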
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 884/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 898 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 899 * contiguous dirty area on one page.
881 */ 900 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 901static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 902{
884 struct nfs_page *req = nfs_list_entry(head->next); 903 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 904 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 905 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 906 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 907 unsigned int offset;
889 int requests = 0; 908 int requests = 0;
890 int ret = 0; 909 int ret = 0;
910 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 911 LIST_HEAD(list);
892 912
893 nfs_list_remove_request(req); 913 nfs_list_remove_request(req);
894 914
895 nbytes = count; 915 nbytes = desc->pg_count;
896 do { 916 do {
897 size_t len = min(nbytes, wsize); 917 size_t len = min(nbytes, wsize);
898 918
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 925 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 926 atomic_set(&req->wb_complete, requests);
907 927
928 BUG_ON(desc->pg_lseg);
929 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 930 ClearPageError(page);
909 offset = 0; 931 offset = 0;
910 nbytes = count; 932 nbytes = desc->pg_count;
911 do { 933 do {
912 int ret2; 934 int ret2;
913 935
@@ -919,13 +941,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 941 if (nbytes < wsize)
920 wsize = nbytes; 942 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 943 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 944 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 945 if (ret == 0)
924 ret = ret2; 946 ret = ret2;
925 offset += wsize; 947 offset += wsize;
926 nbytes -= wsize; 948 nbytes -= wsize;
927 } while (nbytes != 0); 949 } while (nbytes != 0);
928 950
951 put_lseg(lseg);
952 desc->pg_lseg = NULL;
929 return ret; 953 return ret;
930 954
931out_bad: 955out_bad:
@@ -946,16 +970,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 970 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 971 * that has been written but not committed.
948 */ 972 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 973static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 974{
951 struct nfs_page *req; 975 struct nfs_page *req;
952 struct page **pages; 976 struct page **pages;
953 struct nfs_write_data *data; 977 struct nfs_write_data *data;
978 struct list_head *head = &desc->pg_list;
979 struct pnfs_layout_segment *lseg = desc->pg_lseg;
980 int ret;
954 981
955 data = nfs_writedata_alloc(npages); 982 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 983 desc->pg_count));
957 goto out_bad; 984 if (!data) {
958 985 while (!list_empty(head)) {
986 req = nfs_list_entry(head->next);
987 nfs_list_remove_request(req);
988 nfs_redirty_request(req);
989 }
990 ret = -ENOMEM;
991 goto out;
992 }
959 pages = data->pagevec; 993 pages = data->pagevec;
960 while (!list_empty(head)) { 994 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 995 req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 999 *pages++ = req->wb_page;
966 } 1000 }
967 req = nfs_list_entry(data->pages.next); 1001 req = nfs_list_entry(data->pages.next);
1002 if ((!lseg) && list_is_singular(&data->pages))
1003 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
968 1004
969 /* Set up the argument struct */ 1005 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1006 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1007out:
972 while (!list_empty(head)) { 1008 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1009 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1010 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1011}
979 1012
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1013static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1015{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1016 size_t wsize = NFS_SERVER(inode)->wsize;
984 1017
1018 pnfs_pageio_init_write(pgio, inode);
1019
985 if (wsize < PAGE_CACHE_SIZE) 1020 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1021 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1022 else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1167/*
1133 * This function is called when the WRITE call is complete. 1168 * This function is called when the WRITE call is complete.
1134 */ 1169 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1170void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1171{
1137 struct nfs_writeargs *argp = &data->args; 1172 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1173 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1186 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1187 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1188 if (status != 0)
1154 return status; 1189 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1190 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1191
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1201 */
1167 static unsigned long complain; 1202 static unsigned long complain;
1168 1203
1204 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1205 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1206 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1207 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1222 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1223 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1224 /* Resend from where the server left off */
1225 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1226 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1227 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1228 argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1233 argp->stable = NFS_FILE_SYNC;
1197 } 1234 }
1198 nfs_restart_rpc(task, server->nfs_client); 1235 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1236 return;
1200 } 1237 }
1201 if (time_before(complain, jiffies)) { 1238 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1239 printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1244 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1245 task->tk_status = -EIO;
1209 } 1246 }
1210 return 0; 1247 return;
1211} 1248}
1212 1249
1213 1250
@@ -1292,6 +1329,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1292 task = rpc_run_task(&task_setup_data); 1329 task = rpc_run_task(&task_setup_data);
1293 if (IS_ERR(task)) 1330 if (IS_ERR(task))
1294 return PTR_ERR(task); 1331 return PTR_ERR(task);
1332 if (how & FLUSH_SYNC)
1333 rpc_wait_for_completion_task(task);
1295 rpc_put_task(task); 1334 rpc_put_task(task);
1296 return 0; 1335 return 0;
1297} 1336}
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
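do_open() now delegates the whole lookup-and-open sequence to the new file_open_root() helper, which walks the path relative to the given root and applies the usual permission checks internally, replacing the open-coded vfs_path_lookup()/may_open()/dentry_open() chain removed above. Its prototype at the time of this series is roughly the following (shown for context only, not part of this hunk):

struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
                            const char *filename, int flags);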
50 37
51static struct { 38static struct {
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cde36cb0f348..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
432 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
433 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
434 */ 434 */
435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
436 if (unlikely(p == NULL)) 436 if (unlikely(p == NULL))
437 goto out_overflow; 437 goto out_overflow;
438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
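The enlarged reservation matches the fixed-size tail of a CB_SEQUENCE4resok reply: a 16-byte session ID followed by four 32-bit fields (sequence ID, slot ID, highest slot ID, target highest slot ID), i.e. 16 + 4 + 4 + 4 + 4 = 32 bytes. The old call reserved only 16 + 4 + 4 = 24 bytes, so decoding the last two fields could read past the area that xdr_inline_decode() had checked.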
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 54b60bfceb8d..7b566ec14e18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
2445static struct nfs4_delegation * 2445static struct nfs4_delegation *
2446find_delegation_file(struct nfs4_file *fp, stateid_t *stid) 2446find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2447{ 2447{
2448 struct nfs4_delegation *dp = NULL; 2448 struct nfs4_delegation *dp;
2449 2449
2450 spin_lock(&recall_lock); 2450 spin_lock(&recall_lock);
2451 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { 2451 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2452 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) 2452 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2453 break; 2453 spin_unlock(&recall_lock);
2454 } 2454 return dp;
2455 }
2455 spin_unlock(&recall_lock); 2456 spin_unlock(&recall_lock);
2456 return dp; 2457 return NULL;
2457} 2458}
2458 2459
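The rewritten lookup returns from inside the loop because, after a list_for_each_entry() that runs to completion, the cursor no longer points at a real element: it is computed from the list head itself. Returning it, as the old code effectively did when no delegation matched, hands back a bogus pointer rather than NULL. Generic shape of the safe pattern (illustrative types and names):

struct foo {
        struct list_head list;
        int key;
};

static struct foo *find_foo(struct list_head *head, int key)
{
        struct foo *f;

        list_for_each_entry(f, head, list)
                if (f->key == key)
                        return f;       /* found: return while f is valid */

        return NULL;                    /* never "return f" after the loop */
}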
2459int share_access_to_flags(u32 share_access) 2460int share_access_to_flags(u32 share_access)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1275b8655070..615f0a9f0600 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1142 1142
1143 u32 dummy; 1143 u32 dummy;
1144 char *machine_name; 1144 char *machine_name;
1145 int i; 1145 int i, j;
1146 int nr_secflavs; 1146 int nr_secflavs;
1147 1147
1148 READ_BUF(16); 1148 READ_BUF(16);
@@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1215 READ_BUF(4); 1215 READ_BUF(4);
1216 READ32(dummy); 1216 READ32(dummy);
1217 READ_BUF(dummy * 4); 1217 READ_BUF(dummy * 4);
1218 for (i = 0; i < dummy; ++i) 1218 for (j = 0; j < dummy; ++j)
1219 READ32(dummy); 1219 READ32(dummy);
1220 break; 1220 break;
1221 case RPC_AUTH_GSS: 1221 case RPC_AUTH_GSS:
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..85f7baa15f5d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -35,11 +35,6 @@
35#include "btnode.h" 35#include "btnode.h"
36 36
37 37
38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{
40 nilfs_mapping_init_once(btnc);
41}
42
43static const struct address_space_operations def_btnode_aops = { 38static const struct address_space_operations def_btnode_aops = {
44 .sync_page = block_sync_page, 39 .sync_page = block_sync_page,
45}; 40};
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
37 struct buffer_head *newbh; 37 struct buffer_head *newbh;
38}; 38};
39 39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); 40void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 41void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 42struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a0babd2bff6a 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
454 struct backing_dev_info *bdi = inode->i_sb->s_bdi; 454 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
455 455
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 456 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data); 457 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes); 459 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
461 mi->mi_shadow = shadow; 461 mi->mi_shadow = shadow;
462 return 0; 462 return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..161791d26458 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); 397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
398 if (!new_de) 398 if (!new_de)
399 goto out_dir; 399 goto out_dir;
400 inc_nlink(old_inode);
401 nilfs_set_link(new_dir, new_de, new_page, old_inode); 400 nilfs_set_link(new_dir, new_de, new_page, old_inode);
402 nilfs_mark_inode_dirty(new_dir); 401 nilfs_mark_inode_dirty(new_dir);
403 new_inode->i_ctime = CURRENT_TIME; 402 new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
411 if (new_dir->i_nlink >= NILFS_LINK_MAX) 410 if (new_dir->i_nlink >= NILFS_LINK_MAX)
412 goto out_dir; 411 goto out_dir;
413 } 412 }
414 inc_nlink(old_inode);
415 err = nilfs_add_link(new_dentry, old_inode); 413 err = nilfs_add_link(new_dentry, old_inode);
416 if (err) { 414 if (err)
417 drop_nlink(old_inode);
418 nilfs_mark_inode_dirty(old_inode);
419 goto out_dir; 415 goto out_dir;
420 }
421 if (dir_de) { 416 if (dir_de) {
422 inc_nlink(new_dir); 417 inc_nlink(new_dir);
423 nilfs_mark_inode_dirty(new_dir); 418 nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
431 old_inode->i_ctime = CURRENT_TIME; 426 old_inode->i_ctime = CURRENT_TIME;
432 427
433 nilfs_delete_entry(old_de, old_page); 428 nilfs_delete_entry(old_de, old_page);
434 drop_nlink(old_inode);
435 429
436 if (dir_de) { 430 if (dir_de) {
437 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..a585b35fd6bc 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi,
510 const struct address_space_operations *aops) 497 const struct address_space_operations *aops)
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..2a00953ebd5f 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *);
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping, 64void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 65 struct backing_dev_info *bdi,
67 const struct address_space_operations *aops); 66 const struct address_space_operations *aops);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..2de9f636792a 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
430 nilfs_segctor_map_segsum_entry( 430 nilfs_segctor_map_segsum_entry(
431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); 431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
432 432
433 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) 433 if (NILFS_I(inode)->i_root &&
434 !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
434 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 435 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
435 /* skip finfo */ 436 /* skip finfo */
436} 437}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e1..1673b3d99842 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj)
1279#ifdef CONFIG_NILFS_XATTR 1279#ifdef CONFIG_NILFS_XATTR
1280 init_rwsem(&ii->xattr_sem); 1280 init_rwsem(&ii->xattr_sem);
1281#endif 1281#endif
1282 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1282 address_space_init_once(&ii->i_btnode_cache);
1283 ii->i_bmap = &ii->i_bmap_data; 1283 ii->i_bmap = &ii->i_bmap_data;
1284 inode_init_once(&ii->vfs_inode); 1284 inode_init_once(&ii->vfs_inode);
1285} 1285}
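With this series the nilfs-private initializer goes away in favour of a generic address_space_init_once() exported by the VFS; judging by the removed nilfs_mapping_init_once() body above, the generic helper performs essentially the same one-time setup:

void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
        spin_lock_init(&mapping->tree_lock);
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        spin_lock_init(&mapping->i_mmap_lock);
        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
}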
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..7eb90403fc8a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb; 57 struct ocfs2_super *osb;
58 58
59 if (nd->flags & LOOKUP_RCU) 59 if (nd && nd->flags & LOOKUP_RCU)
60 return -ECHILD; 60 return -ECHILD;
61 61
62 inode = dentry->d_inode; 62 inode = dentry->d_inode;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..254652a9b542 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
197 dentry->d_name.len, dentry->d_name.name, 197 dentry->d_name.len, dentry->d_name.name,
198 fh, len, connectable); 198 fh, len, connectable);
199 199
200 if (len < 3 || (connectable && len < 6)) { 200 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 201 *max_len = 6;
202 type = 255;
203 goto bail;
204 } else if (len < 3) {
205 *max_len = 3;
202 type = 255; 206 type = 255;
203 goto bail; 207 goto bail;
204 } 208 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..6180da1e37e6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
405 ocfs2_quota_trans_credits(sb); 405 ocfs2_quota_trans_credits(sb);
406} 406}
407 407
408/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 408/* data block for new dir/symlink, allocation of directory block, dx_root
409 * bitmap block for the new bit) dx_root update for free list */ 409 * update for free list */
410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
411 411
412static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 412static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
413{ 413{
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..d6c25d76b537 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -293,7 +293,7 @@ static int ocfs2_mknod(struct inode *dir,
293 } 293 }
294 294
295 /* get security xattr */ 295 /* get security xattr */
296 status = ocfs2_init_security_get(inode, dir, &si); 296 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
297 if (status) { 297 if (status) {
298 if (status == -EOPNOTSUPP) 298 if (status == -EOPNOTSUPP)
299 si.enable = 0; 299 si.enable = 0;
@@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir,
1665 } 1665 }
1666 1666
1667 /* get security xattr */ 1667 /* get security xattr */
1668 status = ocfs2_init_security_get(inode, dir, &si); 1668 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
1669 if (status) { 1669 if (status) {
1670 if (status == -EOPNOTSUPP) 1670 if (status == -EOPNOTSUPP)
1671 si.enable = 0; 1671 si.enable = 0;
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95d..d5ab56cbe5c5 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
114extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
116 116
117int ocfs2_quota_setup(void);
118void ocfs2_quota_shutdown(void);
119
120#endif /* _OCFS2_QUOTA_H */ 117#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..a73f64166481 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -63,8 +63,6 @@
63 * write to gf 63 * write to gf
64 */ 64 */
65 65
66static struct workqueue_struct *ocfs2_quota_wq = NULL;
67
68static void qsync_work_fn(struct work_struct *work); 66static void qsync_work_fn(struct work_struct *work);
69 67
70static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) 68static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
400 OCFS2_QBLK_RESERVED_SPACE; 398 OCFS2_QBLK_RESERVED_SPACE;
401 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 399 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
402 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 400 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
403 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 401 schedule_delayed_work(&oinfo->dqi_sync_work,
404 msecs_to_jiffies(oinfo->dqi_syncms)); 402 msecs_to_jiffies(oinfo->dqi_syncms));
405 403
406out_err: 404out_err:
407 mlog_exit(status); 405 mlog_exit(status);
@@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work)
635 struct super_block *sb = oinfo->dqi_gqinode->i_sb; 633 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
636 634
637 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 635 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
638 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 636 schedule_delayed_work(&oinfo->dqi_sync_work,
639 msecs_to_jiffies(oinfo->dqi_syncms)); 637 msecs_to_jiffies(oinfo->dqi_syncms));
640} 638}
641 639
642/* 640/*
@@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = {
923 .alloc_dquot = ocfs2_alloc_dquot, 921 .alloc_dquot = ocfs2_alloc_dquot,
924 .destroy_dquot = ocfs2_destroy_dquot, 922 .destroy_dquot = ocfs2_destroy_dquot,
925}; 923};
926
927int ocfs2_quota_setup(void)
928{
929 ocfs2_quota_wq = create_workqueue("o2quot");
930 if (!ocfs2_quota_wq)
931 return -ENOMEM;
932 return 0;
933}
934
935void ocfs2_quota_shutdown(void)
936{
937 if (ocfs2_quota_wq) {
938 flush_workqueue(ocfs2_quota_wq);
939 destroy_workqueue(ocfs2_quota_wq);
940 ocfs2_quota_wq = NULL;
941 }
942}
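The quota sync work above loses its private "o2quot" workqueue and is queued on the shared kernel workqueue instead. A compact sketch of that pattern with made-up names (the real code keeps the delayed work inside its per-type quota info and cancels it when the quota file is torn down):

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static void foo_sync_fn(struct work_struct *work);
    static DECLARE_DELAYED_WORK(foo_sync_work, foo_sync_fn);

    static void foo_sync_fn(struct work_struct *work)
    {
            /* ... periodic sync ... */
            schedule_delayed_work(&foo_sync_work, msecs_to_jiffies(10000));
    }

    static void foo_stop_sync(void)
    {
            /* replaces flush_workqueue()/destroy_workqueue() on the private queue */
            cancel_delayed_work_sync(&foo_sync_work);
    }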
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..c384d634872a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3228 u32 num_clusters, unsigned int e_flags) 3228 u32 num_clusters, unsigned int e_flags)
3229{ 3229{
3230 int ret, delete, index, credits = 0; 3230 int ret, delete, index, credits = 0;
3231 u32 new_bit, new_len; 3231 u32 new_bit, new_len, orig_num_clusters;
3232 unsigned int set_len; 3232 unsigned int set_len;
3233 struct ocfs2_super *osb = OCFS2_SB(sb); 3233 struct ocfs2_super *osb = OCFS2_SB(sb);
3234 handle_t *handle; 3234 handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3261 goto out; 3261 goto out;
3262 } 3262 }
3263 3263
3264 orig_num_clusters = num_clusters;
3265
3264 while (num_clusters) { 3266 while (num_clusters) {
3265 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3267 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3266 p_cluster, num_clusters, 3268 p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3348 * in write-back mode. 3350 * in write-back mode.
3349 */ 3351 */
3350 if (context->get_clusters == ocfs2_di_get_clusters) { 3352 if (context->get_clusters == ocfs2_di_get_clusters) {
3351 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3353 ret = ocfs2_cow_sync_writeback(sb, context, cpos,
3354 orig_num_clusters);
3352 if (ret) 3355 if (ret)
3353 mlog_errno(ret); 3356 mlog_errno(ret);
3354 } 3357 }
@@ -4325,7 +4328,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4325 4328
4326 /* If the security isn't preserved, we need to re-initialize them. */ 4329 /* If the security isn't preserved, we need to re-initialize them. */
4327 if (!preserve) { 4330 if (!preserve) {
4328 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4331 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4332 &new_dentry->d_name);
4329 if (error) 4333 if (error)
4330 mlog_errno(error); 4334 mlog_errno(error);
4331 } 4335 }
@@ -4376,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4376 if (IS_ERR(s)) 4380 if (IS_ERR(s))
4377 return PTR_ERR(s); 4381 return PTR_ERR(s);
4378 4382
4379 error = path_lookup(s, LOOKUP_PARENT, nd); 4383 error = kern_path_parent(s, nd);
4380 if (error) 4384 if (error)
4381 putname(s); 4385 putname(s);
4382 else 4386 else
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..236ed1bdca2c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1316 struct mount_options *mopt, 1316 struct mount_options *mopt,
1317 int is_remount) 1317 int is_remount)
1318{ 1318{
1319 int status; 1319 int status, user_stack = 0;
1320 char *p; 1320 char *p;
1321 u32 tmp; 1321 u32 tmp;
1322 1322
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1459 memcpy(mopt->cluster_stack, args[0].from, 1459 memcpy(mopt->cluster_stack, args[0].from,
1460 OCFS2_STACK_LABEL_LEN); 1460 OCFS2_STACK_LABEL_LEN);
1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1462 /*
1463 * Open code the memcmp here as we don't have
1464 * an osb to pass to
1465 * ocfs2_userspace_stack().
1466 */
1467 if (memcmp(mopt->cluster_stack,
1468 OCFS2_CLASSIC_CLUSTER_STACK,
1469 OCFS2_STACK_LABEL_LEN))
1470 user_stack = 1;
1462 break; 1471 break;
1463 case Opt_inode64: 1472 case Opt_inode64:
1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1473 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
1514 } 1523 }
1515 } 1524 }
1516 1525
1517 /* Ensure only one heartbeat mode */ 1526 if (user_stack == 0) {
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 1527 /* Ensure only one heartbeat mode */
1519 OCFS2_MOUNT_HB_NONE); 1528 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1520 if (hweight32(tmp) != 1) { 1529 OCFS2_MOUNT_HB_GLOBAL |
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1530 OCFS2_MOUNT_HB_NONE);
1522 status = 0; 1531 if (hweight32(tmp) != 1) {
1523 goto bail; 1532 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1533 status = 0;
1534 goto bail;
1535 }
1524 } 1536 }
1525 1537
1526 status = 1; 1538 status = 1;
@@ -1645,16 +1657,11 @@ static int __init ocfs2_init(void)
1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1657 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1646 } 1658 }
1647 1659
1648 status = ocfs2_quota_setup();
1649 if (status)
1650 goto leave;
1651
1652 ocfs2_set_locking_protocol(); 1660 ocfs2_set_locking_protocol();
1653 1661
1654 status = register_quota_format(&ocfs2_quota_format); 1662 status = register_quota_format(&ocfs2_quota_format);
1655leave: 1663leave:
1656 if (status < 0) { 1664 if (status < 0) {
1657 ocfs2_quota_shutdown();
1658 ocfs2_free_mem_caches(); 1665 ocfs2_free_mem_caches();
1659 exit_ocfs2_uptodate_cache(); 1666 exit_ocfs2_uptodate_cache();
1660 } 1667 }
@@ -1671,8 +1678,6 @@ static void __exit ocfs2_exit(void)
1671{ 1678{
1672 mlog_entry_void(); 1679 mlog_entry_void();
1673 1680
1674 ocfs2_quota_shutdown();
1675
1676 if (ocfs2_wq) { 1681 if (ocfs2_wq) {
1677 flush_workqueue(ocfs2_wq); 1682 flush_workqueue(ocfs2_wq);
1678 destroy_workqueue(ocfs2_wq); 1683 destroy_workqueue(ocfs2_wq);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..6bb602486c6b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,7 +7185,8 @@ out:
7185 * must not hold any lock except i_mutex. 7185 * must not hold any lock except i_mutex.
7186 */ 7186 */
7187int ocfs2_init_security_and_acl(struct inode *dir, 7187int ocfs2_init_security_and_acl(struct inode *dir,
7188 struct inode *inode) 7188 struct inode *inode,
7189 const struct qstr *qstr)
7189{ 7190{
7190 int ret = 0; 7191 int ret = 0;
7191 struct buffer_head *dir_bh = NULL; 7192 struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7193 .enable = 1, 7194 .enable = 1,
7194 }; 7195 };
7195 7196
7196 ret = ocfs2_init_security_get(inode, dir, &si); 7197 ret = ocfs2_init_security_get(inode, dir, qstr, &si);
7197 if (!ret) { 7198 if (!ret) {
7198 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, 7199 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7199 si.name, si.value, si.value_len, 7200 si.name, si.value, si.value_len,
@@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7261 7262
7262int ocfs2_init_security_get(struct inode *inode, 7263int ocfs2_init_security_get(struct inode *inode,
7263 struct inode *dir, 7264 struct inode *dir,
7265 const struct qstr *qstr,
7264 struct ocfs2_security_xattr_info *si) 7266 struct ocfs2_security_xattr_info *si)
7265{ 7267{
7266 /* check whether ocfs2 support feature xattr */ 7268 /* check whether ocfs2 support feature xattr */
7267 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7269 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7268 return -EOPNOTSUPP; 7270 return -EOPNOTSUPP;
7269 return security_inode_init_security(inode, dir, &si->name, &si->value, 7271 return security_inode_init_security(inode, dir, qstr, &si->name,
7270 &si->value_len); 7272 &si->value, &si->value_len);
7271} 7273}
7272 7274
7273int ocfs2_init_security_set(handle_t *handle, 7275int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65b..d63cfb72316b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
57 struct ocfs2_dinode *di); 57 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 59int ocfs2_init_security_get(struct inode *, struct inode *,
60 const struct qstr *,
60 struct ocfs2_security_xattr_info *); 61 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *, 62int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *, 63 struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *new_bh, 95 struct buffer_head *new_bh,
95 bool preserve_security); 96 bool preserve_security);
96int ocfs2_init_security_and_acl(struct inode *dir, 97int ocfs2_init_security_and_acl(struct inode *dir,
97 struct inode *inode); 98 struct inode *inode,
99 const struct qstr *qstr);
98#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
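The qstr plumbing above lets an LSM see the name being created when it computes the initial security label. A minimal sketch of the new calling convention, assuming the six-argument security_inode_init_security() signature used by these hunks; "foo" is hypothetical and error handling is trimmed:

    #include <linux/security.h>
    #include <linux/slab.h>

    static int foo_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr)
    {
            char *name = NULL;
            void *value = NULL;
            size_t len;
            int err;

            err = security_inode_init_security(inode, dir, qstr,
                                               &name, &value, &len);
            if (err)
                    return err == -EOPNOTSUPP ? 0 : err;

            /* ... persist (name, value, len) as a security.* xattr ... */
            kfree(name);
            kfree(value);
            return 0;
    }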
diff --git a/fs/open.c b/fs/open.c
index 5a2c6ebc22b5..f83ca80cc59a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
233 233
234 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
235 return -EBADF; 235 return -EBADF;
236
 237        /* It's not possible to punch a hole in an append-only file */
238 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
239 return -EPERM;
240
241 if (IS_IMMUTABLE(inode))
242 return -EPERM;
243
236 /* 244 /*
237 * Revalidate the write permissions, in case security policy has 245 * Revalidate the write permissions, in case security policy has
238 * changed since the files were opened. 246 * changed since the files were opened.
@@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
565{ 573{
566 struct path path; 574 struct path path;
567 int error = -EINVAL; 575 int error = -EINVAL;
568 int follow; 576 int lookup_flags;
569 577
570 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
571 goto out; 579 goto out;
572 580
573 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
574 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
575 if (error) 585 if (error)
576 goto out; 586 goto out;
577 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
661 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
662 const struct cred *cred) 672 const struct cred *cred)
663{ 673{
674 static const struct file_operations empty_fops = {};
664 struct inode *inode; 675 struct inode *inode;
665 int error; 676 int error;
666 677
667 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
668 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
669 inode = dentry->d_inode; 684 inode = dentry->d_inode;
670 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
671 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
679 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
680 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
681 f->f_pos = 0; 696 f->f_pos = 0;
682 f->f_op = fops_get(inode->i_fop);
683 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
684 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
685 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
686 if (error) 707 if (error)
687 goto cleanup_all; 708 goto cleanup_all;
@@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
693 if (error) 714 if (error)
694 goto cleanup_all; 715 goto cleanup_all;
695 } 716 }
696 ima_counts_get(f); 717 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
718 i_readcount_inc(inode);
697 719
698 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 720 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
699 721
@@ -882,15 +904,110 @@ void fd_install(unsigned int fd, struct file *file)
882 904
883EXPORT_SYMBOL(fd_install); 905EXPORT_SYMBOL(fd_install);
884 906
907static inline int build_open_flags(int flags, int mode, struct open_flags *op)
908{
909 int lookup_flags = 0;
910 int acc_mode;
911
912 if (!(flags & O_CREAT))
913 mode = 0;
914 op->mode = mode;
915
916 /* Must never be set by userspace */
917 flags &= ~FMODE_NONOTIFY;
918
919 /*
920 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
 921         * check for O_DSYNC if they need any syncing at all, we enforce it's
922 * always set instead of having to deal with possibly weird behaviour
923 * for malicious applications setting only __O_SYNC.
924 */
925 if (flags & __O_SYNC)
926 flags |= O_DSYNC;
927
928 /*
 929         * If we have O_PATH in the open flag, then we
930 * cannot have anything other than the below set of flags
931 */
932 if (flags & O_PATH) {
933 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
934 acc_mode = 0;
935 } else {
936 acc_mode = MAY_OPEN | ACC_MODE(flags);
937 }
938
939 op->open_flag = flags;
940
941 /* O_TRUNC implies we need access checks for write permissions */
942 if (flags & O_TRUNC)
943 acc_mode |= MAY_WRITE;
944
945 /* Allow the LSM permission hook to distinguish append
946 access from general write access. */
947 if (flags & O_APPEND)
948 acc_mode |= MAY_APPEND;
949
950 op->acc_mode = acc_mode;
951
952 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
953
954 if (flags & O_CREAT) {
955 op->intent |= LOOKUP_CREATE;
956 if (flags & O_EXCL)
957 op->intent |= LOOKUP_EXCL;
958 }
959
960 if (flags & O_DIRECTORY)
961 lookup_flags |= LOOKUP_DIRECTORY;
962 if (!(flags & O_NOFOLLOW))
963 lookup_flags |= LOOKUP_FOLLOW;
964 return lookup_flags;
965}
966
967/**
968 * filp_open - open file and return file pointer
969 *
970 * @filename: path to open
971 * @flags: open flags as per the open(2) second argument
972 * @mode: mode for the new file if O_CREAT is set, else ignored
973 *
974 * This is the helper to open a file from kernelspace if you really
 975 * have to. But in general you should not do this, so please move
976 * along, nothing to see here..
977 */
978struct file *filp_open(const char *filename, int flags, int mode)
979{
980 struct open_flags op;
981 int lookup = build_open_flags(flags, mode, &op);
982 return do_filp_open(AT_FDCWD, filename, &op, lookup);
983}
984EXPORT_SYMBOL(filp_open);
985
986struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
987 const char *filename, int flags)
988{
989 struct open_flags op;
990 int lookup = build_open_flags(flags, 0, &op);
991 if (flags & O_CREAT)
992 return ERR_PTR(-EINVAL);
993 if (!filename && (flags & O_DIRECTORY))
994 if (!dentry->d_inode->i_op->lookup)
995 return ERR_PTR(-ENOTDIR);
996 return do_file_open_root(dentry, mnt, filename, &op, lookup);
997}
998EXPORT_SYMBOL(file_open_root);
999
885long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 1000long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
886{ 1001{
1002 struct open_flags op;
1003 int lookup = build_open_flags(flags, mode, &op);
887 char *tmp = getname(filename); 1004 char *tmp = getname(filename);
888 int fd = PTR_ERR(tmp); 1005 int fd = PTR_ERR(tmp);
889 1006
890 if (!IS_ERR(tmp)) { 1007 if (!IS_ERR(tmp)) {
891 fd = get_unused_fd_flags(flags); 1008 fd = get_unused_fd_flags(flags);
892 if (fd >= 0) { 1009 if (fd >= 0) {
893 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1010 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
894 if (IS_ERR(f)) { 1011 if (IS_ERR(f)) {
895 put_unused_fd(fd); 1012 put_unused_fd(fd);
896 fd = PTR_ERR(f); 1013 fd = PTR_ERR(f);
@@ -960,8 +1077,10 @@ int filp_close(struct file *filp, fl_owner_t id)
960 if (filp->f_op && filp->f_op->flush) 1077 if (filp->f_op && filp->f_op->flush)
961 retval = filp->f_op->flush(filp, id); 1078 retval = filp->f_op->flush(filp, id);
962 1079
963 dnotify_flush(filp, id); 1080 if (likely(!(filp->f_mode & FMODE_PATH))) {
964 locks_remove_posix(filp, id); 1081 dnotify_flush(filp, id);
1082 locks_remove_posix(filp, id);
1083 }
965 fput(filp); 1084 fput(filp);
966 return retval; 1085 return retval;
967} 1086}
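The open() changes above introduce O_PATH descriptors: build_open_flags() strips everything except O_DIRECTORY/O_NOFOLLOW, the file gets FMODE_PATH and an empty file_operations, and filp_close() skips dnotify/locks for it. A userspace sketch of what such a descriptor is for, assuming a kernel with these patches; O_PATH may be missing from older libc headers, so it is defined by hand here (the asm-generic value is shown, and it differs on a few architectures):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/stat.h>
    #include <stdio.h>

    #ifndef O_PATH
    #define O_PATH 010000000        /* asm-generic value; arch-specific elsewhere */
    #endif

    int main(void)
    {
            struct stat st;
            int dfd = open("/etc", O_PATH | O_DIRECTORY);   /* no read access needed */

            if (dfd < 0)
                    return 1;
            /* the fd cannot be read or written, but works as a *at() anchor */
            if (fstatat(dfd, "hostname", &st, 0) == 0)
                    printf("size=%lld\n", (long long)st.st_size);
            return 0;
    }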
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..b10e3540d5b7 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
251 } 251 }
252 252
253 vm->vblk_size = get_unaligned_be32(data + 0x08); 253 vm->vblk_size = get_unaligned_be32(data + 0x08);
254 if (vm->vblk_size == 0) {
255 ldm_error ("Illegal VBLK size");
256 return false;
257 }
258
254 vm->vblk_offset = get_unaligned_be32(data + 0x0C); 259 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
255 vm->last_vblk_seq = get_unaligned_be32(data + 0x04); 260 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
256 261
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..764b86a01965 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 18
14
13int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
14{ 16{
15 int i; 17 int i;
16 int slot = 1; 18 int slot = 1;
19 unsigned int npartitions;
17 Sector sect; 20 Sector sect;
18 unsigned char *data; 21 unsigned char *data;
19 struct disklabel { 22 struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
45 u8 p_fstype; 48 u8 p_fstype;
46 u8 p_frag; 49 u8 p_frag;
47 __le16 p_cpg; 50 __le16 p_cpg;
48 } d_partitions[8]; 51 } d_partitions[MAX_OSF_PARTITIONS];
49 } * label; 52 } * label;
50 struct d_partition * partition; 53 struct d_partition * partition;
51 54
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
63 put_dev_sector(sect); 66 put_dev_sector(sect);
64 return 0; 67 return 0;
65 } 68 }
66 for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { 69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
67 if (slot == state->limit) 75 if (slot == state->limit)
68 break; 76 break;
69 if (le32_to_cpu(partition->p_size)) 77 if (le32_to_cpu(partition->p_size))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..d49c4b5d2c3e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = {
2620 &proc_self_inode_operations, NULL, {}), 2620 &proc_self_inode_operations, NULL, {}),
2621}; 2621};
2622 2622
2623/*
2624 * Exceptional case: normally we are not allowed to unhash a busy
2625 * directory. In this case, however, we can do it - no aliasing problems
2626 * due to the way we treat inodes.
2627 */
2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2629{
2630 struct inode *inode;
2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2638 if (task) {
2639 put_task_struct(task);
2640 return 1;
2641 }
2642 d_drop(dentry);
2643 return 0;
2644}
2645
2646static const struct dentry_operations proc_base_dentry_operations =
2647{
2648 .d_revalidate = proc_base_revalidate,
2649 .d_delete = pid_delete_dentry,
2650};
2651
2652static struct dentry *proc_base_instantiate(struct inode *dir, 2623static struct dentry *proc_base_instantiate(struct inode *dir,
2653 struct dentry *dentry, struct task_struct *task, const void *ptr) 2624 struct dentry *dentry, struct task_struct *task, const void *ptr)
2654{ 2625{
@@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2685 if (p->fop) 2656 if (p->fop)
2686 inode->i_fop = p->fop; 2657 inode->i_fop = p->fop;
2687 ei->op = p->op; 2658 ei->op = p->op;
2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2689 d_add(dentry, inode); 2659 d_add(dentry, inode);
2690 error = NULL; 2660 error = NULL;
2691out: 2661out:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d6a7ca1fdac5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
27static void proc_evict_inode(struct inode *inode) 27static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head;
30 31
31 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
32 end_writeback(inode); 33 end_writeback(inode);
@@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode)
38 de = PROC_I(inode)->pde; 39 de = PROC_I(inode)->pde;
39 if (de) 40 if (de)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 head = PROC_I(inode)->sysctl;
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 if (head) {
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head);
46 }
43} 47}
44 48
45struct vfsmount *proc_mnt; 49struct vfsmount *proc_mnt;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
233 return; 233 return;
234 root = of_find_node_by_path("/"); 234 root = of_find_node_by_path("/");
235 if (root == NULL) { 235 if (root == NULL) {
236 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 236 pr_debug("/proc/device-tree: can't find root\n");
237 return; 237 return;
238 } 238 }
239 proc_device_tree_add_node(root, proc_device_tree); 239 proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..f50133c11c24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
32 ei->sysctl_entry = table; 32 ei->sysctl_entry = table;
33 33
34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
35 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
36 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
37 if (!table->child) { 36 if (!table->child) {
38 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
@@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent,
408 const struct dentry *dentry, const struct inode *inode, 407 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name) 408 unsigned int len, const char *str, const struct qstr *name)
410{ 409{
410 struct ctl_table_header *head;
411 /* Although proc doesn't have negative dentries, rcu-walk means 411 /* Although proc doesn't have negative dentries, rcu-walk means
412 * that inode here can be NULL */ 412 * that inode here can be NULL */
413 /* AV: can it, indeed? */
413 if (!inode) 414 if (!inode)
414 return 0; 415 return 1;
415 if (name->len != len) 416 if (name->len != len)
416 return 1; 417 return 1;
417 if (memcmp(name->name, str, len)) 418 if (memcmp(name->name, str, len))
418 return 1; 419 return 1;
419 return !sysctl_is_seen(PROC_I(inode)->sysctl); 420 head = rcu_dereference(PROC_I(inode)->sysctl);
421 return !head || !sysctl_is_seen(head);
420} 422}
421 423
422static const struct dentry_operations proc_sys_dentry_operations = { 424static const struct dentry_operations proc_sys_dentry_operations = {
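Taken together, the proc_evict_inode() and proc_sys_compare() hunks retract the sysctl head pointer before it is dropped and make the rcu-walk reader sample it once and tolerate NULL. A generic sketch of that publish/retract pattern; the "foo" types and helpers are made up:

    #include <linux/rcupdate.h>

    struct foo_head;
    extern void foo_head_put(struct foo_head *head);
    extern int foo_head_is_seen(struct foo_head *head);

    struct foo_obj {
            struct foo_head *head;
    };

    static void foo_evict(struct foo_obj *obj)
    {
            struct foo_head *head = obj->head;

            rcu_assign_pointer(obj->head, NULL);    /* retract before the final put */
            if (head)
                    foo_head_put(head);
    }

    /* reader, called under rcu_read_lock() (rcu-walk) */
    static int foo_still_visible(struct foo_obj *obj)
    {
            struct foo_head *head = rcu_dereference(obj->head);

            return head && foo_head_is_seen(head);
    }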
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 000000000000..867d0ac026ce
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
1config PSTORE
  2	bool "Persistent store support"
3 default n
4 help
5 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can
7 be mounted as /dev/pstore. Only useful if you have
8 a platform level driver that registers with pstore to
9 provide the data, so you probably should just go say "Y"
10 (or "M") to a platform specific persistent store driver
11 (e.g. ACPI_APEI on X86) which will select this for you.
12 If you don't have a platform persistent store driver,
13 say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 000000000000..760f4bce7d1d
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux pstorefs routines.
3#
4
5obj-y += pstore.o
6
7pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 000000000000..549d245d0b42
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,285 @@
1/*
2 * Persistent Storage - ramfs parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/fsnotify.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/mount.h>
29#include <linux/ramfs.h>
30#include <linux/sched.h>
31#include <linux/magic.h>
32#include <linux/pstore.h>
33#include <linux/slab.h>
34#include <linux/uaccess.h>
35
36#include "internal.h"
37
38#define PSTORE_NAMELEN 64
39
40struct pstore_private {
41 u64 id;
42 int (*erase)(u64);
43};
44
45#define pstore_get_inode ramfs_get_inode
46
47/*
48 * When a file is unlinked from our file system we call the
49 * platform driver to erase the record from persistent store.
50 */
51static int pstore_unlink(struct inode *dir, struct dentry *dentry)
52{
53 struct pstore_private *p = dentry->d_inode->i_private;
54
55 p->erase(p->id);
56 kfree(p);
57
58 return simple_unlink(dir, dentry);
59}
60
61static const struct inode_operations pstore_dir_inode_operations = {
62 .lookup = simple_lookup,
63 .unlink = pstore_unlink,
64};
65
66static const struct super_operations pstore_ops = {
67 .statfs = simple_statfs,
68 .drop_inode = generic_delete_inode,
69 .show_options = generic_show_options,
70};
71
72static struct super_block *pstore_sb;
73static struct vfsmount *pstore_mnt;
74
75int pstore_is_mounted(void)
76{
77 return pstore_mnt != NULL;
78}
79
80/*
81 * Set up a file structure as if we had opened this file and
82 * write our data to it.
83 */
84static int pstore_writefile(struct inode *inode, struct dentry *dentry,
85 char *data, size_t size)
86{
87 struct file f;
88 ssize_t n;
89 mm_segment_t old_fs = get_fs();
90
 91	memset(&f, 0, sizeof f);	/* zero the struct; '0' would fill it with 0x30 bytes */
92 f.f_mapping = inode->i_mapping;
93 f.f_path.dentry = dentry;
94 f.f_path.mnt = pstore_mnt;
95 f.f_pos = 0;
96 f.f_op = inode->i_fop;
97 set_fs(KERNEL_DS);
98 n = do_sync_write(&f, data, size, &f.f_pos);
99 set_fs(old_fs);
100
101 fsnotify_modify(&f);
102
103 return n == size;
104}
105
106/*
107 * Make a regular file in the root directory of our file system.
108 * Load it up with "size" bytes of data from "buf".
109 * Set the mtime & ctime to the date that this record was originally stored.
110 */
111int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
112 char *data, size_t size,
113 struct timespec time, int (*erase)(u64))
114{
115 struct dentry *root = pstore_sb->s_root;
116 struct dentry *dentry;
117 struct inode *inode;
118 int rc;
119 char name[PSTORE_NAMELEN];
120 struct pstore_private *private;
121
122 rc = -ENOMEM;
123 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
124 if (!inode)
125 goto fail;
126 inode->i_uid = inode->i_gid = 0;
127 private = kmalloc(sizeof *private, GFP_KERNEL);
128 if (!private)
129 goto fail_alloc;
130 private->id = id;
131 private->erase = erase;
132
133 switch (type) {
134 case PSTORE_TYPE_DMESG:
135 sprintf(name, "dmesg-%s-%lld", psname, id);
136 break;
137 case PSTORE_TYPE_MCE:
138 sprintf(name, "mce-%s-%lld", psname, id);
139 break;
140 case PSTORE_TYPE_UNKNOWN:
141 sprintf(name, "unknown-%s-%lld", psname, id);
142 break;
143 default:
144 sprintf(name, "type%d-%s-%lld", type, psname, id);
145 break;
146 }
147
148 mutex_lock(&root->d_inode->i_mutex);
149
150 rc = -ENOSPC;
151 dentry = d_alloc_name(root, name);
152 if (IS_ERR(dentry))
153 goto fail_lockedalloc;
154
155 d_add(dentry, inode);
156
157 mutex_unlock(&root->d_inode->i_mutex);
158
159 if (!pstore_writefile(inode, dentry, data, size))
160 goto fail_write;
161
162 inode->i_private = private;
163
164 if (time.tv_sec)
165 inode->i_mtime = inode->i_ctime = time;
166
167 return 0;
168
169fail_write:
170 kfree(private);
171 inode->i_nlink--;
172 mutex_lock(&root->d_inode->i_mutex);
173 d_delete(dentry);
174 dput(dentry);
175 mutex_unlock(&root->d_inode->i_mutex);
176 goto fail;
177
178fail_lockedalloc:
179 mutex_unlock(&root->d_inode->i_mutex);
180 kfree(private);
181fail_alloc:
182 iput(inode);
183
184fail:
185 return rc;
186}
187
188int pstore_fill_super(struct super_block *sb, void *data, int silent)
189{
190 struct inode *inode = NULL;
191 struct dentry *root;
192 int err;
193
194 save_mount_options(sb, data);
195
196 pstore_sb = sb;
197
198 sb->s_maxbytes = MAX_LFS_FILESIZE;
199 sb->s_blocksize = PAGE_CACHE_SIZE;
200 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
201 sb->s_magic = PSTOREFS_MAGIC;
202 sb->s_op = &pstore_ops;
203 sb->s_time_gran = 1;
204
205 inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
206 if (!inode) {
207 err = -ENOMEM;
208 goto fail;
209 }
210 /* override ramfs "dir" options so we catch unlink(2) */
211 inode->i_op = &pstore_dir_inode_operations;
212
213 root = d_alloc_root(inode);
214 sb->s_root = root;
215 if (!root) {
216 err = -ENOMEM;
217 goto fail;
218 }
219
220 pstore_get_records();
221
222 return 0;
223fail:
224 iput(inode);
225 return err;
226}
227
228static int pstore_get_sb(struct file_system_type *fs_type,
229 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
230{
231 struct dentry *root;
232
233 root = mount_nodev(fs_type, flags, data, pstore_fill_super);
234 if (IS_ERR(root))
235 return -ENOMEM;
236
237 mnt->mnt_root = root;
238 mnt->mnt_sb = root->d_sb;
239 pstore_mnt = mnt;
240
241 return 0;
242}
243
244static void pstore_kill_sb(struct super_block *sb)
245{
246 kill_litter_super(sb);
247 pstore_sb = NULL;
248 pstore_mnt = NULL;
249}
250
251static struct file_system_type pstore_fs_type = {
252 .name = "pstore",
253 .get_sb = pstore_get_sb,
254 .kill_sb = pstore_kill_sb,
255};
256
257static int __init init_pstore_fs(void)
258{
259 int rc = 0;
260 struct kobject *pstorefs_kobj;
261
262 pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
263 if (!pstorefs_kobj) {
264 rc = -ENOMEM;
265 goto done;
266 }
267
268 rc = sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
269 if (rc)
270 goto done1;
271
272 rc = register_filesystem(&pstore_fs_type);
273 if (rc == 0)
274 goto done;
275
276 sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
277done1:
278 kobject_put(pstorefs_kobj);
279done:
280 return rc;
281}
282module_init(init_pstore_fs)
283
284MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
285MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 000000000000..76c26d2fab29
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,7 @@
1extern void pstore_get_records(void);
2extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
3 char *data, size_t size,
4 struct timespec time, int (*erase)(u64));
5extern int pstore_is_mounted(void);
6
7extern struct kobj_attribute pstore_kmsg_bytes_attr;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 000000000000..705fdf8abf6e
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,202 @@
1/*
2 * Persistent Storage - platform driver interface parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/atomic.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kmsg_dump.h>
25#include <linux/module.h>
26#include <linux/pstore.h>
27#include <linux/string.h>
28#include <linux/slab.h>
29#include <linux/uaccess.h>
30
31#include "internal.h"
32
33/*
34 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register()
36 */
37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo;
39
40/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */
41static unsigned long kmsg_bytes = 10240;
42
43static ssize_t b_show(struct kobject *kobj,
44 struct kobj_attribute *attr, char *buf)
45{
46 return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes);
47}
48
49static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
50 const char *buf, size_t count)
51{
52 return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : 0;
53}
54
55struct kobj_attribute pstore_kmsg_bytes_attr =
56 __ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store);
57
58/* Tag each group of saved records with a sequence number */
59static int oopscount;
60
61/*
62 * callback from kmsg_dump. (s2,l2) has the most recently
63 * written bytes, older bytes are in (s1,l1). Save as much
64 * as we can from the end of the buffer.
65 */
66static void pstore_dump(struct kmsg_dumper *dumper,
67 enum kmsg_dump_reason reason,
68 const char *s1, unsigned long l1,
69 const char *s2, unsigned long l2)
70{
71 unsigned long s1_start, s2_start;
72 unsigned long l1_cpy, l2_cpy;
73 unsigned long size, total = 0;
74 char *dst;
75 u64 id;
76 int hsize, part = 1;
77
78 mutex_lock(&psinfo->buf_mutex);
79 oopscount++;
80 while (total < kmsg_bytes) {
81 dst = psinfo->buf;
82 hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++);
83 size = psinfo->bufsize - hsize;
84 dst += hsize;
85
86 l2_cpy = min(l2, size);
87 l1_cpy = min(l1, size - l2_cpy);
88
89 if (l1_cpy + l2_cpy == 0)
90 break;
91
92 s2_start = l2 - l2_cpy;
93 s1_start = l1 - l1_cpy;
94
95 memcpy(dst, s1 + s1_start, l1_cpy);
96 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
97
98 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
99 if (pstore_is_mounted())
100 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
101 psinfo->buf, hsize + l1_cpy + l2_cpy,
102 CURRENT_TIME, psinfo->erase);
103 l1 -= l1_cpy;
104 l2 -= l2_cpy;
105 total += l1_cpy + l2_cpy;
106 }
107 mutex_unlock(&psinfo->buf_mutex);
108}
109
110static struct kmsg_dumper pstore_dumper = {
111 .dump = pstore_dump,
112};
113
114/*
115 * platform specific persistent storage driver registers with
116 * us here. If pstore is already mounted, call the platform
117 * read function right away to populate the file system. If not
118 * then the pstore mount code will call us later to fill out
119 * the file system.
120 *
121 * Register with kmsg_dump to save last part of console log on panic.
122 */
123int pstore_register(struct pstore_info *psi)
124{
125 struct module *owner = psi->owner;
126
127 spin_lock(&pstore_lock);
128 if (psinfo) {
129 spin_unlock(&pstore_lock);
130 return -EBUSY;
131 }
132 psinfo = psi;
133 spin_unlock(&pstore_lock);
134
135 if (owner && !try_module_get(owner)) {
136 psinfo = NULL;
137 return -EINVAL;
138 }
139
140 if (pstore_is_mounted())
141 pstore_get_records();
142
143 kmsg_dump_register(&pstore_dumper);
144
145 return 0;
146}
147EXPORT_SYMBOL_GPL(pstore_register);
148
149/*
150 * Read all the records from the persistent store. Create and
151 * file files in our filesystem.
152 */
153void pstore_get_records(void)
154{
155 struct pstore_info *psi = psinfo;
156 size_t size;
157 u64 id;
158 enum pstore_type_id type;
159 struct timespec time;
160 int failed = 0;
161
162 if (!psi)
163 return;
164
165 mutex_lock(&psinfo->buf_mutex);
166 while ((size = psi->read(&id, &type, &time)) > 0) {
167 if (pstore_mkfile(type, psi->name, id, psi->buf, size,
168 time, psi->erase))
169 failed++;
170 }
171 mutex_unlock(&psinfo->buf_mutex);
172
173 if (failed)
174 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
175 failed, psi->name);
176}
177
178/*
179 * Call platform driver to write a record to the
180 * persistent store.
181 */
182int pstore_write(enum pstore_type_id type, char *buf, size_t size)
183{
184 u64 id;
185
186 if (!psinfo)
187 return -ENODEV;
188
189 if (size > psinfo->bufsize)
190 return -EFBIG;
191
192 mutex_lock(&psinfo->buf_mutex);
193 memcpy(psinfo->buf, buf, size);
194 id = psinfo->write(type, size);
195 if (pstore_is_mounted())
196 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
197 size, CURRENT_TIME, psinfo->erase);
198 mutex_unlock(&psinfo->buf_mutex);
199
200 return 0;
201}
202EXPORT_SYMBOL_GPL(pstore_write);
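For orientation, here is what the driver side of this interface looks like. The sketch below is hypothetical and stores nothing; the struct pstore_info layout is inferred from the calls above (owner, name, buf, bufsize, buf_mutex, read, write, erase) and should be checked against <linux/pstore.h>:

    #include <linux/module.h>
    #include <linux/mutex.h>
    #include <linux/pstore.h>

    static char example_buf[1024];

    static size_t example_read(u64 *id, enum pstore_type_id *type,
                               struct timespec *time)
    {
            return 0;                               /* no stored records */
    }

    static u64 example_write(enum pstore_type_id type, size_t size)
    {
            /* example_info.buf already holds 'size' bytes to persist */
            return 1;                               /* record id */
    }

    static int example_erase(u64 id)
    {
            return 0;
    }

    static struct pstore_info example_info = {
            .owner   = THIS_MODULE,
            .name    = "example",
            .buf     = example_buf,
            .bufsize = sizeof(example_buf),
            .read    = example_read,
            .write   = example_write,
            .erase   = example_erase,
    };

    static int __init example_init(void)
    {
            mutex_init(&example_info.buf_mutex);
            return pstore_register(&example_info);
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");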
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
112 if (!info->dqi_priv) { 112 if (!info->dqi_priv) {
113 printk(KERN_WARNING 113 printk(KERN_WARNING
114 "Not enough memory for quota information structure.\n"); 114 "Not enough memory for quota information structure.\n");
115 return -1; 115 return -ENOMEM;
116 } 116 }
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..1bba24bad820 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e6990..c77514bd5776 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2876 reiserfs_mounted_fs_count++; 2876 reiserfs_mounted_fs_count++;
2877 if (reiserfs_mounted_fs_count <= 1) { 2877 if (reiserfs_mounted_fs_count <= 1) {
2878 reiserfs_write_unlock(sb); 2878 reiserfs_write_unlock(sb);
2879 commit_wq = create_workqueue("reiserfs"); 2879 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880 reiserfs_write_lock(sb); 2880 reiserfs_write_lock(sb);
2881 } 2881 }
2882 2882
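Context for the one-line change above: alloc_workqueue() with WQ_MEM_RECLAIM gives the queue a rescuer thread, so journal commit work can still make progress under memory pressure. Illustrative usage with a made-up name:

    static struct workqueue_struct *foo_commit_wq;

    static int foo_start_commit_thread(void)
    {
            foo_commit_wq = alloc_workqueue("foo_commit", WQ_MEM_RECLAIM, 0);
            return foo_commit_wq ? 0 : -ENOMEM;     /* was create_workqueue("foo") */
    }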
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
593 new_inode_init(inode, dir, mode); 593 new_inode_init(inode, dir, mode);
594 594
595 jbegin_count += reiserfs_cache_default_acl(dir); 595 jbegin_count += reiserfs_cache_default_acl(dir);
596 retval = reiserfs_security_init(dir, inode, &security); 596 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
597 if (retval < 0) { 597 if (retval < 0) {
598 drop_new_inode(inode); 598 drop_new_inode(inode);
599 return retval; 599 return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
667 new_inode_init(inode, dir, mode); 667 new_inode_init(inode, dir, mode);
668 668
669 jbegin_count += reiserfs_cache_default_acl(dir); 669 jbegin_count += reiserfs_cache_default_acl(dir);
670 retval = reiserfs_security_init(dir, inode, &security); 670 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
671 if (retval < 0) { 671 if (retval < 0) {
672 drop_new_inode(inode); 672 drop_new_inode(inode);
673 return retval; 673 return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
747 new_inode_init(inode, dir, mode); 747 new_inode_init(inode, dir, mode);
748 748
749 jbegin_count += reiserfs_cache_default_acl(dir); 749 jbegin_count += reiserfs_cache_default_acl(dir);
750 retval = reiserfs_security_init(dir, inode, &security); 750 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
751 if (retval < 0) { 751 if (retval < 0) {
752 drop_new_inode(inode); 752 drop_new_inode(inode);
753 return retval; 753 return retval;
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
772 dentry, inode, &security); 772 dentry, inode, &security);
773 if (retval) { 773 if (retval) {
774 dir->i_nlink--; 774 DEC_DIR_INODE_NLINK(dir)
775 goto out_failed; 775 goto out_failed;
776 } 776 }
777 777
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1032 } 1032 }
1033 new_inode_init(inode, parent_dir, mode); 1033 new_inode_init(inode, parent_dir, mode);
1034 1034
1035 retval = reiserfs_security_init(parent_dir, inode, &security); 1035 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1036 &security);
1036 if (retval < 0) { 1037 if (retval < 0) {
1037 drop_new_inode(inode); 1038 drop_new_inode(inode);
1038 return retval; 1039 return retval;
@@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1123 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1124 return -EMLINK;
1124 } 1125 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1126
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1127 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1128 inc_nlink(inode);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..5c11ca82b782 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
978 978
979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
983 return -EPERM; 981 return -EPERM;
984} 982}
985 983
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c6..ef66c18a9332 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
54 * of blocks needed for the transaction. If successful, reiserfs_security 54 * of blocks needed for the transaction. If successful, reiserfs_security
55 * must be released using reiserfs_security_free when the caller is done. */ 55 * must be released using reiserfs_security_free when the caller is done. */
56int reiserfs_security_init(struct inode *dir, struct inode *inode, 56int reiserfs_security_init(struct inode *dir, struct inode *inode,
57 const struct qstr *qstr,
57 struct reiserfs_security_handle *sec) 58 struct reiserfs_security_handle *sec)
58{ 59{
59 int blocks = 0; 60 int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
65 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
66 return 0; 67 return 0;
67 68
68 error = security_inode_init_security(inode, dir, &sec->name, 69 error = security_inode_init_security(inode, dir, qstr, &sec->name,
69 &sec->value, &sec->length); 70 &sec->value, &sec->length);
70 if (error) { 71 if (error) {
71 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
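The AT_EMPTY_PATH additions above let fstatat()/fchownat() operate on the dfd itself when the pathname is empty. A userspace sketch, assuming a kernel with these patches; AT_EMPTY_PATH may need defining by hand on older headers (0x1000 is the generic value):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/stat.h>

    #ifndef AT_EMPTY_PATH
    #define AT_EMPTY_PATH 0x1000
    #endif

    /* stat the object 'fd' refers to, even if it was opened with O_PATH */
    int stat_by_fd(int fd, struct stat *st)
    {
            return fstatat(fd, "", st, AT_EMPTY_PATH);
    }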
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
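With do_statfs_native() and do_statfs64() now taking a struct kstatfs, the statfs syscalls split into a "gather" step (user_statfs()/fd_statfs(), introduced earlier in this series) and a "convert and copy to user space" step. A minimal sketch of consuming the gather half from kernel code, assuming only the fd_statfs() signature visible in the hunks above; the helper name example_report_free_bytes() is hypothetical:

	/* sketch: use fd_statfs() entirely in kernel space - no struct statfs
	 * and no __user pointer involved (replacing the open-coded
	 * fget() + vfs_statfs() path removed above) */
	static int example_report_free_bytes(unsigned int fd, u64 *bytes)
	{
		struct kstatfs st;
		int err = fd_statfs(fd, &st);

		if (err)
			return err;
		*bytes = (u64)st.f_bfree * st.f_bsize;	/* free space in bytes */
		return 0;
	}
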
diff --git a/fs/super.c b/fs/super.c
index 7e9dd4cc2c01..4bae0ef6110e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -843,23 +843,6 @@ error:
843} 843}
844EXPORT_SYMBOL(mount_bdev); 844EXPORT_SYMBOL(mount_bdev);
845 845
846int get_sb_bdev(struct file_system_type *fs_type,
847 int flags, const char *dev_name, void *data,
848 int (*fill_super)(struct super_block *, void *, int),
849 struct vfsmount *mnt)
850{
851 struct dentry *root;
852
853 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
854 if (IS_ERR(root))
855 return PTR_ERR(root);
856 mnt->mnt_root = root;
857 mnt->mnt_sb = root->d_sb;
858 return 0;
859}
860
861EXPORT_SYMBOL(get_sb_bdev);
862
863void kill_block_super(struct super_block *sb) 846void kill_block_super(struct super_block *sb)
864{ 847{
865 struct block_device *bdev = sb->s_bdev; 848 struct block_device *bdev = sb->s_bdev;
@@ -897,22 +880,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
897} 880}
898EXPORT_SYMBOL(mount_nodev); 881EXPORT_SYMBOL(mount_nodev);
899 882
900int get_sb_nodev(struct file_system_type *fs_type,
901 int flags, void *data,
902 int (*fill_super)(struct super_block *, void *, int),
903 struct vfsmount *mnt)
904{
905 struct dentry *root;
906
907 root = mount_nodev(fs_type, flags, data, fill_super);
908 if (IS_ERR(root))
909 return PTR_ERR(root);
910 mnt->mnt_root = root;
911 mnt->mnt_sb = root->d_sb;
912 return 0;
913}
914EXPORT_SYMBOL(get_sb_nodev);
915
916static int compare_single(struct super_block *s, void *p) 883static int compare_single(struct super_block *s, void *p)
917{ 884{
918 return 1; 885 return 1;
@@ -943,22 +910,6 @@ struct dentry *mount_single(struct file_system_type *fs_type,
943} 910}
944EXPORT_SYMBOL(mount_single); 911EXPORT_SYMBOL(mount_single);
945 912
946int get_sb_single(struct file_system_type *fs_type,
947 int flags, void *data,
948 int (*fill_super)(struct super_block *, void *, int),
949 struct vfsmount *mnt)
950{
951 struct dentry *root;
952 root = mount_single(fs_type, flags, data, fill_super);
953 if (IS_ERR(root))
954 return PTR_ERR(root);
955 mnt->mnt_root = root;
956 mnt->mnt_sb = root->d_sb;
957 return 0;
958}
959
960EXPORT_SYMBOL(get_sb_single);
961
962struct vfsmount * 913struct vfsmount *
963vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) 914vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
964{ 915{
@@ -988,19 +939,13 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
988 goto out_free_secdata; 939 goto out_free_secdata;
989 } 940 }
990 941
991 if (type->mount) { 942 root = type->mount(type, flags, name, data);
992 root = type->mount(type, flags, name, data); 943 if (IS_ERR(root)) {
993 if (IS_ERR(root)) { 944 error = PTR_ERR(root);
994 error = PTR_ERR(root); 945 goto out_free_secdata;
995 goto out_free_secdata;
996 }
997 mnt->mnt_root = root;
998 mnt->mnt_sb = root->d_sb;
999 } else {
1000 error = type->get_sb(type, flags, name, data, mnt);
1001 if (error < 0)
1002 goto out_free_secdata;
1003 } 946 }
947 mnt->mnt_root = root;
948 mnt->mnt_sb = root->d_sb;
1004 BUG_ON(!mnt->mnt_sb); 949 BUG_ON(!mnt->mnt_sb);
1005 WARN_ON(!mnt->mnt_sb->s_bdi); 950 WARN_ON(!mnt->mnt_sb->s_bdi);
1006 mnt->mnt_sb->s_flags |= MS_BORN; 951 mnt->mnt_sb->s_flags |= MS_BORN;
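With get_sb_bdev()/get_sb_nodev()/get_sb_single() removed and vfs_kern_mount() calling ->mount() unconditionally, a filesystem supplies a ->mount() method that returns the root dentry and lets the VFS fill in mnt_root and mnt_sb itself. A sketch of the resulting registration shape; the examplefs_* names, including examplefs_fill_super(), are purely illustrative:

	/* sketch: block-device filesystem after the ->get_sb() removal */
	static struct dentry *examplefs_mount(struct file_system_type *fs_type,
			int flags, const char *dev_name, void *data)
	{
		/* mount_bdev() returns the root dentry (or an ERR_PTR) directly */
		return mount_bdev(fs_type, flags, dev_name, data, examplefs_fill_super);
	}

	static struct file_system_type examplefs_type = {
		.owner		= THIS_MODULE,
		.name		= "examplefs",
		.mount		= examplefs_mount,
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};
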
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
245 new_de = sysv_find_entry(new_dentry, &new_page); 245 new_de = sysv_find_entry(new_dentry, &new_page);
246 if (!new_de) 246 if (!new_de)
247 goto out_dir; 247 goto out_dir;
248 inode_inc_link_count(old_inode);
249 sysv_set_link(new_de, new_page, old_inode); 248 sysv_set_link(new_de, new_page, old_inode);
250 new_inode->i_ctime = CURRENT_TIME_SEC; 249 new_inode->i_ctime = CURRENT_TIME_SEC;
251 if (dir_de) 250 if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
257 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) 256 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
258 goto out_dir; 257 goto out_dir;
259 } 258 }
260 inode_inc_link_count(old_inode);
261 err = sysv_add_link(new_dentry, old_inode); 259 err = sysv_add_link(new_dentry, old_inode);
262 if (err) { 260 if (err)
263 inode_dec_link_count(old_inode);
264 goto out_dir; 261 goto out_dir;
265 }
266 if (dir_de) 262 if (dir_de)
267 inode_inc_link_count(new_dir); 263 inode_inc_link_count(new_dir);
268 } 264 }
269 265
270 sysv_delete_entry(old_de, old_page); 266 sysv_delete_entry(old_de, old_page);
271 inode_dec_link_count(old_inode); 267 mark_inode_dirty(old_inode);
272 268
273 if (dir_de) { 269 if (dir_de) {
274 sysv_set_link(dir_de, dir_page, new_dir); 270 sysv_set_link(dir_de, dir_page, new_dir);
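The sysv rename path (and the matching ufs change further down) drops the speculative inode_inc_link_count()/inode_dec_link_count() pair around installing the new directory entry: a rename moves a link, so old_inode's count is unchanged and only needs to be marked dirty for the ctime update. In outline, per the hunk above:

	/* tail of ->rename() after the simplification */
	sysv_delete_entry(old_de, old_page);	/* remove the old name */
	mark_inode_dirty(old_inode);		/* ctime update only - i_nlink is unchanged */
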
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
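The comment and local i_nlink check removed here guarded ubifs_link() against racing with the final unlink; dropping them only makes sense if an equivalent guard now runs in the VFS before ->link() is called. The sketch below shows what such a generic check looks like; it is an assumption about where the check moved, not a quote of the actual vfs_link() code:

	/* hypothetical VFS-side guard, taken with the target inode's i_mutex held */
	mutex_lock(&inode->i_mutex);
	if (inode->i_nlink == 0)
		error = -ENOENT;	/* raced with the final unlink: refuse new links */
	else
		error = dir->i_op->link(old_dentry, dir, new_dentry);
	mutex_unlock(&inode->i_mutex);
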
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..8994dd041660 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,7 +31,7 @@
31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
33#define udf_find_next_one_bit(addr, size, offset) \ 33#define udf_find_next_one_bit(addr, size, offset) \
34 ext2_find_next_bit(addr, size, offset) 34 ext2_find_next_bit((unsigned long *)(addr), size, offset)
35 35
36static int read_block_bitmap(struct super_block *sb, 36static int read_block_bitmap(struct super_block *sb,
37 struct udf_bitmap *bitmap, unsigned int block, 37 struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +297,7 @@ repeat:
297 break; 297 break;
298 } 298 }
299 } else { 299 } else {
300 bit = udf_find_next_one_bit((char *)bh->b_data, 300 bit = udf_find_next_one_bit(bh->b_data,
301 sb->s_blocksize << 3, 301 sb->s_blocksize << 3,
302 group_start << 3); 302 group_start << 3);
303 if (bit < sb->s_blocksize << 3) 303 if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..f391a2adc699 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -123,8 +123,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
123 if (inode->i_sb->s_blocksize < 123 if (inode->i_sb->s_blocksize <
124 (udf_file_entry_alloc_offset(inode) + 124 (udf_file_entry_alloc_offset(inode) +
125 pos + count)) { 125 pos + count)) {
126 udf_expand_file_adinicb(inode, pos + count, &err); 126 err = udf_expand_file_adinicb(inode);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 127 if (err) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 128 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem); 129 up_write(&iinfo->i_data_sem);
130 return err; 130 return err;
@@ -237,7 +237,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
237 237
238 if ((attr->ia_valid & ATTR_SIZE) && 238 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 239 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 240 error = udf_setsize(inode, attr->ia_size);
241 if (error) 241 if (error)
242 return error; 242 return error;
243 } 243 }
@@ -249,5 +249,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
249 249
250const struct inode_operations udf_file_inode_operations = { 250const struct inode_operations udf_file_inode_operations = {
251 .setattr = udf_setattr, 251 .setattr = udf_setattr,
252 .truncate = udf_truncate,
253}; 252};
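With .truncate gone from udf_file_inode_operations, size changes are driven entirely from ->setattr() through udf_setsize() (added in the fs/udf/inode.c hunks below). The hunk only shows the ATTR_SIZE branch; the sketch fills in the surrounding lines with the usual inode_change_ok()/setattr_copy() pattern, which should be treated as an assumption:

	static int udf_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			/* grow or shrink via udf_setsize(); no vmtruncate(), no ->truncate() */
			error = udf_setsize(inode, attr->ia_size);
			if (error)
				return error;
		}

		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
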
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..ccc814321414 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
73 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0; 74 int want_delete = 0;
75 75
76 truncate_inode_pages(&inode->i_data, 0);
77
78 if (!inode->i_nlink && !is_bad_inode(inode)) { 76 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1; 77 want_delete = 1;
80 inode->i_size = 0; 78 udf_setsize(inode, 0);
81 udf_truncate(inode);
82 udf_update_inode(inode, IS_SYNC(inode)); 79 udf_update_inode(inode, IS_SYNC(inode));
83 } 80 } else
81 truncate_inode_pages(&inode->i_data, 0);
84 invalidate_inode_buffers(inode); 82 invalidate_inode_buffers(inode);
85 end_writeback(inode); 83 end_writeback(inode);
86 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
117 115
118 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 116 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
119 if (unlikely(ret)) { 117 if (unlikely(ret)) {
120 loff_t isize = mapping->host->i_size; 118 struct inode *inode = mapping->host;
121 if (pos + len > isize) 119 struct udf_inode_info *iinfo = UDF_I(inode);
122 vmtruncate(mapping->host, isize); 120 loff_t isize = inode->i_size;
121
122 if (pos + len > isize) {
123 truncate_pagecache(inode, pos + len, isize);
124 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
125 down_write(&iinfo->i_data_sem);
126 udf_truncate_extents(inode);
127 up_write(&iinfo->i_data_sem);
128 }
129 }
123 } 130 }
124 131
125 return ret; 132 return ret;
@@ -139,30 +146,31 @@ const struct address_space_operations udf_aops = {
139 .bmap = udf_bmap, 146 .bmap = udf_bmap,
140}; 147};
141 148
142void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) 149int udf_expand_file_adinicb(struct inode *inode)
143{ 150{
144 struct page *page; 151 struct page *page;
145 char *kaddr; 152 char *kaddr;
146 struct udf_inode_info *iinfo = UDF_I(inode); 153 struct udf_inode_info *iinfo = UDF_I(inode);
154 int err;
147 struct writeback_control udf_wbc = { 155 struct writeback_control udf_wbc = {
148 .sync_mode = WB_SYNC_NONE, 156 .sync_mode = WB_SYNC_NONE,
149 .nr_to_write = 1, 157 .nr_to_write = 1,
150 }; 158 };
151 159
152 /* from now on we have normal address_space methods */
153 inode->i_data.a_ops = &udf_aops;
154
155 if (!iinfo->i_lenAlloc) { 160 if (!iinfo->i_lenAlloc) {
156 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 161 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
157 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 162 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
158 else 163 else
159 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 164 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
165 /* from now on we have normal address_space methods */
166 inode->i_data.a_ops = &udf_aops;
160 mark_inode_dirty(inode); 167 mark_inode_dirty(inode);
161 return; 168 return 0;
162 } 169 }
163 170
164 page = grab_cache_page(inode->i_mapping, 0); 171 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 BUG_ON(!PageLocked(page)); 172 if (!page)
173 return -ENOMEM;
166 174
167 if (!PageUptodate(page)) { 175 if (!PageUptodate(page)) {
168 kaddr = kmap(page); 176 kaddr = kmap(page);
@@ -181,11 +189,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
181 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 189 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
182 else 190 else
183 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 191 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
184 192 /* from now on we have normal address_space methods */
185 inode->i_data.a_ops->writepage(page, &udf_wbc); 193 inode->i_data.a_ops = &udf_aops;
194 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
195 if (err) {
196 /* Restore everything back so that we don't lose data... */
197 lock_page(page);
198 kaddr = kmap(page);
199 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
200 inode->i_size);
201 kunmap(page);
202 unlock_page(page);
203 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
204 inode->i_data.a_ops = &udf_adinicb_aops;
205 }
186 page_cache_release(page); 206 page_cache_release(page);
187
188 mark_inode_dirty(inode); 207 mark_inode_dirty(inode);
208
209 return err;
189} 210}
190 211
191struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, 212struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +369,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
348} 369}
349 370
350/* Extend the file by 'blocks' blocks, return the number of extents added */ 371/* Extend the file by 'blocks' blocks, return the number of extents added */
351int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 372static int udf_do_extend_file(struct inode *inode,
352 struct kernel_long_ad *last_ext, sector_t blocks) 373 struct extent_position *last_pos,
374 struct kernel_long_ad *last_ext,
375 sector_t blocks)
353{ 376{
354 sector_t add; 377 sector_t add;
355 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 378 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +380,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
357 struct kernel_lb_addr prealloc_loc = {}; 380 struct kernel_lb_addr prealloc_loc = {};
358 int prealloc_len = 0; 381 int prealloc_len = 0;
359 struct udf_inode_info *iinfo; 382 struct udf_inode_info *iinfo;
383 int err;
360 384
361 /* The previous extent is fake and we should not extend by anything 385 /* The previous extent is fake and we should not extend by anything
362 * - there's nothing to do... */ 386 * - there's nothing to do... */
@@ -422,26 +446,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
422 /* Create enough extents to cover the whole hole */ 446 /* Create enough extents to cover the whole hole */
423 while (blocks > add) { 447 while (blocks > add) {
424 blocks -= add; 448 blocks -= add;
425 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 449 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
426 last_ext->extLength, 1) == -1) 450 last_ext->extLength, 1);
427 return -1; 451 if (err)
452 return err;
428 count++; 453 count++;
429 } 454 }
430 if (blocks) { 455 if (blocks) {
431 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 456 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
432 (blocks << sb->s_blocksize_bits); 457 (blocks << sb->s_blocksize_bits);
433 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 458 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
434 last_ext->extLength, 1) == -1) 459 last_ext->extLength, 1);
435 return -1; 460 if (err)
461 return err;
436 count++; 462 count++;
437 } 463 }
438 464
439out: 465out:
440 /* Do we have some preallocated blocks saved? */ 466 /* Do we have some preallocated blocks saved? */
441 if (prealloc_len) { 467 if (prealloc_len) {
442 if (udf_add_aext(inode, last_pos, &prealloc_loc, 468 err = udf_add_aext(inode, last_pos, &prealloc_loc,
443 prealloc_len, 1) == -1) 469 prealloc_len, 1);
444 return -1; 470 if (err)
471 return err;
445 last_ext->extLocation = prealloc_loc; 472 last_ext->extLocation = prealloc_loc;
446 last_ext->extLength = prealloc_len; 473 last_ext->extLength = prealloc_len;
447 count++; 474 count++;
@@ -453,11 +480,68 @@ out:
453 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 480 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
454 last_pos->offset -= sizeof(struct long_ad); 481 last_pos->offset -= sizeof(struct long_ad);
455 else 482 else
456 return -1; 483 return -EIO;
457 484
458 return count; 485 return count;
459} 486}
460 487
488static int udf_extend_file(struct inode *inode, loff_t newsize)
489{
490
491 struct extent_position epos;
492 struct kernel_lb_addr eloc;
493 uint32_t elen;
494 int8_t etype;
495 struct super_block *sb = inode->i_sb;
496 sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
497 int adsize;
498 struct udf_inode_info *iinfo = UDF_I(inode);
499 struct kernel_long_ad extent;
500 int err;
501
502 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
503 adsize = sizeof(struct short_ad);
504 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
505 adsize = sizeof(struct long_ad);
506 else
507 BUG();
508
509 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
510
511 /* File has extent covering the new size (could happen when extending
512 * inside a block)? */
513 if (etype != -1)
514 return 0;
515 if (newsize & (sb->s_blocksize - 1))
516 offset++;
517 /* Extended file just to the boundary of the last file block? */
518 if (offset == 0)
519 return 0;
520
521 /* Truncate is extending the file by 'offset' blocks */
522 if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
523 (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
524 /* File has no extents at all or has empty last
525 * indirect extent! Create a fake extent... */
526 extent.extLocation.logicalBlockNum = 0;
527 extent.extLocation.partitionReferenceNum = 0;
528 extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
529 } else {
530 epos.offset -= adsize;
531 etype = udf_next_aext(inode, &epos, &extent.extLocation,
532 &extent.extLength, 0);
533 extent.extLength |= etype << 30;
534 }
535 err = udf_do_extend_file(inode, &epos, &extent, offset);
536 if (err < 0)
537 goto out;
538 err = 0;
539 iinfo->i_lenExtents = newsize;
540out:
541 brelse(epos.bh);
542 return err;
543}
544
461static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 545static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
462 int *err, sector_t *phys, int *new) 546 int *err, sector_t *phys, int *new)
463{ 547{
@@ -540,7 +624,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
540 elen = EXT_RECORDED_ALLOCATED | 624 elen = EXT_RECORDED_ALLOCATED |
541 ((elen + inode->i_sb->s_blocksize - 1) & 625 ((elen + inode->i_sb->s_blocksize - 1) &
542 ~(inode->i_sb->s_blocksize - 1)); 626 ~(inode->i_sb->s_blocksize - 1));
543 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); 627 udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
544 } 628 }
545 brelse(prev_epos.bh); 629 brelse(prev_epos.bh);
546 brelse(cur_epos.bh); 630 brelse(cur_epos.bh);
@@ -564,19 +648,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
564 memset(&laarr[0].extLocation, 0x00, 648 memset(&laarr[0].extLocation, 0x00,
565 sizeof(struct kernel_lb_addr)); 649 sizeof(struct kernel_lb_addr));
566 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 650 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
567 /* Will udf_extend_file() create real extent from 651 /* Will udf_do_extend_file() create real extent from
568 a fake one? */ 652 a fake one? */
569 startnum = (offset > 0); 653 startnum = (offset > 0);
570 } 654 }
571 /* Create extents for the hole between EOF and offset */ 655 /* Create extents for the hole between EOF and offset */
572 ret = udf_extend_file(inode, &prev_epos, laarr, offset); 656 ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
573 if (ret == -1) { 657 if (ret < 0) {
574 brelse(prev_epos.bh); 658 brelse(prev_epos.bh);
575 brelse(cur_epos.bh); 659 brelse(cur_epos.bh);
576 brelse(next_epos.bh); 660 brelse(next_epos.bh);
577 /* We don't really know the error here so we just make 661 *err = ret;
578 * something up */
579 *err = -ENOSPC;
580 return NULL; 662 return NULL;
581 } 663 }
582 c = 0; 664 c = 0;
@@ -1005,52 +1087,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
1005 return NULL; 1087 return NULL;
1006} 1088}
1007 1089
1008void udf_truncate(struct inode *inode) 1090int udf_setsize(struct inode *inode, loff_t newsize)
1009{ 1091{
1010 int offset;
1011 int err; 1092 int err;
1012 struct udf_inode_info *iinfo; 1093 struct udf_inode_info *iinfo;
1094 int bsize = 1 << inode->i_blkbits;
1013 1095
1014 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1096 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1015 S_ISLNK(inode->i_mode))) 1097 S_ISLNK(inode->i_mode)))
1016 return; 1098 return -EINVAL;
1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1099 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1018 return; 1100 return -EPERM;
1019 1101
1020 iinfo = UDF_I(inode); 1102 iinfo = UDF_I(inode);
1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1103 if (newsize > inode->i_size) {
1022 down_write(&iinfo->i_data_sem); 1104 down_write(&iinfo->i_data_sem);
1023 if (inode->i_sb->s_blocksize < 1105 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1024 (udf_file_entry_alloc_offset(inode) + 1106 if (bsize <
1025 inode->i_size)) { 1107 (udf_file_entry_alloc_offset(inode) + newsize)) {
1026 udf_expand_file_adinicb(inode, inode->i_size, &err); 1108 err = udf_expand_file_adinicb(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1109 if (err) {
1028 inode->i_size = iinfo->i_lenAlloc; 1110 up_write(&iinfo->i_data_sem);
1029 up_write(&iinfo->i_data_sem); 1111 return err;
1030 return; 1112 }
1031 } else 1113 } else
1032 udf_truncate_extents(inode); 1114 iinfo->i_lenAlloc = newsize;
1033 } else { 1115 }
1034 offset = inode->i_size & (inode->i_sb->s_blocksize - 1); 1116 err = udf_extend_file(inode, newsize);
1035 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, 1117 if (err) {
1036 0x00, inode->i_sb->s_blocksize - 1118 up_write(&iinfo->i_data_sem);
1037 offset - udf_file_entry_alloc_offset(inode)); 1119 return err;
1038 iinfo->i_lenAlloc = inode->i_size;
1039 } 1120 }
1121 truncate_setsize(inode, newsize);
1040 up_write(&iinfo->i_data_sem); 1122 up_write(&iinfo->i_data_sem);
1041 } else { 1123 } else {
1042 block_truncate_page(inode->i_mapping, inode->i_size, 1124 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1043 udf_get_block); 1125 down_write(&iinfo->i_data_sem);
1126 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
1127 0x00, bsize - newsize -
1128 udf_file_entry_alloc_offset(inode));
1129 iinfo->i_lenAlloc = newsize;
1130 truncate_setsize(inode, newsize);
1131 up_write(&iinfo->i_data_sem);
1132 goto update_time;
1133 }
1134 err = block_truncate_page(inode->i_mapping, newsize,
1135 udf_get_block);
1136 if (err)
1137 return err;
1044 down_write(&iinfo->i_data_sem); 1138 down_write(&iinfo->i_data_sem);
1139 truncate_setsize(inode, newsize);
1045 udf_truncate_extents(inode); 1140 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem); 1141 up_write(&iinfo->i_data_sem);
1047 } 1142 }
1048 1143update_time:
1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1144 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1050 if (IS_SYNC(inode)) 1145 if (IS_SYNC(inode))
1051 udf_sync_inode(inode); 1146 udf_sync_inode(inode);
1052 else 1147 else
1053 mark_inode_dirty(inode); 1148 mark_inode_dirty(inode);
1149 return 0;
1054} 1150}
1055 1151
1056static void __udf_read_inode(struct inode *inode) 1152static void __udf_read_inode(struct inode *inode)
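A concrete example of the growth-side decision in udf_setsize() above: data can stay embedded in the ICB only while udf_file_entry_alloc_offset(inode) + newsize still fits inside one block. With a 2048-byte block size and an allocation offset of, say, 176 bytes (illustrative numbers, not taken from the hunk), a file can grow in place up to 2048 - 176 = 1872 bytes; the first truncate or write that pushes it past that calls udf_expand_file_adinicb() to move the data out into a real block, and only then does udf_extend_file() allocate extents for the remainder.
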
@@ -1637,14 +1733,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1637 return NULL; 1733 return NULL;
1638} 1734}
1639 1735
1640int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1736int udf_add_aext(struct inode *inode, struct extent_position *epos,
1641 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1737 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1642{ 1738{
1643 int adsize; 1739 int adsize;
1644 struct short_ad *sad = NULL; 1740 struct short_ad *sad = NULL;
1645 struct long_ad *lad = NULL; 1741 struct long_ad *lad = NULL;
1646 struct allocExtDesc *aed; 1742 struct allocExtDesc *aed;
1647 int8_t etype;
1648 uint8_t *ptr; 1743 uint8_t *ptr;
1649 struct udf_inode_info *iinfo = UDF_I(inode); 1744 struct udf_inode_info *iinfo = UDF_I(inode);
1650 1745
@@ -1660,7 +1755,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1660 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1755 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1661 adsize = sizeof(struct long_ad); 1756 adsize = sizeof(struct long_ad);
1662 else 1757 else
1663 return -1; 1758 return -EIO;
1664 1759
1665 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1760 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1666 unsigned char *sptr, *dptr; 1761 unsigned char *sptr, *dptr;
@@ -1672,12 +1767,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 obloc.partitionReferenceNum, 1767 obloc.partitionReferenceNum,
1673 obloc.logicalBlockNum, &err); 1768 obloc.logicalBlockNum, &err);
1674 if (!epos->block.logicalBlockNum) 1769 if (!epos->block.logicalBlockNum)
1675 return -1; 1770 return -ENOSPC;
1676 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1771 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1677 &epos->block, 1772 &epos->block,
1678 0)); 1773 0));
1679 if (!nbh) 1774 if (!nbh)
1680 return -1; 1775 return -EIO;
1681 lock_buffer(nbh); 1776 lock_buffer(nbh);
1682 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); 1777 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
1683 set_buffer_uptodate(nbh); 1778 set_buffer_uptodate(nbh);
@@ -1746,7 +1841,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1746 epos->bh = nbh; 1841 epos->bh = nbh;
1747 } 1842 }
1748 1843
1749 etype = udf_write_aext(inode, epos, eloc, elen, inc); 1844 udf_write_aext(inode, epos, eloc, elen, inc);
1750 1845
1751 if (!epos->bh) { 1846 if (!epos->bh) {
1752 iinfo->i_lenAlloc += adsize; 1847 iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1859,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1764 mark_buffer_dirty_inode(epos->bh, inode); 1859 mark_buffer_dirty_inode(epos->bh, inode);
1765 } 1860 }
1766 1861
1767 return etype; 1862 return 0;
1768} 1863}
1769 1864
1770int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1865void udf_write_aext(struct inode *inode, struct extent_position *epos,
1771 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1866 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1772{ 1867{
1773 int adsize; 1868 int adsize;
1774 uint8_t *ptr; 1869 uint8_t *ptr;
@@ -1798,7 +1893,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1798 adsize = sizeof(struct long_ad); 1893 adsize = sizeof(struct long_ad);
1799 break; 1894 break;
1800 default: 1895 default:
1801 return -1; 1896 return;
1802 } 1897 }
1803 1898
1804 if (epos->bh) { 1899 if (epos->bh) {
@@ -1817,8 +1912,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1817 1912
1818 if (inc) 1913 if (inc)
1819 epos->offset += adsize; 1914 epos->offset += adsize;
1820
1821 return (elen >> 30);
1822} 1915}
1823 1916
1824int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1917int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
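Two calling-convention changes ripple out from here through fs/udf: udf_add_aext() now returns 0 or a negative errno instead of an extent type or -1, and udf_write_aext() returns nothing. Callers that used to compare against -1 (as udf_do_extend_file() did above) now just propagate the error, along the lines of:

	/* sketch of the new convention */
	err = udf_add_aext(inode, &epos, &eloc, elen, 1);
	if (err)
		return err;				/* -EIO, -ENOSPC, ... passed through unchanged */

	udf_write_aext(inode, &epos, &eloc, elen, 1);	/* void: no extent type to check */
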
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
32#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
33#include <linux/exportfs.h> 33#include <linux/exportfs.h>
34 34
35enum { UDF_MAX_LINKS = 0xffff };
36
35static inline int udf_match(int len1, const unsigned char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
36 const unsigned char *name2) 38 const unsigned char *name2)
37{ 39{
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
650 struct udf_inode_info *iinfo; 652 struct udf_inode_info *iinfo;
651 653
652 err = -EMLINK; 654 err = -EMLINK;
653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 655 if (dir->i_nlink >= UDF_MAX_LINKS)
654 goto out; 656 goto out;
655 657
656 err = -EIO; 658 err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1034 struct fileIdentDesc cfi, *fi; 1036 struct fileIdentDesc cfi, *fi;
1035 int err; 1037 int err;
1036 1038
1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1039 if (inode->i_nlink >= UDF_MAX_LINKS)
1038 return -EMLINK; 1040 return -EMLINK;
1039 }
1040 1041
1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1042 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1042 if (!fi) { 1043 if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 goto end_rename; 1132 goto end_rename;
1132 1133
1133 retval = -EMLINK; 1134 retval = -EMLINK;
1134 if (!new_inode && 1135 if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
1135 new_dir->i_nlink >=
1136 (256 << sizeof(new_dir->i_nlink)) - 1)
1137 goto end_rename; 1136 goto end_rename;
1138 } 1137 }
1139 if (!nfi) { 1138 if (!nfi) {
@@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1287 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1288 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1289 1288
1290 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255; 1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1294 return 255;
1295 }
1292 1296
1293 *lenp = 3; 1297 *lenp = 3;
1294 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
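The new UDF_MAX_LINKS constant also pins down the limit itself, not just the readability: the old expression (256 << sizeof(i_nlink)) - 1 depends on the size of the in-core link-count field (with a 4-byte i_nlink it works out to (256 << 4) - 1 = 4095), whereas UDF's on-disk file link count is a 16-bit field, so the intended ceiling is 0xffff = 65535, which is what the enum now states explicitly.
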
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
197 mark_buffer_dirty_inode(epos->bh, inode); 197 mark_buffer_dirty_inode(epos->bh, inode);
198} 198}
199 199
200/*
201 * Truncate extents of inode to inode->i_size. This function can be used only
202 * for making file shorter. For making file longer, udf_extend_file() has to
203 * be used.
204 */
200void udf_truncate_extents(struct inode *inode) 205void udf_truncate_extents(struct inode *inode)
201{ 206{
202 struct extent_position epos; 207 struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
219 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); 224 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
220 byte_offset = (offset << sb->s_blocksize_bits) + 225 byte_offset = (offset << sb->s_blocksize_bits) +
221 (inode->i_size & (sb->s_blocksize - 1)); 226 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 227 if (etype == -1) {
223 epos.offset -= adsize; 228 /* We should extend the file? */
224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); 229 WARN_ON(byte_offset);
225 epos.offset += adsize; 230 return;
226 if (byte_offset) 231 }
227 lenalloc = epos.offset; 232 epos.offset -= adsize;
228 else 233 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
229 lenalloc = epos.offset - adsize; 234 epos.offset += adsize;
230 235 if (byte_offset)
231 if (!epos.bh) 236 lenalloc = epos.offset;
232 lenalloc -= udf_file_entry_alloc_offset(inode); 237 else
233 else 238 lenalloc = epos.offset - adsize;
234 lenalloc -= sizeof(struct allocExtDesc);
235
236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) {
241 /* We managed to free all extents in the
242 * indirect extent - free it too */
243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len);
246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc;
248 mark_inode_dirty(inode);
249 } else
250 udf_update_alloc_ext_desc(inode,
251 &epos, lenalloc);
252 brelse(epos.bh);
253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc;
255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen)
258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >>
260 sb->s_blocksize_bits;
261 else
262 indirect_ext_len = 1;
263 } else {
264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0);
266 epos.offset += adsize;
267 }
268 }
269 239
270 if (indirect_ext_len) { 240 if (!epos.bh)
271 BUG_ON(!epos.bh); 241 lenalloc -= udf_file_entry_alloc_offset(inode);
272 udf_free_blocks(sb, inode, &epos.block, 0, 242 else
273 indirect_ext_len); 243 lenalloc -= sizeof(struct allocExtDesc);
274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc;
276 mark_inode_dirty(inode);
277 } else
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) {
280 if (byte_offset) {
281 struct kernel_long_ad extent;
282 244
283 /* 245 while ((etype = udf_current_aext(inode, &epos, &eloc,
284 * OK, there is not extent covering inode->i_size and 246 &elen, 0)) != -1) {
285 * no extent above inode->i_size => truncate is 247 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
286 * extending the file by 'offset' blocks. 248 udf_write_aext(inode, &epos, &neloc, nelen, 0);
287 */ 249 if (indirect_ext_len) {
288 if ((!epos.bh && 250 /* We managed to free all extents in the
289 epos.offset == 251 * indirect extent - free it too */
290 udf_file_entry_alloc_offset(inode)) || 252 BUG_ON(!epos.bh);
291 (epos.bh && epos.offset == 253 udf_free_blocks(sb, inode, &epos.block,
292 sizeof(struct allocExtDesc))) { 254 0, indirect_ext_len);
293 /* File has no extents at all or has empty last 255 } else if (!epos.bh) {
294 * indirect extent! Create a fake extent... */ 256 iinfo->i_lenAlloc = lenalloc;
295 extent.extLocation.logicalBlockNum = 0; 257 mark_inode_dirty(inode);
296 extent.extLocation.partitionReferenceNum = 0; 258 } else
297 extent.extLength = 259 udf_update_alloc_ext_desc(inode,
298 EXT_NOT_RECORDED_NOT_ALLOCATED; 260 &epos, lenalloc);
299 } else { 261 brelse(epos.bh);
300 epos.offset -= adsize; 262 epos.offset = sizeof(struct allocExtDesc);
301 etype = udf_next_aext(inode, &epos, 263 epos.block = eloc;
302 &extent.extLocation, 264 epos.bh = udf_tread(sb,
303 &extent.extLength, 0); 265 udf_get_lb_pblock(sb, &eloc, 0));
304 extent.extLength |= etype << 30; 266 if (elen)
305 } 267 indirect_ext_len =
306 udf_extend_file(inode, &epos, &extent, 268 (elen + sb->s_blocksize - 1) >>
307 offset + 269 sb->s_blocksize_bits;
308 ((inode->i_size & 270 else
309 (sb->s_blocksize - 1)) != 0)); 271 indirect_ext_len = 1;
272 } else {
273 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
274 epos.offset += adsize;
310 } 275 }
311 } 276 }
277
278 if (indirect_ext_len) {
279 BUG_ON(!epos.bh);
280 udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
281 } else if (!epos.bh) {
282 iinfo->i_lenAlloc = lenalloc;
283 mark_inode_dirty(inode);
284 } else
285 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
312 iinfo->i_lenExtents = inode->i_size; 286 iinfo->i_lenExtents = inode->i_size;
313 287
314 brelse(epos.bh); 288 brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
136extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
137/* inode.c */ 137/* inode.c */
138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
139extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern int udf_expand_file_adinicb(struct inode *);
140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
141extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
142extern void udf_truncate(struct inode *); 142extern int udf_setsize(struct inode *, loff_t);
143extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
144extern void udf_evict_inode(struct inode *); 144extern void udf_evict_inode(struct inode *);
145extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t);
149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 147extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t *, sector_t *); 148 struct kernel_lb_addr *, uint32_t *, sector_t *);
151extern int8_t udf_add_aext(struct inode *, struct extent_position *, 149extern int udf_add_aext(struct inode *, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t, int);
151extern void udf_write_aext(struct inode *, struct extent_position *,
152 struct kernel_lb_addr *, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
154 struct kernel_lb_addr *, uint32_t, int);
155extern int8_t udf_delete_aext(struct inode *, struct extent_position, 153extern int8_t udf_delete_aext(struct inode *, struct extent_position,
156 struct kernel_lb_addr, uint32_t); 154 struct kernel_lb_addr, uint32_t);
157extern int8_t udf_next_aext(struct inode *, struct extent_position *, 155extern int8_t udf_next_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253d..e4f10a40768a 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
5 help 4 help
6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
7 OpenBSD and NeXTstep) use a file system called UFS. Some System V 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..03c255f12df5 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/stat.h> 34#include <linux/stat.h>
35#include <linux/string.h> 35#include <linux/string.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
40 39
@@ -43,7 +42,7 @@
43#include "swab.h" 42#include "swab.h"
44#include "util.h" 43#include "util.h"
45 44
46static u64 ufs_frag_map(struct inode *inode, sector_t frag); 45static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
47 46
48static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
49{ 48{
@@ -82,7 +81,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
82 * the begining of the filesystem. 81 * the begining of the filesystem.
83 */ 82 */
84 83
85static u64 ufs_frag_map(struct inode *inode, sector_t frag) 84static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
86{ 85{
87 struct ufs_inode_info *ufsi = UFS_I(inode); 86 struct ufs_inode_info *ufsi = UFS_I(inode);
88 struct super_block *sb = inode->i_sb; 87 struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
107 106
108 p = offsets; 107 p = offsets;
109 108
110 lock_kernel(); 109 if (needs_lock)
110 lock_ufs(sb);
111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
112 goto ufs2; 112 goto ufs2;
113 113
@@ -152,7 +152,8 @@ ufs2:
152 ret = temp + (u64) (frag & uspi->s_fpbmask); 152 ret = temp + (u64) (frag & uspi->s_fpbmask);
153 153
154out: 154out:
155 unlock_kernel(); 155 if (needs_lock)
156 unlock_ufs(sb);
156 return ret; 157 return ret;
157} 158}
158 159
@@ -415,14 +416,16 @@ out:
415int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 416int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
416{ 417{
417 struct super_block * sb = inode->i_sb; 418 struct super_block * sb = inode->i_sb;
418 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 419 struct ufs_sb_info * sbi = UFS_SB(sb);
420 struct ufs_sb_private_info * uspi = sbi->s_uspi;
419 struct buffer_head * bh; 421 struct buffer_head * bh;
420 int ret, err, new; 422 int ret, err, new;
421 unsigned long ptr,phys; 423 unsigned long ptr,phys;
422 u64 phys64 = 0; 424 u64 phys64 = 0;
425 bool needs_lock = (sbi->mutex_owner != current);
423 426
424 if (!create) { 427 if (!create) {
425 phys64 = ufs_frag_map(inode, fragment); 428 phys64 = ufs_frag_map(inode, fragment, needs_lock);
426 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 429 UFSD("phys64 = %llu\n", (unsigned long long)phys64);
427 if (phys64) 430 if (phys64)
428 map_bh(bh_result, sb, phys64); 431 map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
436 ret = 0; 439 ret = 0;
437 bh = NULL; 440 bh = NULL;
438 441
439 lock_kernel(); 442 if (needs_lock)
443 lock_ufs(sb);
440 444
441 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
442 if (fragment > 446 if (fragment >
@@ -498,7 +502,9 @@ out:
498 set_buffer_new(bh_result); 502 set_buffer_new(bh_result);
499 map_bh(bh_result, sb, phys); 503 map_bh(bh_result, sb, phys);
500abort: 504abort:
501 unlock_kernel(); 505 if (needs_lock)
506 unlock_ufs(sb);
507
502 return err; 508 return err;
503 509
504abort_too_big: 510abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
506 goto abort; 512 goto abort;
507} 513}
508 514
509static struct buffer_head *ufs_getfrag(struct inode *inode,
510 unsigned int fragment,
511 int create, int *err)
512{
513 struct buffer_head dummy;
514 int error;
515
516 dummy.b_state = 0;
517 dummy.b_blocknr = -1000;
518 error = ufs_getfrag_block(inode, fragment, &dummy, create);
519 *err = error;
520 if (!error && buffer_mapped(&dummy)) {
521 struct buffer_head *bh;
522 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
523 if (buffer_new(&dummy)) {
524 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
525 set_buffer_uptodate(bh);
526 mark_buffer_dirty(bh);
527 }
528 return bh;
529 }
530 return NULL;
531}
532
533struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
534 int create, int * err)
535{
536 struct buffer_head * bh;
537
538 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
539 bh = ufs_getfrag (inode, fragment, create, err);
540 if (!bh || buffer_uptodate(bh))
541 return bh;
542 ll_rw_block (READ, 1, &bh);
543 wait_on_buffer (bh);
544 if (buffer_uptodate(bh))
545 return bh;
546 brelse (bh);
547 *err = -EIO;
548 return NULL;
549}
550
551static int ufs_writepage(struct page *page, struct writeback_control *wbc) 515static int ufs_writepage(struct page *page, struct writeback_control *wbc)
552{ 516{
553 return block_write_full_page(page,ufs_getfrag_block,wbc); 517 return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -900,9 +864,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
900int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 864int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
901{ 865{
902 int ret; 866 int ret;
903 lock_kernel(); 867 lock_ufs(inode->i_sb);
904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 868 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
905 unlock_kernel(); 869 unlock_ufs(inode->i_sb);
906 return ret; 870 return ret;
907} 871}
908 872
@@ -922,22 +886,22 @@ void ufs_evict_inode(struct inode * inode)
922 if (want_delete) { 886 if (want_delete) {
923 loff_t old_i_size; 887 loff_t old_i_size;
924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 888 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
925 lock_kernel(); 889 lock_ufs(inode->i_sb);
926 mark_inode_dirty(inode); 890 mark_inode_dirty(inode);
927 ufs_update_inode(inode, IS_SYNC(inode)); 891 ufs_update_inode(inode, IS_SYNC(inode));
928 old_i_size = inode->i_size; 892 old_i_size = inode->i_size;
929 inode->i_size = 0; 893 inode->i_size = 0;
930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 894 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 895 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
932 unlock_kernel(); 896 unlock_ufs(inode->i_sb);
933 } 897 }
934 898
935 invalidate_inode_buffers(inode); 899 invalidate_inode_buffers(inode);
936 end_writeback(inode); 900 end_writeback(inode);
937 901
938 if (want_delete) { 902 if (want_delete) {
939 lock_kernel(); 903 lock_ufs(inode->i_sb);
940 ufs_free_inode (inode); 904 ufs_free_inode (inode);
941 unlock_kernel(); 905 unlock_ufs(inode->i_sb);
942 } 906 }
943} 907}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h>
33 32
34#include "ufs_fs.h" 33#include "ufs_fs.h"
35#include "ufs.h" 34#include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
55 if (dentry->d_name.len > UFS_MAXNAMLEN) 54 if (dentry->d_name.len > UFS_MAXNAMLEN)
56 return ERR_PTR(-ENAMETOOLONG); 55 return ERR_PTR(-ENAMETOOLONG);
57 56
58 lock_kernel(); 57 lock_ufs(dir->i_sb);
59 ino = ufs_inode_by_name(dir, &dentry->d_name); 58 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 59 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 60 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 61 if (IS_ERR(inode)) {
63 unlock_kernel(); 62 unlock_ufs(dir->i_sb);
64 return ERR_CAST(inode); 63 return ERR_CAST(inode);
65 } 64 }
66 } 65 }
67 unlock_kernel(); 66 unlock_ufs(dir->i_sb);
68 d_add(dentry, inode); 67 d_add(dentry, inode);
69 return NULL; 68 return NULL;
70} 69}
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
93 inode->i_fop = &ufs_file_operations; 92 inode->i_fop = &ufs_file_operations;
94 inode->i_mapping->a_ops = &ufs_aops; 93 inode->i_mapping->a_ops = &ufs_aops;
95 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
96 lock_kernel(); 95 lock_ufs(dir->i_sb);
97 err = ufs_add_nondir(dentry, inode); 96 err = ufs_add_nondir(dentry, inode);
98 unlock_kernel(); 97 unlock_ufs(dir->i_sb);
99 } 98 }
100 UFSD("END: err=%d\n", err); 99 UFSD("END: err=%d\n", err);
101 return err; 100 return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
115 init_special_inode(inode, mode, rdev); 114 init_special_inode(inode, mode, rdev);
116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 115 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
117 mark_inode_dirty(inode); 116 mark_inode_dirty(inode);
118 lock_kernel(); 117 lock_ufs(dir->i_sb);
119 err = ufs_add_nondir(dentry, inode); 118 err = ufs_add_nondir(dentry, inode);
120 unlock_kernel(); 119 unlock_ufs(dir->i_sb);
121 } 120 }
122 return err; 121 return err;
123} 122}
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
133 if (l > sb->s_blocksize) 132 if (l > sb->s_blocksize)
134 goto out_notlocked; 133 goto out_notlocked;
135 134
136 lock_kernel(); 135 lock_ufs(dir->i_sb);
137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 136 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
138 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
139 if (IS_ERR(inode)) 138 if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
156 155
157 err = ufs_add_nondir(dentry, inode); 156 err = ufs_add_nondir(dentry, inode);
158out: 157out:
159 unlock_kernel(); 158 unlock_ufs(dir->i_sb);
160out_notlocked: 159out_notlocked:
161 return err; 160 return err;
162 161
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
172 struct inode *inode = old_dentry->d_inode; 171 struct inode *inode = old_dentry->d_inode;
173 int error; 172 int error;
174 173
175 lock_kernel(); 174 lock_ufs(dir->i_sb);
176 if (inode->i_nlink >= UFS_LINK_MAX) { 175 if (inode->i_nlink >= UFS_LINK_MAX) {
177 unlock_kernel(); 176 unlock_ufs(dir->i_sb);
178 return -EMLINK; 177 return -EMLINK;
179 } 178 }
180 179
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
183 ihold(inode); 182 ihold(inode);
184 183
185 error = ufs_add_nondir(dentry, inode); 184 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 185 unlock_ufs(dir->i_sb);
187 return error; 186 return error;
188} 187}
189 188
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
195 if (dir->i_nlink >= UFS_LINK_MAX) 194 if (dir->i_nlink >= UFS_LINK_MAX)
196 goto out; 195 goto out;
197 196
198 lock_kernel(); 197 lock_ufs(dir->i_sb);
199 inode_inc_link_count(dir); 198 inode_inc_link_count(dir);
200 199
201 inode = ufs_new_inode(dir, S_IFDIR|mode); 200 inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 err = ufs_add_link(dentry, inode); 215 err = ufs_add_link(dentry, inode);
217 if (err) 216 if (err)
218 goto out_fail; 217 goto out_fail;
219 unlock_kernel(); 218 unlock_ufs(dir->i_sb);
220 219
221 d_instantiate(dentry, inode); 220 d_instantiate(dentry, inode);
222out: 221out:
@@ -228,7 +227,7 @@ out_fail:
228 iput (inode); 227 iput (inode);
229out_dir: 228out_dir:
230 inode_dec_link_count(dir); 229 inode_dec_link_count(dir);
231 unlock_kernel(); 230 unlock_ufs(dir->i_sb);
232 goto out; 231 goto out;
233} 232}
234 233
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
259 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
260 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
261 260
262 lock_kernel(); 261 lock_ufs(dir->i_sb);
263 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
264 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
265 if (!err) { 264 if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
268 inode_dec_link_count(dir); 267 inode_dec_link_count(dir);
269 } 268 }
270 } 269 }
271 unlock_kernel(); 270 unlock_ufs(dir->i_sb);
272 return err; 271 return err;
273} 272}
274 273
@@ -306,7 +305,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); 305 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
307 if (!new_de) 306 if (!new_de)
308 goto out_dir; 307 goto out_dir;
309 inode_inc_link_count(old_inode);
310 ufs_set_link(new_dir, new_de, new_page, old_inode); 308 ufs_set_link(new_dir, new_de, new_page, old_inode);
311 new_inode->i_ctime = CURRENT_TIME_SEC; 309 new_inode->i_ctime = CURRENT_TIME_SEC;
312 if (dir_de) 310 if (dir_de)
@@ -318,12 +316,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
318 if (new_dir->i_nlink >= UFS_LINK_MAX) 316 if (new_dir->i_nlink >= UFS_LINK_MAX)
319 goto out_dir; 317 goto out_dir;
320 } 318 }
321 inode_inc_link_count(old_inode);
322 err = ufs_add_link(new_dentry, old_inode); 319 err = ufs_add_link(new_dentry, old_inode);
323 if (err) { 320 if (err)
324 inode_dec_link_count(old_inode);
325 goto out_dir; 321 goto out_dir;
326 }
327 if (dir_de) 322 if (dir_de)
328 inode_inc_link_count(new_dir); 323 inode_inc_link_count(new_dir);
329 } 324 }
@@ -331,12 +326,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
331 /* 326 /*
332 * Like most other Unix systems, set the ctime for inodes on a 327 * Like most other Unix systems, set the ctime for inodes on a
333 * rename. 328 * rename.
334 * inode_dec_link_count() will mark the inode dirty.
335 */ 329 */
336 old_inode->i_ctime = CURRENT_TIME_SEC; 330 old_inode->i_ctime = CURRENT_TIME_SEC;
337 331
338 ufs_delete_entry(old_dir, old_de, old_page); 332 ufs_delete_entry(old_dir, old_de, old_page);
339 inode_dec_link_count(old_inode); 333 mark_inode_dirty(old_inode);
340 334
341 if (dir_de) { 335 if (dir_de) {
342 ufs_set_link(old_inode, dir_de, dir_page, new_dir); 336 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e48..7693d6293404 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
84#include <linux/blkdev.h> 84#include <linux/blkdev.h>
85#include <linux/init.h> 85#include <linux/init.h>
86#include <linux/parser.h> 86#include <linux/parser.h>
87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 87#include <linux/buffer_head.h>
89#include <linux/vfs.h> 88#include <linux/vfs.h>
90#include <linux/log2.h> 89#include <linux/log2.h>
@@ -96,6 +95,26 @@
96#include "swab.h" 95#include "swab.h"
97#include "util.h" 96#include "util.h"
98 97
98void lock_ufs(struct super_block *sb)
99{
100#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
101 struct ufs_sb_info *sbi = UFS_SB(sb);
102
103 mutex_lock(&sbi->mutex);
104 sbi->mutex_owner = current;
105#endif
106}
107
108void unlock_ufs(struct super_block *sb)
109{
110#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
111 struct ufs_sb_info *sbi = UFS_SB(sb);
112
113 sbi->mutex_owner = NULL;
114 mutex_unlock(&sbi->mutex);
115#endif
116}
117
99static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 118static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
100{ 119{
101 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 120 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
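lock_ufs()/unlock_ufs() replace the BKL with a per-superblock mutex (compiled out on !SMP, !PREEMPT builds), and because a mutex is not recursive the callers track ownership through sbi->mutex_owner. The pattern from ufs_getfrag_block() in the fs/ufs/inode.c hunks above, shown in isolation:

	/* sketch: only take the lock if this task does not already hold it */
	struct ufs_sb_info *sbi = UFS_SB(sb);
	bool needs_lock = (sbi->mutex_owner != current);

	if (needs_lock)
		lock_ufs(sb);		/* sets sbi->mutex_owner = current */
	/* ... block mapping / allocation work ... */
	if (needs_lock)
		unlock_ufs(sb);		/* clears mutex_owner before dropping the mutex */
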
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
313 struct ufs_super_block_first * usb1; 332 struct ufs_super_block_first * usb1;
314 va_list args; 333 va_list args;
315 334
316 lock_kernel();
317 uspi = UFS_SB(sb)->s_uspi; 335 uspi = UFS_SB(sb)->s_uspi;
318 usb1 = ubh_get_usb_first(uspi); 336 usb1 = ubh_get_usb_first(uspi);
319 337
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
521 */ 539 */
522 size = uspi->s_cssize; 540 size = uspi->s_cssize;
523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 541 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
524 base = space = kmalloc(size, GFP_KERNEL); 542 base = space = kmalloc(size, GFP_NOFS);
525 if (!base) 543 if (!base)
526 goto failed; 544 goto failed;
527 sbi->s_csp = (struct ufs_csum *)space; 545 sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 	 * Read cylinder group (we read only first fragment from block
 	 * at this time) and prepare internal data structures for cg caching.
 	 */
-	if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL)))
+	if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
 		goto failed;
 	for (i = 0; i < uspi->s_ncg; i++)
 		sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
 	}
 	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
-		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
+		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
 			goto failed;
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
 
 	UFSD("ENTER\n");
 
-	lock_kernel();
-
 	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
 	kfree (sbi->s_ucg);
 	kfree (base);
 
-	unlock_kernel();
-
 	UFSD("EXIT\n");
 }
 
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned maxsymlen;
 	int ret = -EINVAL;
 
-	lock_kernel();
-
 	uspi = NULL;
 	ubh = NULL;
 	flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed;
 	}
 #endif
+	mutex_init(&sbi->mutex);
 	/*
 	 * Set default mount options
 	 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
 			goto failed;
 
 	UFSD("EXIT\n");
-	unlock_kernel();
 	return 0;
 
 dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 	UFSD("EXIT (FAILED)\n");
-	unlock_kernel();
 	return ret;
 
 failed_nomem:
 	UFSD("EXIT (NOMEM)\n");
-	unlock_kernel();
 	return -ENOMEM;
 }
 
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
 	struct ufs_super_block_third * usb3;
 	unsigned flags;
 
+	lock_ufs(sb);
 	lock_super(sb);
-	lock_kernel();
 
 	UFSD("ENTER\n");
 
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
 	sb->s_dirt = 0;
 
 	UFSD("EXIT\n");
-	unlock_kernel();
 	unlock_super(sb);
+	unlock_ufs(sb);
 
 	return 0;
 }
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 
-	lock_kernel();
+	lock_ufs(sb);
 	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
 	if (!ufs_parse_options (data, &new_mount_opt)) {
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return 0;
 	}
 
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 #else
 	if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 		printk("this ufstype is read-only supported\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 	}
 	if (!ufs_read_cylinder_structures(sb)) {
 		printk("failed during remounting\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EPERM;
 	}
 	sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
 	unlock_super(sb);
-	unlock_kernel();
+	unlock_ufs(sb);
 	return 0;
 }
 
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ufs_super_block_third *usb3;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-	lock_kernel();
+	lock_ufs(sb);
 
 	usb1 = ubh_get_usb_first(uspi);
 	usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 
-	unlock_kernel();
+	unlock_ufs(sb);
 
 	return 0;
 }
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL);
+	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..e56a4f567212 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
 #include <linux/time.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 
 	block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
 
-	lock_kernel();
 	while (1) {
 		retry = ufs_trunc_direct(inode);
 		retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -487,7 +485,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
-	unlock_kernel();
 	mark_inode_dirty(inode);
 out:
 	UFSD("EXIT: err %d\n", err);
@@ -510,7 +507,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 		/* XXX(truncate): truncate_setsize should be called last */
 		truncate_setsize(inode, attr->ia_size);
 
+		lock_ufs(inode->i_sb);
 		error = ufs_truncate(inode, old_i_size);
+		unlock_ufs(inode->i_sb);
 		if (error)
 			return error;
 	}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48a..5be2755dd715 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
 	unsigned s_cgno[UFS_MAX_GROUP_LOADED];
 	unsigned short s_cg_loaded;
 	unsigned s_mount_opt;
+	struct mutex mutex;
+	struct task_struct *mutex_owner;
 };
 
 struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
 extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_evict_inode (struct inode *);
-extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
 /* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
 	return do_div(b, uspi->s_fpg);
 }
 
+extern void lock_ufs(struct super_block *sb);
+extern void unlock_ufs(struct super_block *sb);
+
 #endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe66..95425b59ce0a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	if (count > UFS_MAXFRAG)
 		return NULL;
 	ubh = (struct ufs_buffer_head *)
-		kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL);
+		kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
 	if (!ubh)
 		return NULL;
 	ubh->fragment = fragment;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..f83a4c830a65 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -2022,11 +2022,12 @@ xfs_buf_init(void)
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
-	xfsdatad_workqueue = create_workqueue("xfsdatad");
+	xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+	xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
+						WQ_MEM_RECLAIM, 1);
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
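These workqueues sit in the XFS writeback path, so the patch moves them from create_workqueue() to alloc_workqueue() with WQ_MEM_RECLAIM, which pre-allocates a rescuer thread and therefore guarantees forward progress when new workers cannot be created under memory pressure; max_active is capped at 1. A sketch of the conversion pattern only (the queue name and init function below are made up for illustration):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* hypothetical queue */

static int __init example_wq_init(void)
{
	/* was: example_wq = create_workqueue("example"); */
	example_wq = alloc_workqueue("example", WQ_MEM_RECLAIM, 1);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}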
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
+	if (!blk_queue_discard(q))
+		return -XFS_ERROR(EOPNOTSUPP);
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -XFS_ERROR(EFAULT);
 
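The FITRIM handler now rejects devices whose request queue does not advertise discard support before it even copies the range from userspace. A minimal guard sketch along the same lines (the helper name below is hypothetical, not part of the patch):

#include <linux/blkdev.h>

/* Illustrative guard only: fail a trim request early when the backing
 * queue cannot discard, mirroring the check added in the hunk above. */
static int example_can_trim(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;
	return 0;
}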
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
 	 * seven combinations work. The real answer is "don't use v2".
 	 */
 	len = xfs_fileid_length(fileid_type);
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return 255;
+	}
 	*max_len = len;
 
 	switch (fileid_type) {
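The encode_fh fix follows the exportfs buffer contract: when the caller's buffer is too small, the method should still report the required length through *max_len before returning 255, so the caller can learn how large a buffer to pass on a retry. A simplified sketch of that contract (not the XFS code itself; 'needed' and the function name are invented):

/* Simplified sketch of the buffer-size handshake shown above. */
static int example_encode_fh(int needed, int *max_len)
{
	if (*max_len < needed) {
		*max_len = needed;	/* tell the caller what is required */
		return 255;		/* "buffer too small" */
	}
	*max_len = needed;
	/* ... fill in the file handle here ... */
	return 1;			/* hypothetical fileid type */
}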
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8e..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
 	xfs_mount_t *mp,
 	void __user *arg)
 {
-	xfs_fsop_geom_v1_t fsgeo;
+	xfs_fsop_geom_t fsgeo;
 	int error;
 
-	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
 		return -error;
 
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+	/*
+	 * Caller should have passed an argument of type
+	 * xfs_fsop_geom_v1_t. This is a proper subset of the
+	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+	 */
+	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
 		return -XFS_ERROR(EFAULT);
 	return 0;
 }
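The v1 geometry ioctl now fills in the full xfs_fsop_geom_t and copies back only sizeof(xfs_fsop_geom_v1_t) bytes, relying on the v1 layout being a prefix of the full structure; combined with the memset() added to xfs_fs_geometry() further down, userspace never sees uninitialized kernel stack. The same "fill the superset, copy the prefix" pattern in isolation (standalone C, struct and field names invented for illustration):

#include <string.h>

/* Invented structures that mimic the v1-is-a-prefix relationship. */
struct geom_v1 { int blocksize; int agcount; };
struct geom    { int blocksize; int agcount; int logsectsize; };

static void fill_geom(struct geom *g)
{
	memset(g, 0, sizeof(*g));	/* nothing uninitialized escapes */
	g->blocksize = 4096;
	g->agcount = 4;
	g->logsectsize = 512;
}

/* Copy only the v1 prefix to an old-ABI consumer. */
static void copy_v1(void *dst)
{
	struct geom g;

	fill_geom(&g);
	memcpy(dst, &g, sizeof(struct geom_v1));
}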
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index bd5727852fd6..9ff7fc603d2f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -102,7 +102,8 @@ xfs_mark_inode_dirty(
 STATIC int
 xfs_init_security(
 	struct inode *inode,
-	struct inode *dir)
+	struct inode *dir,
+	const struct qstr *qstr)
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	size_t length;
@@ -110,7 +111,7 @@ xfs_init_security(
 	unsigned char *name;
 	int error;
 
-	error = security_inode_init_security(inode, dir, (char **)&name,
+	error = security_inode_init_security(inode, dir, qstr, (char **)&name,
 					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
@@ -194,7 +195,7 @@ xfs_vn_mknod(
 
 	inode = VFS_I(ip);
 
-	error = xfs_init_security(inode, dir);
+	error = xfs_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
@@ -367,7 +368,7 @@ xfs_vn_symlink(
 
 	inode = VFS_I(cip);
 
-	error = xfs_init_security(inode, dir);
+	error = xfs_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
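security_inode_init_security() gained a const struct qstr * parameter in this series so an LSM can take the last path component into account when labeling a newly created inode; XFS simply forwards &dentry->d_name at each creation site. A hedged sketch of how a filesystem consumes the updated call (not the XFS code; example_setxattr is a made-up stand-in for the fs-specific xattr helper):

#include <linux/security.h>
#include <linux/slab.h>

static int example_init_security(struct inode *inode, struct inode *dir,
				 const struct qstr *qstr)
{
	char *name;
	void *value;
	size_t len;
	int err;

	err = security_inode_init_security(inode, dir, qstr,
					   &name, &value, &len);
	if (err) {
		if (err == -EOPNOTSUPP)
			return 0;	/* no LSM label to apply */
		return err;
	}

	err = example_setxattr(inode, name, value, len);	/* hypothetical */
	kfree(name);
	kfree(value);
	return err;
}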
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..85668efb3e3e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
 	xfs_fsop_geom_t *geo,
 	int new_version)
 {
+
+	memset(geo, 0, sizeof(*geo));
+
 	geo->blocksize = mp->m_sb.sb_blocksize;
 	geo->rtextsize = mp->m_sb.sb_rextsize;
 	geo->agblocks = mp->m_sb.sb_agblocks;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb6..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
 	if (!xfs_mru_elem_zone)
 		goto out;
 
-	xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
+	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
 	if (!xfs_mru_reap_wq)
 		goto out_destroy_mru_elem_zone;
 