aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2011-03-25 11:41:20 -0400
committerArtem Bityutskiy <Artem.Bityutskiy@nokia.com>2011-03-25 11:41:20 -0400
commit7bf7e370d5919112c223a269462cd0b546903829 (patch)
tree03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /fs
parent68b1a1e786f29c900fa1c516a402e24f0ece622a (diff)
parentd39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus-1
* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6: (9356 commits) [media] rc: update for bitop name changes fs: simplify iget & friends fs: pull inode->i_lock up out of writeback_single_inode fs: rename inode_lock to inode_hash_lock fs: move i_wb_list out from under inode_lock fs: move i_sb_list out from under inode_lock fs: remove inode_lock from iput_final and prune_icache fs: Lock the inode LRU list separately fs: factor inode disposal fs: protect inode->i_state with inode->i_lock lib, arch: add filter argument to show_mem and fix private implementations SLUB: Write to per cpu data when allocating it slub: Fix debugobjects with lockless fastpath autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd() autofs4 - remove autofs4_lock autofs4 - fix d_manage() return on rcu-walk autofs4 - fix autofs4_expire_indirect() traversal autofs4 - fix dentry leak in autofs4_expire_direct() autofs4 - reinstate last used update on access vfs - check non-mountpoint dentry might block in __follow_mount_rcu() ... NOTE! This merge commit was created to fix compilation error. The block tree was merged upstream and removed the 'elv_queue_empty()' function which the new 'mtdswap' driver is using. So a simple merge of the mtd tree with upstream does not compile. And the mtd tree has already be published, so re-basing it is not an option. To fix this unfortunate situation, I had to merge upstream into the mtd-2.6.git tree without committing, put the fixup patch on top of this, and then commit this. The result is that we do not have commits which do not compile. In other words, this merge commit "merges" 3 things: the MTD tree, the upstream tree, and the fixup patch.
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c34
-rw-r--r--fs/9p/cache.c204
-rw-r--r--fs/9p/cache.h64
-rw-r--r--fs/9p/fid.c127
-rw-r--r--fs/9p/fid.h5
-rw-r--r--fs/9p/v9fs.c108
-rw-r--r--fs/9p/v9fs.h59
-rw-r--r--fs/9p/v9fs_vfs.h26
-rw-r--r--fs/9p/vfs_addr.c194
-rw-r--r--fs/9p/vfs_dentry.c47
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c323
-rw-r--r--fs/9p/vfs_inode.c322
-rw-r--r--fs/9p/vfs_inode_dotl.c206
-rw-r--r--fs/9p/vfs_super.c67
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/adfs.h25
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c23
-rw-r--r--fs/adfs/dir_fplus.c119
-rw-r--r--fs/adfs/inode.c69
-rw-r--r--fs/adfs/super.c36
-rw-r--r--fs/affs/Makefile2
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c137
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/root.c68
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c12
-rw-r--r--fs/block_dev.c63
-rw-r--r--fs/btrfs/acl.c8
-rw-r--r--fs/btrfs/compression.c27
-rw-r--r--fs/btrfs/ctree.h12
-rw-r--r--fs/btrfs/disk-io.c96
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c130
-rw-r--r--fs/btrfs/extent_io.c221
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/extent_map.c4
-rw-r--r--fs/btrfs/file-item.c5
-rw-r--r--fs/btrfs/file.c117
-rw-r--r--fs/btrfs/free-space-cache.c162
-rw-r--r--fs/btrfs/inode.c187
-rw-r--r--fs/btrfs/ioctl.c38
-rw-r--r--fs/btrfs/lzo.c21
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/relocation.c43
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c5
-rw-r--r--fs/btrfs/tree-log.c35
-rw-r--r--fs/btrfs/volumes.c125
-rw-r--r--fs/btrfs/xattr.c6
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/btrfs/zlib.c3
-rw-r--r--fs/buffer.c53
-rw-r--r--fs/cachefiles/namei.c52
-rw-r--r--fs/ceph/caps.c43
-rw-r--r--fs/ceph/debugfs.c6
-rw-r--r--fs/ceph/dir.c49
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c37
-rw-r--r--fs/ceph/mds_client.c10
-rw-r--r--fs/ceph/snap.c14
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h66
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README5
-rw-r--r--fs/cifs/cifs_dfs_ref.c10
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c38
-rw-r--r--fs/cifs/cifsencrypt.h33
-rw-r--r--fs/cifs/cifsfs.c15
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h37
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c8
-rw-r--r--fs/cifs/connect.c70
-rw-r--r--fs/cifs/file.c241
-rw-r--r--fs/cifs/link.c59
-rw-r--r--fs/cifs/md4.c205
-rw-r--r--fs/cifs/md5.c366
-rw-r--r--fs/cifs/md5.h38
-rw-r--r--fs/cifs/misc.c116
-rw-r--r--fs/cifs/netmisc.c8
-rw-r--r--fs/cifs/readdir.c3
-rw-r--r--fs/cifs/sess.c8
-rw-r--r--fs/cifs/smbdes.c1
-rw-r--r--fs/cifs/smbencrypt.c92
-rw-r--r--fs/cifs/transport.c69
-rw-r--r--fs/coda/Makefile2
-rw-r--r--fs/coda/sysctl.c8
-rw-r--r--fs/compat.c72
-rw-r--r--fs/dcache.c132
-rw-r--r--fs/debugfs/inode.c26
-rw-r--r--fs/devpts/inode.c21
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c257
-rw-r--r--fs/dlm/ast.h7
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c4
-rw-r--r--fs/dlm/dlm_internal.h35
-rw-r--r--fs/dlm/lock.c38
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/user.c185
-rw-r--r--fs/dlm/user.h3
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/dentry.c22
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/file.c1
-rw-r--r--fs/ecryptfs/inode.c138
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c175
-rw-r--r--fs/exec.c20
-rw-r--r--fs/exofs/common.h18
-rw-r--r--fs/exofs/dir.c33
-rw-r--r--fs/exofs/exofs.h6
-rw-r--r--fs/exofs/file.c16
-rw-r--r--fs/exofs/inode.c50
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exofs/super.c190
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/ioctl.c6
-rw-r--r--fs/ext2/namei.c17
-rw-r--r--fs/ext2/xattr.h6
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/balloc.c21
-rw-r--r--fs/ext3/ialloc.c5
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext3/ioctl.c6
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/ext3/xattr.h4
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/extents.c14
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/ext4/ioctl.c8
-rw-r--r--fs/ext4/mballoc.c100
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/page-io.c39
-rw-r--r--fs/ext4/super.c75
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c39
-rw-r--r--fs/fhandle.c265
-rw-r--r--fs/fifo.c3
-rw-r--r--fs/file_table.c66
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c141
-rw-r--r--fs/fuse/cuse.c14
-rw-r--r--fs/fuse/dev.c27
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/file.c54
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c79
-rw-r--r--fs/gfs2/glock.c414
-rw-r--r--fs/gfs2/glock.h39
-rw-r--r--fs/gfs2/glops.c33
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c36
-rw-r--r--fs/gfs2/lops.c22
-rw-r--r--fs/gfs2/main.c17
-rw-r--r--fs/gfs2/meta_io.c5
-rw-r--r--fs/gfs2/ops_fstype.c11
-rw-r--r--fs/gfs2/ops_inode.c10
-rw-r--r--fs/gfs2/quota.c14
-rw-r--r--fs/gfs2/rgrp.c34
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/hfs/dir.c50
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/extents.c4
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/part_tbl.c4
-rw-r--r--fs/hfsplus/super.c106
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/hpfs/Kconfig2
-rw-r--r--fs/hpfs/dir.c23
-rw-r--r--fs/hpfs/file.c10
-rw-r--r--fs/hpfs/hpfs_fn.h22
-rw-r--r--fs/hpfs/inode.c9
-rw-r--r--fs/hpfs/namei.c49
-rw-r--r--fs/hpfs/super.c23
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c733
-rw-r--r--fs/internal.h28
-rw-r--r--fs/ioctl.c28
-rw-r--r--fs/isofs/export.c8
-rw-r--r--fs/isofs/inode.c1
-rw-r--r--fs/jbd/commit.c22
-rw-r--r--fs/jbd/journal.c2
-rw-r--r--fs/jbd2/commit.c22
-rw-r--r--fs/jbd2/journal.c11
-rw-r--r--fs/jbd2/transaction.c21
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/security.c5
-rw-r--r--fs/jffs2/write.c18
-rw-r--r--fs/jffs2/xattr.h5
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_xattr.h5
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/lockd/host.c9
-rw-r--r--fs/locks.c13
-rw-r--r--fs/logfs/compr.c2
-rw-r--r--fs/logfs/dev_bdev.c2
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/minix/Kconfig8
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/minix.h74
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mpage.c8
-rw-r--r--fs/namei.c1598
-rw-r--r--fs/namespace.c355
-rw-r--r--fs/ncpfs/Makefile2
-rw-r--r--fs/nfs/callback.c109
-rw-r--r--fs/nfs/callback.h4
-rw-r--r--fs/nfs/callback_proc.c14
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/client.c146
-rw-r--r--fs/nfs/delegation.c6
-rw-r--r--fs/nfs/dir.c13
-rw-r--r--fs/nfs/direct.c42
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/getroot.c42
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/inode.c35
-rw-r--r--fs/nfs/internal.h46
-rw-r--r--fs/nfs/namespace.c66
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h38
-rw-r--r--fs/nfs/nfs4filelayout.c361
-rw-r--r--fs/nfs/nfs4filelayout.h19
-rw-r--r--fs/nfs/nfs4filelayoutdev.c265
-rw-r--r--fs/nfs/nfs4namespace.c41
-rw-r--r--fs/nfs/nfs4proc.c284
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c41
-rw-r--r--fs/nfs/nfs4xdr.c51
-rw-r--r--fs/nfs/nfsroot.c29
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/pnfs.c330
-rw-r--r--fs/nfs/pnfs.h118
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c478
-rw-r--r--fs/nfs/unlink.c22
-rw-r--r--fs/nfs/write.c157
-rw-r--r--fs/nfs_common/nfsacl.c54
-rw-r--r--fs/nfsctl.c21
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c347
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsctl.c35
-rw-r--r--fs/nfsd/state.h17
-rw-r--r--fs/nfsd/vfs.c21
-rw-r--r--fs/nilfs2/alloc.c12
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/bmap.h3
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/dir.c5
-rw-r--r--fs/nilfs2/direct.c4
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/inode.c84
-rw-r--r--fs/nilfs2/ioctl.c115
-rw-r--r--fs/nilfs2/mdt.c13
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c10
-rw-r--r--fs/nilfs2/nilfs.h33
-rw-r--r--fs/nilfs2/page.c18
-rw-r--r--fs/nilfs2/page.h4
-rw-r--r--fs/nilfs2/recovery.c32
-rw-r--r--fs/nilfs2/sb.h85
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c261
-rw-r--r--fs/nilfs2/segment.h14
-rw-r--r--fs/nilfs2/super.c219
-rw-r--r--fs/nilfs2/the_nilfs.c44
-rw-r--r--fs/nilfs2/the_nilfs.h51
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/inode_mark.c42
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/mark.c1
-rw-r--r--fs/notify/vfsmount_mark.c1
-rw-r--r--fs/ntfs/Makefile19
-rw-r--r--fs/ntfs/aops.c4
-rw-r--r--fs/ntfs/compress.c3
-rw-r--r--fs/ntfs/inode.c4
-rw-r--r--fs/ntfs/mft.c11
-rw-r--r--fs/ocfs2/Makefile4
-rw-r--r--fs/ocfs2/acl.c2
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/dcache.c2
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h10
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c27
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/super.c35
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/omfs/dir.c66
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/open.c152
-rw-r--r--fs/partitions/check.c3
-rw-r--r--fs/partitions/ldm.c5
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/osf.c12
-rw-r--r--fs/posix_acl.c17
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c208
-rw-r--r--fs/proc/consoles.c4
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/proc_devtree.c2
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/proc/root.c32
-rw-r--r--fs/proc/task_mmu.c135
-rw-r--r--fs/proc/task_nommu.c6
-rw-r--r--fs/pstore/Kconfig13
-rw-r--r--fs/pstore/Makefile7
-rw-r--r--fs/pstore/inode.c311
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c201
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/quota/dquot.c41
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/reiserfs/Makefile4
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/namei.c15
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/select.c3
-rw-r--r--fs/squashfs/Kconfig12
-rw-r--r--fs/squashfs/block.c8
-rw-r--r--fs/squashfs/decompressor.c34
-rw-r--r--fs/squashfs/decompressor.h7
-rw-r--r--fs/squashfs/dir.c9
-rw-r--r--fs/squashfs/lzo_wrapper.c4
-rw-r--r--fs/squashfs/namei.c12
-rw-r--r--fs/squashfs/squashfs.h1
-rw-r--r--fs/squashfs/squashfs_fs.h4
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xz_wrapper.c59
-rw-r--r--fs/squashfs/zlib_wrapper.c16
-rw-r--r--fs/stat.c7
-rw-r--r--fs/statfs.c176
-rw-r--r--fs/super.c166
-rw-r--r--fs/sync.c28
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/ubifs/Kconfig32
-rw-r--r--fs/ubifs/commit.c58
-rw-r--r--fs/ubifs/debug.c34
-rw-r--r--fs/ubifs/debug.h30
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c11
-rw-r--r--fs/ubifs/io.c201
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c28
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt_commit.c56
-rw-r--r--fs/ubifs/orphan.c10
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c55
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/ubifs.h45
-rw-r--r--fs/udf/balloc.c11
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/inode.c240
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/inode.c79
-rw-r--r--fs/ufs/namei.c44
-rw-r--r--fs/ufs/super.c64
-rw-r--r--fs/ufs/truncate.c7
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/util.c2
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/utimes.c2
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Makefile12
-rw-r--r--fs/xfs/linux-2.6/kmem.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c35
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c31
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c133
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h38
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c128
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c2
-rw-r--r--fs/xfs/quota/xfs_dquot.c48
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c5
-rw-r--r--fs/xfs/quota/xfs_qm.c95
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c3
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c85
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c5
-rw-r--r--fs/xfs/support/debug.c107
-rw-r--r--fs/xfs/support/debug.h61
-rw-r--r--fs/xfs/xfs_alloc.c158
-rw-r--r--fs/xfs/xfs_alloc.h16
-rw-r--r--fs/xfs/xfs_bmap.c85
-rw-r--r--fs/xfs/xfs_buf_item.c27
-rw-r--r--fs/xfs/xfs_da_btree.c9
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c25
-rw-r--r--fs/xfs/xfs_error.c22
-rw-r--r--fs/xfs/xfs_error.h19
-rw-r--r--fs/xfs/xfs_extfree_item.c3
-rw-r--r--fs/xfs/xfs_fsops.c9
-rw-r--r--fs/xfs/xfs_ialloc.c82
-rw-r--r--fs/xfs/xfs_inode.c129
-rw-r--r--fs/xfs/xfs_inode.h23
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_log.c124
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c15
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c223
-rw-r--r--fs/xfs/xfs_mount.c148
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c92
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_rw.c58
-rw-r--r--fs/xfs/xfs_trans.c41
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c2
-rw-r--r--fs/xfs/xfs_trans_buf.c6
-rw-r--r--fs/xfs/xfs_trans_inode.c22
-rw-r--r--fs/xfs/xfs_vnodeops.c74
502 files changed, 13303 insertions, 9389 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf616318..535ab6eccb1a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
21#include <linux/posix_acl_xattr.h> 21#include <linux/posix_acl_xattr.h>
22#include "xattr.h" 22#include "xattr.h"
23#include "acl.h" 23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h" 24#include "v9fs.h"
25#include "v9fs_vfs.h"
26 26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) 27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
59 struct v9fs_session_info *v9ses; 59 struct v9fs_session_info *v9ses;
60 60
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 62 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
63 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); 64 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 65 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0; 66 return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) { 72 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); 73 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); 74 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else 75 } else
77 retval = -EIO; 76 retval = -EIO;
78 77
78 if (!IS_ERR(dacl))
79 posix_acl_release(dacl);
80
81 if (!IS_ERR(pacl))
82 posix_acl_release(pacl);
83
79 return retval; 84 return retval;
80} 85}
81 86
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
100 return -ECHILD; 105 return -ECHILD;
101 106
102 v9ses = v9fs_inode2v9ses(inode); 107 v9ses = v9fs_inode2v9ses(inode);
103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 108 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
109 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
104 /* 110 /*
105 * On access = client mode get the acl 111 * On access = client and acl = on mode get the acl
106 * values from the server 112 * values from the server
107 */ 113 */
108 return 0; 114 return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
128 struct inode *inode = dentry->d_inode; 134 struct inode *inode = dentry->d_inode;
129 135
130 set_cached_acl(inode, type, acl); 136 set_cached_acl(inode, type, acl);
137
138 if (!acl)
139 return 0;
140
131 /* Set a setxattr request to server */ 141 /* Set a setxattr request to server */
132 size = posix_acl_xattr_size(acl->a_count); 142 size = posix_acl_xattr_size(acl->a_count);
133 buffer = kmalloc(size, GFP_KERNEL); 143 buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
177int v9fs_set_create_acl(struct dentry *dentry, 187int v9fs_set_create_acl(struct dentry *dentry,
178 struct posix_acl *dpacl, struct posix_acl *pacl) 188 struct posix_acl *dpacl, struct posix_acl *pacl)
179{ 189{
180 if (dpacl) 190 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
181 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); 191 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
182 if (pacl)
183 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
184 posix_acl_release(dpacl); 192 posix_acl_release(dpacl);
185 posix_acl_release(pacl); 193 posix_acl_release(pacl);
186 return 0; 194 return 0;
@@ -254,7 +262,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
254 if (strcmp(name, "") != 0) 262 if (strcmp(name, "") != 0)
255 return -EINVAL; 263 return -EINVAL;
256 264
257 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
258 /* 266 /*
259 * We allow set/get/list of acl when access=client is not specified 267 * We allow set/get/list of acl when access=client is not specified
260 */ 268 */
@@ -304,7 +312,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
304 if (strcmp(name, "") != 0) 312 if (strcmp(name, "") != 0)
305 return -EINVAL; 313 return -EINVAL;
306 314
307 v9ses = v9fs_inode2v9ses(dentry->d_inode); 315 v9ses = v9fs_dentry2v9ses(dentry);
308 /* 316 /*
309 * set the attribute on the remote. Without even looking at the 317 * set the attribute on the remote. Without even looking at the
310 * xattr value. We leave it to the server to validate 318 * xattr value. We leave it to the server to validate
@@ -315,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
315 323
316 if (S_ISLNK(inode->i_mode)) 324 if (S_ISLNK(inode->i_mode))
317 return -EOPNOTSUPP; 325 return -EOPNOTSUPP;
318 if (!is_owner_or_cap(inode)) 326 if (!inode_owner_or_capable(inode))
319 return -EPERM; 327 return -EPERM;
320 if (value) { 328 if (value) {
321 /* update the cached acl value */ 329 /* update the cached acl value */
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac2..5b335c5086a1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
33 33
34#define CACHETAG_LEN 11 34#define CACHETAG_LEN 11
35 35
36struct kmem_cache *vcookie_cache;
37
38struct fscache_netfs v9fs_cache_netfs = { 36struct fscache_netfs v9fs_cache_netfs = {
39 .name = "9p", 37 .name = "9p",
40 .version = 0, 38 .version = 0,
41}; 39};
42 40
43static void init_once(void *foo)
44{
45 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
46 vcookie->fscache = NULL;
47 vcookie->qid = NULL;
48 inode_init_once(&vcookie->inode);
49}
50
51/**
52 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
53 * vcookie to inode mapping
54 *
55 * Returns 0 on success.
56 */
57
58static int v9fs_init_vcookiecache(void)
59{
60 vcookie_cache = kmem_cache_create("vcookie_cache",
61 sizeof(struct v9fs_cookie),
62 0, (SLAB_RECLAIM_ACCOUNT|
63 SLAB_MEM_SPREAD),
64 init_once);
65 if (!vcookie_cache)
66 return -ENOMEM;
67
68 return 0;
69}
70
71/**
72 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
73 *
74 */
75
76static void v9fs_destroy_vcookiecache(void)
77{
78 kmem_cache_destroy(vcookie_cache);
79}
80
81int __v9fs_cache_register(void)
82{
83 int ret;
84 ret = v9fs_init_vcookiecache();
85 if (ret < 0)
86 return ret;
87
88 return fscache_register_netfs(&v9fs_cache_netfs);
89}
90
91void __v9fs_cache_unregister(void)
92{
93 v9fs_destroy_vcookiecache();
94 fscache_unregister_netfs(&v9fs_cache_netfs);
95}
96
97/** 41/**
98 * v9fs_random_cachetag - Generate a random tag to be associated 42 * v9fs_random_cachetag - Generate a random tag to be associated
99 * with a new cache session. 43 * with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
133} 77}
134 78
135const struct fscache_cookie_def v9fs_cache_session_index_def = { 79const struct fscache_cookie_def v9fs_cache_session_index_def = {
136 .name = "9P.session", 80 .name = "9P.session",
137 .type = FSCACHE_COOKIE_TYPE_INDEX, 81 .type = FSCACHE_COOKIE_TYPE_INDEX,
138 .get_key = v9fs_cache_session_get_key, 82 .get_key = v9fs_cache_session_get_key,
139}; 83};
140 84
141void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) 85void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
163static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, 107static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
164 void *buffer, uint16_t bufmax) 108 void *buffer, uint16_t bufmax)
165{ 109{
166 const struct v9fs_cookie *vcookie = cookie_netfs_data; 110 const struct v9fs_inode *v9inode = cookie_netfs_data;
167 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); 111 memcpy(buffer, &v9inode->fscache_key->path,
168 112 sizeof(v9inode->fscache_key->path));
169 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, 113 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
170 vcookie->qid->path); 114 v9inode->fscache_key->path);
171 return sizeof(vcookie->qid->path); 115 return sizeof(v9inode->fscache_key->path);
172} 116}
173 117
174static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, 118static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
175 uint64_t *size) 119 uint64_t *size)
176{ 120{
177 const struct v9fs_cookie *vcookie = cookie_netfs_data; 121 const struct v9fs_inode *v9inode = cookie_netfs_data;
178 *size = i_size_read(&vcookie->inode); 122 *size = i_size_read(&v9inode->vfs_inode);
179 123
180 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, 124 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
181 *size); 125 *size);
182} 126}
183 127
184static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, 128static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
185 void *buffer, uint16_t buflen) 129 void *buffer, uint16_t buflen)
186{ 130{
187 const struct v9fs_cookie *vcookie = cookie_netfs_data; 131 const struct v9fs_inode *v9inode = cookie_netfs_data;
188 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); 132 memcpy(buffer, &v9inode->fscache_key->version,
189 133 sizeof(v9inode->fscache_key->version));
190 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, 134 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
191 vcookie->qid->version); 135 v9inode->fscache_key->version);
192 return sizeof(vcookie->qid->version); 136 return sizeof(v9inode->fscache_key->version);
193} 137}
194 138
195static enum 139static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
197 const void *buffer, 141 const void *buffer,
198 uint16_t buflen) 142 uint16_t buflen)
199{ 143{
200 const struct v9fs_cookie *vcookie = cookie_netfs_data; 144 const struct v9fs_inode *v9inode = cookie_netfs_data;
201 145
202 if (buflen != sizeof(vcookie->qid->version)) 146 if (buflen != sizeof(v9inode->fscache_key->version))
203 return FSCACHE_CHECKAUX_OBSOLETE; 147 return FSCACHE_CHECKAUX_OBSOLETE;
204 148
205 if (memcmp(buffer, &vcookie->qid->version, 149 if (memcmp(buffer, &v9inode->fscache_key->version,
206 sizeof(vcookie->qid->version))) 150 sizeof(v9inode->fscache_key->version)))
207 return FSCACHE_CHECKAUX_OBSOLETE; 151 return FSCACHE_CHECKAUX_OBSOLETE;
208 152
209 return FSCACHE_CHECKAUX_OKAY; 153 return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
211 155
212static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) 156static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
213{ 157{
214 struct v9fs_cookie *vcookie = cookie_netfs_data; 158 struct v9fs_inode *v9inode = cookie_netfs_data;
215 struct pagevec pvec; 159 struct pagevec pvec;
216 pgoff_t first; 160 pgoff_t first;
217 int loop, nr_pages; 161 int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
220 first = 0; 164 first = 0;
221 165
222 for (;;) { 166 for (;;) {
223 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, 167 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
224 first, 168 first,
225 PAGEVEC_SIZE - pagevec_count(&pvec)); 169 PAGEVEC_SIZE - pagevec_count(&pvec));
226 if (!nr_pages) 170 if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
249 193
250void v9fs_cache_inode_get_cookie(struct inode *inode) 194void v9fs_cache_inode_get_cookie(struct inode *inode)
251{ 195{
252 struct v9fs_cookie *vcookie; 196 struct v9fs_inode *v9inode;
253 struct v9fs_session_info *v9ses; 197 struct v9fs_session_info *v9ses;
254 198
255 if (!S_ISREG(inode->i_mode)) 199 if (!S_ISREG(inode->i_mode))
256 return; 200 return;
257 201
258 vcookie = v9fs_inode2cookie(inode); 202 v9inode = V9FS_I(inode);
259 if (vcookie->fscache) 203 if (v9inode->fscache)
260 return; 204 return;
261 205
262 v9ses = v9fs_inode2v9ses(inode); 206 v9ses = v9fs_inode2v9ses(inode);
263 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 207 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
264 &v9fs_cache_inode_index_def, 208 &v9fs_cache_inode_index_def,
265 vcookie); 209 v9inode);
266 210
267 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, 211 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
268 vcookie->fscache); 212 v9inode->fscache);
269} 213}
270 214
271void v9fs_cache_inode_put_cookie(struct inode *inode) 215void v9fs_cache_inode_put_cookie(struct inode *inode)
272{ 216{
273 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 217 struct v9fs_inode *v9inode = V9FS_I(inode);
274 218
275 if (!vcookie->fscache) 219 if (!v9inode->fscache)
276 return; 220 return;
277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, 221 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
278 vcookie->fscache); 222 v9inode->fscache);
279 223
280 fscache_relinquish_cookie(vcookie->fscache, 0); 224 fscache_relinquish_cookie(v9inode->fscache, 0);
281 vcookie->fscache = NULL; 225 v9inode->fscache = NULL;
282} 226}
283 227
284void v9fs_cache_inode_flush_cookie(struct inode *inode) 228void v9fs_cache_inode_flush_cookie(struct inode *inode)
285{ 229{
286 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 230 struct v9fs_inode *v9inode = V9FS_I(inode);
287 231
288 if (!vcookie->fscache) 232 if (!v9inode->fscache)
289 return; 233 return;
290 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, 234 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
291 vcookie->fscache); 235 v9inode->fscache);
292 236
293 fscache_relinquish_cookie(vcookie->fscache, 1); 237 fscache_relinquish_cookie(v9inode->fscache, 1);
294 vcookie->fscache = NULL; 238 v9inode->fscache = NULL;
295} 239}
296 240
297void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) 241void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
298{ 242{
299 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 243 struct v9fs_inode *v9inode = V9FS_I(inode);
300 struct p9_fid *fid; 244 struct p9_fid *fid;
301 245
302 if (!vcookie->fscache) 246 if (!v9inode->fscache)
303 return; 247 return;
304 248
305 spin_lock(&vcookie->lock); 249 spin_lock(&v9inode->fscache_lock);
306 fid = filp->private_data; 250 fid = filp->private_data;
307 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 251 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
308 v9fs_cache_inode_flush_cookie(inode); 252 v9fs_cache_inode_flush_cookie(inode);
309 else 253 else
310 v9fs_cache_inode_get_cookie(inode); 254 v9fs_cache_inode_get_cookie(inode);
311 255
312 spin_unlock(&vcookie->lock); 256 spin_unlock(&v9inode->fscache_lock);
313} 257}
314 258
315void v9fs_cache_inode_reset_cookie(struct inode *inode) 259void v9fs_cache_inode_reset_cookie(struct inode *inode)
316{ 260{
317 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 261 struct v9fs_inode *v9inode = V9FS_I(inode);
318 struct v9fs_session_info *v9ses; 262 struct v9fs_session_info *v9ses;
319 struct fscache_cookie *old; 263 struct fscache_cookie *old;
320 264
321 if (!vcookie->fscache) 265 if (!v9inode->fscache)
322 return; 266 return;
323 267
324 old = vcookie->fscache; 268 old = v9inode->fscache;
325 269
326 spin_lock(&vcookie->lock); 270 spin_lock(&v9inode->fscache_lock);
327 fscache_relinquish_cookie(vcookie->fscache, 1); 271 fscache_relinquish_cookie(v9inode->fscache, 1);
328 272
329 v9ses = v9fs_inode2v9ses(inode); 273 v9ses = v9fs_inode2v9ses(inode);
330 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 274 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
331 &v9fs_cache_inode_index_def, 275 &v9fs_cache_inode_index_def,
332 vcookie); 276 v9inode);
333
334 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", 277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
335 inode, old, vcookie->fscache); 278 inode, old, v9inode->fscache);
336 279
337 spin_unlock(&vcookie->lock); 280 spin_unlock(&v9inode->fscache_lock);
338} 281}
339 282
340int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) 283int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
341{ 284{
342 struct inode *inode = page->mapping->host; 285 struct inode *inode = page->mapping->host;
343 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 286 struct v9fs_inode *v9inode = V9FS_I(inode);
344 287
345 BUG_ON(!vcookie->fscache); 288 BUG_ON(!v9inode->fscache);
346 289
347 return fscache_maybe_release_page(vcookie->fscache, page, gfp); 290 return fscache_maybe_release_page(v9inode->fscache, page, gfp);
348} 291}
349 292
350void __v9fs_fscache_invalidate_page(struct page *page) 293void __v9fs_fscache_invalidate_page(struct page *page)
351{ 294{
352 struct inode *inode = page->mapping->host; 295 struct inode *inode = page->mapping->host;
353 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 296 struct v9fs_inode *v9inode = V9FS_I(inode);
354 297
355 BUG_ON(!vcookie->fscache); 298 BUG_ON(!v9inode->fscache);
356 299
357 if (PageFsCache(page)) { 300 if (PageFsCache(page)) {
358 fscache_wait_on_page_write(vcookie->fscache, page); 301 fscache_wait_on_page_write(v9inode->fscache, page);
359 BUG_ON(!PageLocked(page)); 302 BUG_ON(!PageLocked(page));
360 fscache_uncache_page(vcookie->fscache, page); 303 fscache_uncache_page(v9inode->fscache, page);
361 } 304 }
362} 305}
363 306
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
380int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) 323int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
381{ 324{
382 int ret; 325 int ret;
383 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 326 const struct v9fs_inode *v9inode = V9FS_I(inode);
384 327
385 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 328 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
386 if (!vcookie->fscache) 329 if (!v9inode->fscache)
387 return -ENOBUFS; 330 return -ENOBUFS;
388 331
389 ret = fscache_read_or_alloc_page(vcookie->fscache, 332 ret = fscache_read_or_alloc_page(v9inode->fscache,
390 page, 333 page,
391 v9fs_vfs_readpage_complete, 334 v9fs_vfs_readpage_complete,
392 NULL, 335 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
418 unsigned *nr_pages) 361 unsigned *nr_pages)
419{ 362{
420 int ret; 363 int ret;
421 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 364 const struct v9fs_inode *v9inode = V9FS_I(inode);
422 365
423 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); 366 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
424 if (!vcookie->fscache) 367 if (!v9inode->fscache)
425 return -ENOBUFS; 368 return -ENOBUFS;
426 369
427 ret = fscache_read_or_alloc_pages(vcookie->fscache, 370 ret = fscache_read_or_alloc_pages(v9inode->fscache,
428 mapping, pages, nr_pages, 371 mapping, pages, nr_pages,
429 v9fs_vfs_readpage_complete, 372 v9fs_vfs_readpage_complete,
430 NULL, 373 NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
453void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) 396void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
454{ 397{
455 int ret; 398 int ret;
456 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 399 const struct v9fs_inode *v9inode = V9FS_I(inode);
457 400
458 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 401 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
459 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); 402 ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
460 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); 403 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
461 if (ret != 0) 404 if (ret != 0)
462 v9fs_uncache_page(inode, page); 405 v9fs_uncache_page(inode, page);
463} 406}
407
408/*
409 * wait for a page to complete writing to the cache
410 */
411void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
412{
413 const struct v9fs_inode *v9inode = V9FS_I(inode);
414 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
415 if (PageFsCache(page))
416 fscache_wait_on_page_write(v9inode->fscache, page);
417}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee8..049507a5b01c 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
25#include <linux/fscache.h> 25#include <linux/fscache.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27 27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs; 28extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def; 29extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def; 30extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
64 struct list_head *pages, 50 struct list_head *pages,
65 unsigned *nr_pages); 51 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); 52extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67 53extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
68 54 struct page *page);
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84 55
85static inline int v9fs_fscache_release_page(struct page *page, 56static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp) 57 gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
117 88
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 89static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{ 90{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 91 struct v9fs_inode *v9inode = V9FS_I(inode);
121 fscache_uncache_page(vcookie->fscache, page); 92 fscache_uncache_page(v9inode->fscache, page);
122 BUG_ON(PageFsCache(page)); 93 BUG_ON(PageFsCache(page));
123} 94}
124 95
125static inline void v9fs_vcookie_set_qid(struct inode *inode, 96static inline void v9fs_fscache_set_key(struct inode *inode,
126 struct p9_qid *qid) 97 struct p9_qid *qid)
127{ 98{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 99 struct v9fs_inode *v9inode = V9FS_I(inode);
129 spin_lock(&vcookie->lock); 100 spin_lock(&v9inode->fscache_lock);
130 vcookie->qid = qid; 101 v9inode->fscache_key = qid;
131 spin_unlock(&vcookie->lock); 102 spin_unlock(&v9inode->fscache_lock);
132} 103}
133 104
134#else /* CONFIG_9P_FSCACHE */ 105static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
135 106 struct page *page)
136static inline int v9fs_cache_register(void)
137{ 107{
138 return 1; 108 return __v9fs_fscache_wait_on_page_write(inode, page);
139} 109}
140 110
141static inline void v9fs_cache_unregister(void) {} 111#else /* CONFIG_9P_FSCACHE */
142 112
143static inline int v9fs_fscache_release_page(struct page *page, 113static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) { 114 gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 138static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{} 139{}
170 140
171static inline void v9fs_vcookie_set_qid(struct inode *inode, 141static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
172 struct p9_qid *qid) 142 struct page *page)
173{} 143{
144 return;
145}
174 146
175#endif /* CONFIG_9P_FSCACHE */ 147#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */ 148#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d70..0ee594569dcc 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
125 return -ENOMEM; 125 return -ENOMEM;
126} 126}
127 127
128/** 128static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
129 * v9fs_fid_lookup - lookup for a fid, try to walk if not found 129 uid_t uid, int any)
130 * @dentry: dentry to look for fid in
131 *
132 * Look for a fid in the specified dentry for the current user.
133 * If no fid is found, try to create one walking from a fid from the parent
134 * dentry (if it has one), or the root dentry. If the user haven't accessed
135 * the fs yet, attach now and walk from the root.
136 */
137
138struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
139{ 130{
140 int i, n, l, clone, any, access;
141 u32 uid;
142 struct p9_fid *fid, *old_fid = NULL;
143 struct dentry *ds; 131 struct dentry *ds;
144 struct v9fs_session_info *v9ses;
145 char **wnames, *uname; 132 char **wnames, *uname;
133 int i, n, l, clone, access;
134 struct v9fs_session_info *v9ses;
135 struct p9_fid *fid, *old_fid = NULL;
146 136
147 v9ses = v9fs_inode2v9ses(dentry->d_inode); 137 v9ses = v9fs_dentry2v9ses(dentry);
148 access = v9ses->flags & V9FS_ACCESS_MASK; 138 access = v9ses->flags & V9FS_ACCESS_MASK;
149 switch (access) {
150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
153 uid = current_fsuid();
154 any = 0;
155 break;
156
157 case V9FS_ACCESS_ANY:
158 uid = v9ses->uid;
159 any = 1;
160 break;
161
162 default:
163 uid = ~0;
164 any = 0;
165 break;
166 }
167
168 fid = v9fs_fid_find(dentry, uid, any); 139 fid = v9fs_fid_find(dentry, uid, any);
169 if (fid) 140 if (fid)
170 return fid; 141 return fid;
@@ -250,6 +221,45 @@ err_out:
250 return fid; 221 return fid;
251} 222}
252 223
224/**
225 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
226 * @dentry: dentry to look for fid in
227 *
228 * Look for a fid in the specified dentry for the current user.
229 * If no fid is found, try to create one walking from a fid from the parent
230 * dentry (if it has one), or the root dentry. If the user haven't accessed
231 * the fs yet, attach now and walk from the root.
232 */
233
234struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
235{
236 uid_t uid;
237 int any, access;
238 struct v9fs_session_info *v9ses;
239
240 v9ses = v9fs_dentry2v9ses(dentry);
241 access = v9ses->flags & V9FS_ACCESS_MASK;
242 switch (access) {
243 case V9FS_ACCESS_SINGLE:
244 case V9FS_ACCESS_USER:
245 case V9FS_ACCESS_CLIENT:
246 uid = current_fsuid();
247 any = 0;
248 break;
249
250 case V9FS_ACCESS_ANY:
251 uid = v9ses->uid;
252 any = 1;
253 break;
254
255 default:
256 uid = ~0;
257 any = 0;
258 break;
259 }
260 return v9fs_fid_lookup_with_uid(dentry, uid, any);
261}
262
253struct p9_fid *v9fs_fid_clone(struct dentry *dentry) 263struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
254{ 264{
255 struct p9_fid *fid, *ret; 265 struct p9_fid *fid, *ret;
@@ -261,3 +271,50 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
261 ret = p9_client_walk(fid, 0, NULL, 1); 271 ret = p9_client_walk(fid, 0, NULL, 1);
262 return ret; 272 return ret;
263} 273}
274
275static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
276{
277 struct p9_fid *fid, *ret;
278
279 fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
280 if (IS_ERR(fid))
281 return fid;
282
283 ret = p9_client_walk(fid, 0, NULL, 1);
284 return ret;
285}
286
287struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
288{
289 int err, flags;
290 struct p9_fid *fid;
291 struct v9fs_session_info *v9ses;
292
293 v9ses = v9fs_dentry2v9ses(dentry);
294 fid = v9fs_fid_clone_with_uid(dentry, 0);
295 if (IS_ERR(fid))
296 goto error_out;
297 /*
298 * writeback fid will only be used to write back the
299 * dirty pages. We always request for the open fid in read-write
300 * mode so that a partial page write which result in page
301 * read can work.
302 *
303 * we don't have a tsyncfs operation for older version
304 * of protocol. So make sure the write back fid is
305 * opened in O_SYNC mode.
306 */
307 if (!v9fs_proto_dotl(v9ses))
308 flags = O_RDWR | O_SYNC;
309 else
310 flags = O_RDWR;
311
312 err = p9_client_open(fid, flags);
313 if (err < 0) {
314 p9_client_clunk(fid);
315 fid = ERR_PTR(err);
316 goto error_out;
317 }
318error_out:
319 return fid;
320}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996d..bb0b6e7f58fc 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
19 * Boston, MA 02111-1301 USA 19 * Boston, MA 02111-1301 USA
20 * 20 *
21 */ 21 */
22 22#ifndef FS_9P_FID_H
23#define FS_9P_FID_H
23#include <linux/list.h> 24#include <linux/list.h>
24 25
25/** 26/**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
45struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); 46struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
46struct p9_fid *v9fs_fid_clone(struct dentry *dentry); 47struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
47int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); 48int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
49struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
50#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba83..c82b017f51f3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
39 39
40static DEFINE_SPINLOCK(v9fs_sessionlist_lock); 40static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
41static LIST_HEAD(v9fs_sessionlist); 41static LIST_HEAD(v9fs_sessionlist);
42struct kmem_cache *v9fs_inode_cache;
42 43
43/* 44/*
44 * Option Parsing (code inspired by NFS code) 45 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
55 /* Cache options */ 56 /* Cache options */
56 Opt_cache_loose, Opt_fscache, 57 Opt_cache_loose, Opt_fscache,
57 /* Access options */ 58 /* Access options */
58 Opt_access, 59 Opt_access, Opt_posixacl,
59 /* Error token */ 60 /* Error token */
60 Opt_err 61 Opt_err
61}; 62};
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
73 {Opt_fscache, "fscache"}, 74 {Opt_fscache, "fscache"},
74 {Opt_cachetag, "cachetag=%s"}, 75 {Opt_cachetag, "cachetag=%s"},
75 {Opt_access, "access=%s"}, 76 {Opt_access, "access=%s"},
77 {Opt_posixacl, "posixacl"},
76 {Opt_err, NULL} 78 {Opt_err, NULL}
77}; 79};
78 80
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 else if (strcmp(s, "any") == 0) 196 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 197 v9ses->flags |= V9FS_ACCESS_ANY;
196 else if (strcmp(s, "client") == 0) { 198 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT; 199 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else { 200 } else {
207 v9ses->flags |= V9FS_ACCESS_SINGLE; 201 v9ses->flags |= V9FS_ACCESS_SINGLE;
208 v9ses->uid = simple_strtoul(s, &e, 10); 202 v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
212 kfree(s); 206 kfree(s);
213 break; 207 break;
214 208
209 case Opt_posixacl:
210#ifdef CONFIG_9P_FS_POSIX_ACL
211 v9ses->flags |= V9FS_POSIX_ACL;
212#else
213 P9_DPRINTK(P9_DEBUG_ERROR,
214 "Not defined CONFIG_9P_FS_POSIX_ACL. "
215 "Ignoring posixacl option\n");
216#endif
217 break;
218
215 default: 219 default:
216 continue; 220 continue;
217 } 221 }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
260 list_add(&v9ses->slist, &v9fs_sessionlist); 264 list_add(&v9ses->slist, &v9fs_sessionlist);
261 spin_unlock(&v9fs_sessionlist_lock); 265 spin_unlock(&v9fs_sessionlist_lock);
262 266
263 v9ses->flags = V9FS_ACCESS_USER;
264 strcpy(v9ses->uname, V9FS_DEFUSER); 267 strcpy(v9ses->uname, V9FS_DEFUSER);
265 strcpy(v9ses->aname, V9FS_DEFANAME); 268 strcpy(v9ses->aname, V9FS_DEFANAME);
266 v9ses->uid = ~0; 269 v9ses->uid = ~0;
267 v9ses->dfltuid = V9FS_DEFUID; 270 v9ses->dfltuid = V9FS_DEFUID;
268 v9ses->dfltgid = V9FS_DEFGID; 271 v9ses->dfltgid = V9FS_DEFGID;
269 272
270 rc = v9fs_parse_options(v9ses, data);
271 if (rc < 0) {
272 retval = rc;
273 goto error;
274 }
275
276 v9ses->clnt = p9_client_create(dev_name, data); 273 v9ses->clnt = p9_client_create(dev_name, data);
277 if (IS_ERR(v9ses->clnt)) { 274 if (IS_ERR(v9ses->clnt)) {
278 retval = PTR_ERR(v9ses->clnt); 275 retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
281 goto error; 278 goto error;
282 } 279 }
283 280
284 if (p9_is_proto_dotl(v9ses->clnt)) 281 v9ses->flags = V9FS_ACCESS_USER;
282
283 if (p9_is_proto_dotl(v9ses->clnt)) {
284 v9ses->flags = V9FS_ACCESS_CLIENT;
285 v9ses->flags |= V9FS_PROTO_2000L; 285 v9ses->flags |= V9FS_PROTO_2000L;
286 else if (p9_is_proto_dotu(v9ses->clnt)) 286 } else if (p9_is_proto_dotu(v9ses->clnt)) {
287 v9ses->flags |= V9FS_PROTO_2000U; 287 v9ses->flags |= V9FS_PROTO_2000U;
288 }
289
290 rc = v9fs_parse_options(v9ses, data);
291 if (rc < 0) {
292 retval = rc;
293 goto error;
294 }
288 295
289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 296 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
290 297
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
306 v9ses->flags |= V9FS_ACCESS_ANY; 313 v9ses->flags |= V9FS_ACCESS_ANY;
307 v9ses->uid = ~0; 314 v9ses->uid = ~0;
308 } 315 }
316 if (!v9fs_proto_dotl(v9ses) ||
317 !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
318 /*
319 * We support ACL checks on clinet only if the protocol is
320 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
321 */
322 v9ses->flags &= ~V9FS_ACL_MASK;
323 }
309 324
310 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, 325 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
311 v9ses->aname); 326 v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
467 kobject_put(v9fs_kobj); 482 kobject_put(v9fs_kobj);
468} 483}
469 484
485static void v9fs_inode_init_once(void *foo)
486{
487 struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
488#ifdef CONFIG_9P_FSCACHE
489 v9inode->fscache = NULL;
490 v9inode->fscache_key = NULL;
491#endif
492 inode_init_once(&v9inode->vfs_inode);
493}
494
495/**
496 * v9fs_init_inode_cache - initialize a cache for 9P
497 * Returns 0 on success.
498 */
499static int v9fs_init_inode_cache(void)
500{
501 v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
502 sizeof(struct v9fs_inode),
503 0, (SLAB_RECLAIM_ACCOUNT|
504 SLAB_MEM_SPREAD),
505 v9fs_inode_init_once);
506 if (!v9fs_inode_cache)
507 return -ENOMEM;
508
509 return 0;
510}
511
512/**
513 * v9fs_destroy_inode_cache - destroy the cache of 9P inode
514 *
515 */
516static void v9fs_destroy_inode_cache(void)
517{
518 kmem_cache_destroy(v9fs_inode_cache);
519}
520
521static int v9fs_cache_register(void)
522{
523 int ret;
524 ret = v9fs_init_inode_cache();
525 if (ret < 0)
526 return ret;
527#ifdef CONFIG_9P_FSCACHE
528 return fscache_register_netfs(&v9fs_cache_netfs);
529#else
530 return ret;
531#endif
532}
533
534static void v9fs_cache_unregister(void)
535{
536 v9fs_destroy_inode_cache();
537#ifdef CONFIG_9P_FSCACHE
538 fscache_unregister_netfs(&v9fs_cache_netfs);
539#endif
540}
541
470/** 542/**
471 * init_v9fs - Initialize module 543 * init_v9fs - Initialize module
472 * 544 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0d..9665c2b840e6 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_H
24#define FS_9P_V9FS_H
25
23#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
24 27
25/** 28/**
@@ -28,8 +31,10 @@
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions 31 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 32 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 33 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
34 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
31 * @V9FS_ACCESS_ANY: use a single attach for all users 35 * @V9FS_ACCESS_ANY: use a single attach for all users
32 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options 36 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
37 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
33 * 38 *
34 * Session flags reflect options selected by users at mount time 39 * Session flags reflect options selected by users at mount time
35 */ 40 */
@@ -37,13 +42,15 @@
37 V9FS_ACCESS_USER | \ 42 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT) 43 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY 44#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
45#define V9FS_ACL_MASK V9FS_POSIX_ACL
40 46
41enum p9_session_flags { 47enum p9_session_flags {
42 V9FS_PROTO_2000U = 0x01, 48 V9FS_PROTO_2000U = 0x01,
43 V9FS_PROTO_2000L = 0x02, 49 V9FS_PROTO_2000L = 0x02,
44 V9FS_ACCESS_SINGLE = 0x04, 50 V9FS_ACCESS_SINGLE = 0x04,
45 V9FS_ACCESS_USER = 0x08, 51 V9FS_ACCESS_USER = 0x08,
46 V9FS_ACCESS_CLIENT = 0x10 52 V9FS_ACCESS_CLIENT = 0x10,
53 V9FS_POSIX_ACL = 0x20
47}; 54};
48 55
49/* possible values of ->cache */ 56/* possible values of ->cache */
@@ -109,8 +116,29 @@ struct v9fs_session_info {
109 struct list_head slist; /* list of sessions registered with v9fs */ 116 struct list_head slist; /* list of sessions registered with v9fs */
110 struct backing_dev_info bdi; 117 struct backing_dev_info bdi;
111 struct rw_semaphore rename_sem; 118 struct rw_semaphore rename_sem;
119 struct p9_fid *root_fid; /* Used for file system sync */
120};
121
122/* cache_validity flags */
123#define V9FS_INO_INVALID_ATTR 0x01
124
125struct v9fs_inode {
126#ifdef CONFIG_9P_FSCACHE
127 spinlock_t fscache_lock;
128 struct fscache_cookie *fscache;
129 struct p9_qid *fscache_key;
130#endif
131 unsigned int cache_validity;
132 struct p9_fid *writeback_fid;
133 struct mutex v_mutex;
134 struct inode vfs_inode;
112}; 135};
113 136
137static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
138{
139 return container_of(inode, struct v9fs_inode, vfs_inode);
140}
141
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 142struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 143 char *);
116extern void v9fs_session_close(struct v9fs_session_info *v9ses); 144extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +152,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry); 152 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, 153extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p); 154 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, 155extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid, 156 struct p9_fid *fid,
129 struct super_block *sb); 157 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl; 158extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl; 159extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl; 160extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, 161extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid, 162 struct p9_fid *fid,
136 struct super_block *sb); 163 struct super_block *sb);
137 164
138/* other default globals */ 165/* other default globals */
139#define V9FS_PORT 564 166#define V9FS_PORT 564
@@ -147,6 +174,11 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
147 return (inode->i_sb->s_fs_info); 174 return (inode->i_sb->s_fs_info);
148} 175}
149 176
177static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
178{
179 return dentry->d_sb->s_fs_info;
180}
181
150static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) 182static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
151{ 183{
152 return v9ses->flags & V9FS_PROTO_2000U; 184 return v9ses->flags & V9FS_PROTO_2000U;
@@ -158,7 +190,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
158} 190}
159 191
160/** 192/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by 193 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request 194 * issuing a attribute request
163 * @v9ses: session information 195 * @v9ses: session information
164 * @fid: fid to issue attribute request for 196 * @fid: fid to issue attribute request for
@@ -166,11 +198,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
166 * 198 *
167 */ 199 */
168static inline struct inode * 200static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, 201v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb) 202 struct super_block *sb)
171{ 203{
172 if (v9fs_proto_dotl(v9ses)) 204 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb); 205 return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
174 else 206 else
175 return v9fs_inode(v9ses, fid, sb); 207 return v9fs_inode_from_fid(v9ses, fid, sb);
176} 208}
209#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597ec..4014160903a9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_VFS_H
24#define FS_9P_V9FS_VFS_H
23 25
24/* plan9 semantics are that created files are implicitly opened. 26/* plan9 semantics are that created files are implicitly opened.
25 * But linux semantics are that you call create, then open. 27 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
36 * unlink calls remove, which is an implicit clunk. So we have to track 38 * unlink calls remove, which is an implicit clunk. So we have to track
37 * that kind of thing so that we don't try to clunk a dead fid. 39 * that kind of thing so that we don't try to clunk a dead fid.
38 */ 40 */
41#define P9_LOCK_TIMEOUT (30*HZ)
39 42
40extern struct file_system_type v9fs_fs_type; 43extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 44extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl; 48extern const struct file_operations v9fs_dir_operations_dotl;
46extern const struct dentry_operations v9fs_dentry_operations; 49extern const struct dentry_operations v9fs_dentry_operations;
47extern const struct dentry_operations v9fs_cached_dentry_operations; 50extern const struct dentry_operations v9fs_cached_dentry_operations;
51extern const struct file_operations v9fs_cached_file_operations;
52extern const struct file_operations v9fs_cached_file_operations_dotl;
53extern struct kmem_cache *v9fs_inode_cache;
48 54
49#ifdef CONFIG_9P_FSCACHE
50struct inode *v9fs_alloc_inode(struct super_block *sb); 55struct inode *v9fs_alloc_inode(struct super_block *sb);
51void v9fs_destroy_inode(struct inode *inode); 56void v9fs_destroy_inode(struct inode *inode);
52#endif
53
54struct inode *v9fs_get_inode(struct super_block *sb, int mode); 57struct inode *v9fs_get_inode(struct super_block *sb, int mode);
58int v9fs_init_inode(struct v9fs_session_info *v9ses,
59 struct inode *inode, int mode);
55void v9fs_evict_inode(struct inode *inode); 60void v9fs_evict_inode(struct inode *inode);
56ino_t v9fs_qid2ino(struct p9_qid *qid); 61ino_t v9fs_qid2ino(struct p9_qid *qid);
57void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 62void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62int v9fs_uflags2omode(int uflags, int extended); 67int v9fs_uflags2omode(int uflags, int extended);
63 68
64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 69ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
70ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
65void v9fs_blank_wstat(struct p9_wstat *wstat); 71void v9fs_blank_wstat(struct p9_wstat *wstat);
66int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); 72int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
67int v9fs_file_fsync_dotl(struct file *filp, int datasync); 73int v9fs_file_fsync_dotl(struct file *filp, int datasync);
68 74ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
69#define P9_LOCK_TIMEOUT (30*HZ) 75 const char __user *, size_t, loff_t *, int);
76int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
77int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
78static inline void v9fs_invalidate_inode_attr(struct inode *inode)
79{
80 struct v9fs_inode *v9inode;
81 v9inode = V9FS_I(inode);
82 v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
83 return;
84}
85#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863e..2524e4cbb8ea 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h" 41#include "cache.h"
42#include "fid.h"
42 43
43/** 44/**
44 * v9fs_vfs_readpage - read an entire page in from 9P 45 * v9fs_fid_readpage - read an entire page in from 9P
45 * 46 *
46 * @filp: file being read 47 * @fid: fid being read
47 * @page: structure to page 48 * @page: structure to page
48 * 49 *
49 */ 50 */
50 51static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 int retval; 53 int retval;
54 loff_t offset; 54 loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
67 buffer = kmap(page); 67 buffer = kmap(page);
68 offset = page_offset(page); 68 offset = page_offset(page);
69 69
70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
71 if (retval < 0) { 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page); 72 v9fs_uncache_page(inode, page);
73 goto done; 73 goto done;
@@ -87,6 +87,19 @@ done:
87} 87}
88 88
89/** 89/**
90 * v9fs_vfs_readpage - read an entire page in from 9P
91 *
92 * @filp: file being read
93 * @page: structure to page
94 *
95 */
96
97static int v9fs_vfs_readpage(struct file *filp, struct page *page)
98{
99 return v9fs_fid_readpage(filp->private_data, page);
100}
101
102/**
90 * v9fs_vfs_readpages - read a set of pages from 9P 103 * v9fs_vfs_readpages - read a set of pages from 9P
91 * 104 *
92 * @filp: file being read 105 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
124{ 137{
125 if (PagePrivate(page)) 138 if (PagePrivate(page))
126 return 0; 139 return 0;
127
128 return v9fs_fscache_release_page(page, gfp); 140 return v9fs_fscache_release_page(page, gfp);
129} 141}
130 142
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
137 149
138static void v9fs_invalidate_page(struct page *page, unsigned long offset) 150static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{ 151{
152 /*
153 * If called with zero offset, we should release
154 * the private state assocated with the page
155 */
140 if (offset == 0) 156 if (offset == 0)
141 v9fs_fscache_invalidate_page(page); 157 v9fs_fscache_invalidate_page(page);
142} 158}
143 159
160static int v9fs_vfs_writepage_locked(struct page *page)
161{
162 char *buffer;
163 int retval, len;
164 loff_t offset, size;
165 mm_segment_t old_fs;
166 struct v9fs_inode *v9inode;
167 struct inode *inode = page->mapping->host;
168
169 v9inode = V9FS_I(inode);
170 size = i_size_read(inode);
171 if (page->index == size >> PAGE_CACHE_SHIFT)
172 len = size & ~PAGE_CACHE_MASK;
173 else
174 len = PAGE_CACHE_SIZE;
175
176 set_page_writeback(page);
177
178 buffer = kmap(page);
179 offset = page_offset(page);
180
181 old_fs = get_fs();
182 set_fs(get_ds());
183 /* We should have writeback_fid always set */
184 BUG_ON(!v9inode->writeback_fid);
185
186 retval = v9fs_file_write_internal(inode,
187 v9inode->writeback_fid,
188 (__force const char __user *)buffer,
189 len, &offset, 0);
190 if (retval > 0)
191 retval = 0;
192
193 set_fs(old_fs);
194 kunmap(page);
195 end_page_writeback(page);
196 return retval;
197}
198
199static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
200{
201 int retval;
202
203 retval = v9fs_vfs_writepage_locked(page);
204 if (retval < 0) {
205 if (retval == -EAGAIN) {
206 redirty_page_for_writepage(wbc, page);
207 retval = 0;
208 } else {
209 SetPageError(page);
210 mapping_set_error(page->mapping, retval);
211 }
212 } else
213 retval = 0;
214
215 unlock_page(page);
216 return retval;
217}
218
144/** 219/**
145 * v9fs_launder_page - Writeback a dirty page 220 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success. 221 * Returns 0 on success.
150 */ 222 */
151 223
152static int v9fs_launder_page(struct page *page) 224static int v9fs_launder_page(struct page *page)
153{ 225{
226 int retval;
227 struct inode *inode = page->mapping->host;
228
229 v9fs_fscache_wait_on_page_write(inode, page);
230 if (clear_page_dirty_for_io(page)) {
231 retval = v9fs_vfs_writepage_locked(page);
232 if (retval)
233 return retval;
234 }
154 return 0; 235 return 0;
155} 236}
156 237
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
173 * with an error. 254 * with an error.
174 * 255 *
175 */ 256 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 257static ssize_t
177 loff_t pos, unsigned long nr_segs) 258v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t pos, unsigned long nr_segs)
178{ 260{
261 /*
262 * FIXME
263 * Now that we do caching with cache mode enabled, We need
264 * to support direct IO
265 */
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " 266 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n", 267 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name, 268 iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
183 270
184 return -EINVAL; 271 return -EINVAL;
185} 272}
273
274static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
275 loff_t pos, unsigned len, unsigned flags,
276 struct page **pagep, void **fsdata)
277{
278 int retval = 0;
279 struct page *page;
280 struct v9fs_inode *v9inode;
281 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
282 struct inode *inode = mapping->host;
283
284 v9inode = V9FS_I(inode);
285start:
286 page = grab_cache_page_write_begin(mapping, index, flags);
287 if (!page) {
288 retval = -ENOMEM;
289 goto out;
290 }
291 BUG_ON(!v9inode->writeback_fid);
292 if (PageUptodate(page))
293 goto out;
294
295 if (len == PAGE_CACHE_SIZE)
296 goto out;
297
298 retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
299 page_cache_release(page);
300 if (!retval)
301 goto start;
302out:
303 *pagep = page;
304 return retval;
305}
306
307static int v9fs_write_end(struct file *filp, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned copied,
309 struct page *page, void *fsdata)
310{
311 loff_t last_pos = pos + copied;
312 struct inode *inode = page->mapping->host;
313
314 if (unlikely(copied < len)) {
315 /*
316 * zero out the rest of the area
317 */
318 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
319
320 zero_user(page, from + copied, len - copied);
321 flush_dcache_page(page);
322 }
323
324 if (!PageUptodate(page))
325 SetPageUptodate(page);
326 /*
327 * No need to use i_size_read() here, the i_size
328 * cannot change under us because we hold the i_mutex.
329 */
330 if (last_pos > inode->i_size) {
331 inode_add_bytes(inode, last_pos - inode->i_size);
332 i_size_write(inode, last_pos);
333 }
334 set_page_dirty(page);
335 unlock_page(page);
336 page_cache_release(page);
337
338 return copied;
339}
340
341
186const struct address_space_operations v9fs_addr_operations = { 342const struct address_space_operations v9fs_addr_operations = {
187 .readpage = v9fs_vfs_readpage, 343 .readpage = v9fs_vfs_readpage,
188 .readpages = v9fs_vfs_readpages, 344 .readpages = v9fs_vfs_readpages,
189 .releasepage = v9fs_release_page, 345 .set_page_dirty = __set_page_dirty_nobuffers,
190 .invalidatepage = v9fs_invalidate_page, 346 .writepage = v9fs_vfs_writepage,
191 .launder_page = v9fs_launder_page, 347 .write_begin = v9fs_write_begin,
192 .direct_IO = v9fs_direct_IO, 348 .write_end = v9fs_write_end,
349 .releasepage = v9fs_release_page,
350 .invalidatepage = v9fs_invalidate_page,
351 .launder_page = v9fs_launder_page,
352 .direct_IO = v9fs_direct_IO,
193}; 353};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5e..b6a3b9f7fe4d 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 64 * @dentry: dentry in question
65 * 65 *
66 * Only return 1 if our inode is invalid. Only non-synthetic files
67 * (ones without mtime == 0) should be calling this function.
68 *
69 */ 66 */
70
71static int v9fs_cached_dentry_delete(const struct dentry *dentry) 67static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 68{
73 struct inode *inode = dentry->d_inode; 69 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 70 dentry->d_name.name, dentry);
75 dentry);
76 71
77 if(!inode) 72 /* Don't cache negative dentries */
73 if (!dentry->d_inode)
78 return 1; 74 return 1;
79
80 return 0; 75 return 0;
81} 76}
82 77
@@ -105,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry)
105 } 100 }
106} 101}
107 102
103static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
104{
105 struct p9_fid *fid;
106 struct inode *inode;
107 struct v9fs_inode *v9inode;
108
109 if (nd->flags & LOOKUP_RCU)
110 return -ECHILD;
111
112 inode = dentry->d_inode;
113 if (!inode)
114 goto out_valid;
115
116 v9inode = V9FS_I(inode);
117 if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
118 int retval;
119 struct v9fs_session_info *v9ses;
120 fid = v9fs_fid_lookup(dentry);
121 if (IS_ERR(fid))
122 return PTR_ERR(fid);
123
124 v9ses = v9fs_inode2v9ses(inode);
125 if (v9fs_proto_dotl(v9ses))
126 retval = v9fs_refresh_inode_dotl(fid, inode);
127 else
128 retval = v9fs_refresh_inode(fid, inode);
129 if (retval <= 0)
130 return retval;
131 }
132out_valid:
133 return 1;
134}
135
108const struct dentry_operations v9fs_cached_dentry_operations = { 136const struct dentry_operations v9fs_cached_dentry_operations = {
137 .d_revalidate = v9fs_lookup_revalidate,
109 .d_delete = v9fs_cached_dentry_delete, 138 .d_delete = v9fs_cached_dentry_delete,
110 .d_release = v9fs_dentry_release, 139 .d_release = v9fs_dentry_release,
111}; 140};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefed..9c2bdda5cd9d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
295 P9_DPRINTK(P9_DEBUG_VFS, 295 P9_DPRINTK(P9_DEBUG_VFS,
296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n", 296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
297 inode, filp, fid ? fid->fid : -1); 297 inode, filp, fid ? fid->fid : -1);
298 filemap_write_and_wait(inode->i_mapping);
299 if (fid) 298 if (fid)
300 p9_client_clunk(fid); 299 p9_client_clunk(fid);
301 return 0; 300 return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c30674396..ffed55817f0c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
44#include "fid.h" 44#include "fid.h"
45#include "cache.h" 45#include "cache.h"
46 46
47static const struct file_operations v9fs_cached_file_operations; 47static const struct vm_operations_struct v9fs_file_vm_ops;
48static const struct file_operations v9fs_cached_file_operations_dotl;
49 48
50/** 49/**
51 * v9fs_file_open - open a file (or directory) 50 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
57int v9fs_file_open(struct inode *inode, struct file *file) 56int v9fs_file_open(struct inode *inode, struct file *file)
58{ 57{
59 int err; 58 int err;
59 struct v9fs_inode *v9inode;
60 struct v9fs_session_info *v9ses; 60 struct v9fs_session_info *v9ses;
61 struct p9_fid *fid; 61 struct p9_fid *fid;
62 int omode; 62 int omode;
63 63
64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); 64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
65 v9inode = V9FS_I(inode);
65 v9ses = v9fs_inode2v9ses(inode); 66 v9ses = v9fs_inode2v9ses(inode);
66 if (v9fs_proto_dotl(v9ses)) 67 if (v9fs_proto_dotl(v9ses))
67 omode = file->f_flags; 68 omode = file->f_flags;
@@ -89,20 +90,34 @@ int v9fs_file_open(struct inode *inode, struct file *file)
89 } 90 }
90 91
91 file->private_data = fid; 92 file->private_data = fid;
92 if ((fid->qid.version) && (v9ses->cache)) { 93 mutex_lock(&v9inode->v_mutex);
93 P9_DPRINTK(P9_DEBUG_VFS, "cached"); 94 if (v9ses->cache && !v9inode->writeback_fid &&
94 /* enable cached file options */ 95 ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
95 if(file->f_op == &v9fs_file_operations) 96 /*
96 file->f_op = &v9fs_cached_file_operations; 97 * clone a fid and add it to writeback_fid
97 else if (file->f_op == &v9fs_file_operations_dotl) 98 * we do it during open time instead of
98 file->f_op = &v9fs_cached_file_operations_dotl; 99 * page dirty time via write_begin/page_mkwrite
99 100 * because we want write after unlink usecase
101 * to work.
102 */
103 fid = v9fs_writeback_fid(file->f_path.dentry);
104 if (IS_ERR(fid)) {
105 err = PTR_ERR(fid);
106 mutex_unlock(&v9inode->v_mutex);
107 goto out_error;
108 }
109 v9inode->writeback_fid = (void *) fid;
110 }
111 mutex_unlock(&v9inode->v_mutex);
100#ifdef CONFIG_9P_FSCACHE 112#ifdef CONFIG_9P_FSCACHE
113 if (v9ses->cache)
101 v9fs_cache_inode_set_cookie(inode, file); 114 v9fs_cache_inode_set_cookie(inode, file);
102#endif 115#endif
103 }
104
105 return 0; 116 return 0;
117out_error:
118 p9_client_clunk(file->private_data);
119 file->private_data = NULL;
120 return err;
106} 121}
107 122
108/** 123/**
@@ -335,25 +350,22 @@ out_err:
335} 350}
336 351
337/** 352/**
338 * v9fs_file_readn - read from a file 353 * v9fs_fid_readn - read from a fid
339 * @filp: file pointer to read 354 * @fid: fid to read
340 * @data: data buffer to read data into 355 * @data: data buffer to read data into
341 * @udata: user data buffer to read data into 356 * @udata: user data buffer to read data into
342 * @count: size of buffer 357 * @count: size of buffer
343 * @offset: offset at which to read data 358 * @offset: offset at which to read data
344 * 359 *
345 */ 360 */
346
347ssize_t 361ssize_t
348v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, 362v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
349 u64 offset) 363 u64 offset)
350{ 364{
351 int n, total, size; 365 int n, total, size;
352 struct p9_fid *fid = filp->private_data;
353 366
354 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, 367 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
355 (long long unsigned) offset, count); 368 (long long unsigned) offset, count);
356
357 n = 0; 369 n = 0;
358 total = 0; 370 total = 0;
359 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; 371 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +391,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
379} 391}
380 392
381/** 393/**
394 * v9fs_file_readn - read from a file
395 * @filp: file pointer to read
396 * @data: data buffer to read data into
397 * @udata: user data buffer to read data into
398 * @count: size of buffer
399 * @offset: offset at which to read data
400 *
401 */
402ssize_t
403v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
404 u64 offset)
405{
406 return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
407}
408
409/**
382 * v9fs_file_read - read from a file 410 * v9fs_file_read - read from a file
383 * @filp: file pointer to read 411 * @filp: file pointer to read
384 * @udata: user data buffer to read data into 412 * @udata: user data buffer to read data into
@@ -410,45 +438,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
410 return ret; 438 return ret;
411} 439}
412 440
413/** 441ssize_t
414 * v9fs_file_write - write to a file 442v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
415 * @filp: file pointer to write 443 const char __user *data, size_t count,
416 * @data: data buffer to write data from 444 loff_t *offset, int invalidate)
417 * @count: size of buffer
418 * @offset: offset at which to write data
419 *
420 */
421
422static ssize_t
423v9fs_file_write(struct file *filp, const char __user * data,
424 size_t count, loff_t * offset)
425{ 445{
426 ssize_t retval;
427 size_t total = 0;
428 int n; 446 int n;
429 struct p9_fid *fid; 447 loff_t i_size;
448 size_t total = 0;
430 struct p9_client *clnt; 449 struct p9_client *clnt;
431 struct inode *inode = filp->f_path.dentry->d_inode;
432 loff_t origin = *offset; 450 loff_t origin = *offset;
433 unsigned long pg_start, pg_end; 451 unsigned long pg_start, pg_end;
434 452
435 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 453 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
436 (int)count, (int)*offset); 454 (int)count, (int)*offset);
437 455
438 fid = filp->private_data;
439 clnt = fid->clnt; 456 clnt = fid->clnt;
440
441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
444
445 retval = -EINVAL;
446 if ((ssize_t) count < 0)
447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
451
452 do { 457 do {
453 n = p9_client_write(fid, NULL, data+total, origin+total, count); 458 n = p9_client_write(fid, NULL, data+total, origin+total, count);
454 if (n <= 0) 459 if (n <= 0)
@@ -457,25 +462,63 @@ v9fs_file_write(struct file *filp, const char __user * data,
457 total += n; 462 total += n;
458 } while (count > 0); 463 } while (count > 0);
459 464
460 if (total > 0) { 465 if (invalidate && (total > 0)) {
461 pg_start = origin >> PAGE_CACHE_SHIFT; 466 pg_start = origin >> PAGE_CACHE_SHIFT;
462 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; 467 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
463 if (inode->i_mapping && inode->i_mapping->nrpages) 468 if (inode->i_mapping && inode->i_mapping->nrpages)
464 invalidate_inode_pages2_range(inode->i_mapping, 469 invalidate_inode_pages2_range(inode->i_mapping,
465 pg_start, pg_end); 470 pg_start, pg_end);
466 *offset += total; 471 *offset += total;
467 i_size_write(inode, i_size_read(inode) + total); 472 i_size = i_size_read(inode);
468 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 473 if (*offset > i_size) {
474 inode_add_bytes(inode, *offset - i_size);
475 i_size_write(inode, *offset);
476 }
469 } 477 }
470
471 if (n < 0) 478 if (n < 0)
472 retval = n; 479 return n;
473 else 480
474 retval = total; 481 return total;
482}
483
484/**
485 * v9fs_file_write - write to a file
486 * @filp: file pointer to write
487 * @data: data buffer to write data from
488 * @count: size of buffer
489 * @offset: offset at which to write data
490 *
491 */
492static ssize_t
493v9fs_file_write(struct file *filp, const char __user * data,
494 size_t count, loff_t *offset)
495{
496 ssize_t retval = 0;
497 loff_t origin = *offset;
498
499
500 retval = generic_write_checks(filp, &origin, &count, 0);
501 if (retval)
502 goto out;
503
504 retval = -EINVAL;
505 if ((ssize_t) count < 0)
506 goto out;
507 retval = 0;
508 if (!count)
509 goto out;
510
511 retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
512 filp->private_data,
513 data, count, &origin, 1);
514 /* update offset on successful write */
515 if (retval > 0)
516 *offset = origin;
475out: 517out:
476 return retval; 518 return retval;
477} 519}
478 520
521
479static int v9fs_file_fsync(struct file *filp, int datasync) 522static int v9fs_file_fsync(struct file *filp, int datasync)
480{ 523{
481 struct p9_fid *fid; 524 struct p9_fid *fid;
@@ -505,28 +548,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
505 return retval; 548 return retval;
506} 549}
507 550
508static const struct file_operations v9fs_cached_file_operations = { 551static int
552v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
553{
554 int retval;
555
556 retval = generic_file_mmap(file, vma);
557 if (!retval)
558 vma->vm_ops = &v9fs_file_vm_ops;
559
560 return retval;
561}
562
563static int
564v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
565{
566 struct v9fs_inode *v9inode;
567 struct page *page = vmf->page;
568 struct file *filp = vma->vm_file;
569 struct inode *inode = filp->f_path.dentry->d_inode;
570
571
572 P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
573 page, (unsigned long)filp->private_data);
574
575 v9inode = V9FS_I(inode);
576 /* make sure the cache has finished storing the page */
577 v9fs_fscache_wait_on_page_write(inode, page);
578 BUG_ON(!v9inode->writeback_fid);
579 lock_page(page);
580 if (page->mapping != inode->i_mapping)
581 goto out_unlock;
582
583 return VM_FAULT_LOCKED;
584out_unlock:
585 unlock_page(page);
586 return VM_FAULT_NOPAGE;
587}
588
589static ssize_t
590v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
591 loff_t *offsetp)
592{
593 loff_t size, offset;
594 struct inode *inode;
595 struct address_space *mapping;
596
597 offset = *offsetp;
598 mapping = filp->f_mapping;
599 inode = mapping->host;
600 if (!count)
601 return 0;
602 size = i_size_read(inode);
603 if (offset < size)
604 filemap_write_and_wait_range(mapping, offset,
605 offset + count - 1);
606
607 return v9fs_file_read(filp, udata, count, offsetp);
608}
609
610/**
611 * v9fs_cached_file_read - read from a file
612 * @filp: file pointer to read
613 * @udata: user data buffer to read data into
614 * @count: size of buffer
615 * @offset: offset at which to read data
616 *
617 */
618static ssize_t
619v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
620 loff_t *offset)
621{
622 if (filp->f_flags & O_DIRECT)
623 return v9fs_direct_read(filp, data, count, offset);
624 return do_sync_read(filp, data, count, offset);
625}
626
627static ssize_t
628v9fs_direct_write(struct file *filp, const char __user * data,
629 size_t count, loff_t *offsetp)
630{
631 loff_t offset;
632 ssize_t retval;
633 struct inode *inode;
634 struct address_space *mapping;
635
636 offset = *offsetp;
637 mapping = filp->f_mapping;
638 inode = mapping->host;
639 if (!count)
640 return 0;
641
642 mutex_lock(&inode->i_mutex);
643 retval = filemap_write_and_wait_range(mapping, offset,
644 offset + count - 1);
645 if (retval)
646 goto err_out;
647 /*
648 * After a write we want buffered reads to be sure to go to disk to get
649 * the new data. We invalidate clean cached page from the region we're
650 * about to write. We do this *before* the write so that if we fail
651 * here we fall back to buffered write
652 */
653 if (mapping->nrpages) {
654 pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
655 pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
656
657 retval = invalidate_inode_pages2_range(mapping,
658 pg_start, pg_end);
659 /*
660 * If a page can not be invalidated, fall back
661 * to buffered write.
662 */
663 if (retval) {
664 if (retval == -EBUSY)
665 goto buff_write;
666 goto err_out;
667 }
668 }
669 retval = v9fs_file_write(filp, data, count, offsetp);
670err_out:
671 mutex_unlock(&inode->i_mutex);
672 return retval;
673
674buff_write:
675 mutex_unlock(&inode->i_mutex);
676 return do_sync_write(filp, data, count, offsetp);
677}
678
679/**
680 * v9fs_cached_file_write - write to a file
681 * @filp: file pointer to write
682 * @data: data buffer to write data from
683 * @count: size of buffer
684 * @offset: offset at which to write data
685 *
686 */
687static ssize_t
688v9fs_cached_file_write(struct file *filp, const char __user * data,
689 size_t count, loff_t *offset)
690{
691
692 if (filp->f_flags & O_DIRECT)
693 return v9fs_direct_write(filp, data, count, offset);
694 return do_sync_write(filp, data, count, offset);
695}
696
697static const struct vm_operations_struct v9fs_file_vm_ops = {
698 .fault = filemap_fault,
699 .page_mkwrite = v9fs_vm_page_mkwrite,
700};
701
702
703const struct file_operations v9fs_cached_file_operations = {
509 .llseek = generic_file_llseek, 704 .llseek = generic_file_llseek,
510 .read = do_sync_read, 705 .read = v9fs_cached_file_read,
706 .write = v9fs_cached_file_write,
511 .aio_read = generic_file_aio_read, 707 .aio_read = generic_file_aio_read,
512 .write = v9fs_file_write, 708 .aio_write = generic_file_aio_write,
513 .open = v9fs_file_open, 709 .open = v9fs_file_open,
514 .release = v9fs_dir_release, 710 .release = v9fs_dir_release,
515 .lock = v9fs_file_lock, 711 .lock = v9fs_file_lock,
516 .mmap = generic_file_readonly_mmap, 712 .mmap = v9fs_file_mmap,
517 .fsync = v9fs_file_fsync, 713 .fsync = v9fs_file_fsync,
518}; 714};
519 715
520static const struct file_operations v9fs_cached_file_operations_dotl = { 716const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek, 717 .llseek = generic_file_llseek,
522 .read = do_sync_read, 718 .read = v9fs_cached_file_read,
719 .write = v9fs_cached_file_write,
523 .aio_read = generic_file_aio_read, 720 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write, 721 .aio_write = generic_file_aio_write,
525 .open = v9fs_file_open, 722 .open = v9fs_file_open,
526 .release = v9fs_dir_release, 723 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl, 724 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl, 725 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap, 726 .mmap = v9fs_file_mmap,
530 .fsync = v9fs_file_fsync_dotl, 727 .fsync = v9fs_file_fsync_dotl,
531}; 728};
532 729
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c2..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,26 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
203 wstat->extension = NULL; 203 wstat->extension = NULL;
204} 204}
205 205
206#ifdef CONFIG_9P_FSCACHE
207/** 206/**
208 * v9fs_alloc_inode - helper function to allocate an inode 207 * v9fs_alloc_inode - helper function to allocate an inode
209 * This callback is executed before setting up the inode so that we
210 * can associate a vcookie with each inode.
211 * 208 *
212 */ 209 */
213
214struct inode *v9fs_alloc_inode(struct super_block *sb) 210struct inode *v9fs_alloc_inode(struct super_block *sb)
215{ 211{
216 struct v9fs_cookie *vcookie; 212 struct v9fs_inode *v9inode;
217 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, 213 v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
218 GFP_KERNEL); 214 GFP_KERNEL);
219 if (!vcookie) 215 if (!v9inode)
220 return NULL; 216 return NULL;
221 217#ifdef CONFIG_9P_FSCACHE
222 vcookie->fscache = NULL; 218 v9inode->fscache = NULL;
223 vcookie->qid = NULL; 219 v9inode->fscache_key = NULL;
224 spin_lock_init(&vcookie->lock); 220 spin_lock_init(&v9inode->fscache_lock);
225 return &vcookie->inode; 221#endif
222 v9inode->writeback_fid = NULL;
223 v9inode->cache_validity = 0;
224 mutex_init(&v9inode->v_mutex);
225 return &v9inode->vfs_inode;
226} 226}
227 227
228/** 228/**
@@ -234,35 +234,18 @@ static void v9fs_i_callback(struct rcu_head *head)
234{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu); 235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry); 236 INIT_LIST_HEAD(&inode->i_dentry);
237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
238} 238}
239 239
240void v9fs_destroy_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
241{ 241{
242 call_rcu(&inode->i_rcu, v9fs_i_callback); 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
243} 243}
244#endif
245
246/**
247 * v9fs_get_inode - helper function to setup an inode
248 * @sb: superblock
249 * @mode: mode to setup inode with
250 *
251 */
252 244
253struct inode *v9fs_get_inode(struct super_block *sb, int mode) 245int v9fs_init_inode(struct v9fs_session_info *v9ses,
246 struct inode *inode, int mode)
254{ 247{
255 int err; 248 int err = 0;
256 struct inode *inode;
257 struct v9fs_session_info *v9ses = sb->s_fs_info;
258
259 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
260
261 inode = new_inode(sb);
262 if (!inode) {
263 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
264 return ERR_PTR(-ENOMEM);
265 }
266 249
267 inode_init_owner(inode, NULL, mode); 250 inode_init_owner(inode, NULL, mode);
268 inode->i_blocks = 0; 251 inode->i_blocks = 0;
@@ -292,14 +275,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
292 case S_IFREG: 275 case S_IFREG:
293 if (v9fs_proto_dotl(v9ses)) { 276 if (v9fs_proto_dotl(v9ses)) {
294 inode->i_op = &v9fs_file_inode_operations_dotl; 277 inode->i_op = &v9fs_file_inode_operations_dotl;
295 inode->i_fop = &v9fs_file_operations_dotl; 278 if (v9ses->cache)
279 inode->i_fop =
280 &v9fs_cached_file_operations_dotl;
281 else
282 inode->i_fop = &v9fs_file_operations_dotl;
296 } else { 283 } else {
297 inode->i_op = &v9fs_file_inode_operations; 284 inode->i_op = &v9fs_file_inode_operations;
298 inode->i_fop = &v9fs_file_operations; 285 if (v9ses->cache)
286 inode->i_fop = &v9fs_cached_file_operations;
287 else
288 inode->i_fop = &v9fs_file_operations;
299 } 289 }
300 290
301 break; 291 break;
302
303 case S_IFLNK: 292 case S_IFLNK:
304 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { 293 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
305 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " 294 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +324,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
335 err = -EINVAL; 324 err = -EINVAL;
336 goto error; 325 goto error;
337 } 326 }
327error:
328 return err;
338 329
339 return inode; 330}
340 331
341error: 332/**
342 iput(inode); 333 * v9fs_get_inode - helper function to setup an inode
343 return ERR_PTR(err); 334 * @sb: superblock
335 * @mode: mode to setup inode with
336 *
337 */
338
339struct inode *v9fs_get_inode(struct super_block *sb, int mode)
340{
341 int err;
342 struct inode *inode;
343 struct v9fs_session_info *v9ses = sb->s_fs_info;
344
345 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
346
347 inode = new_inode(sb);
348 if (!inode) {
349 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
350 return ERR_PTR(-ENOMEM);
351 }
352 err = v9fs_init_inode(v9ses, inode, mode);
353 if (err) {
354 iput(inode);
355 return ERR_PTR(err);
356 }
357 return inode;
344} 358}
345 359
346/* 360/*
@@ -403,6 +417,8 @@ error:
403 */ 417 */
404void v9fs_evict_inode(struct inode *inode) 418void v9fs_evict_inode(struct inode *inode)
405{ 419{
420 struct v9fs_inode *v9inode = V9FS_I(inode);
421
406 truncate_inode_pages(inode->i_mapping, 0); 422 truncate_inode_pages(inode->i_mapping, 0);
407 end_writeback(inode); 423 end_writeback(inode);
408 filemap_fdatawrite(inode->i_mapping); 424 filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +426,67 @@ void v9fs_evict_inode(struct inode *inode)
410#ifdef CONFIG_9P_FSCACHE 426#ifdef CONFIG_9P_FSCACHE
411 v9fs_cache_inode_put_cookie(inode); 427 v9fs_cache_inode_put_cookie(inode);
412#endif 428#endif
429 /* clunk the fid stashed in writeback_fid */
430 if (v9inode->writeback_fid) {
431 p9_client_clunk(v9inode->writeback_fid);
432 v9inode->writeback_fid = NULL;
433 }
413} 434}
414 435
415struct inode * 436static struct inode *v9fs_qid_iget(struct super_block *sb,
416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 437 struct p9_qid *qid,
417 struct super_block *sb) 438 struct p9_wstat *st)
418{ 439{
419 int err, umode; 440 int retval, umode;
420 struct inode *ret = NULL; 441 unsigned long i_ino;
421 struct p9_wstat *st; 442 struct inode *inode;
422 443 struct v9fs_session_info *v9ses = sb->s_fs_info;
423 st = p9_client_stat(fid);
424 if (IS_ERR(st))
425 return ERR_CAST(st);
426 444
445 i_ino = v9fs_qid2ino(qid);
446 inode = iget_locked(sb, i_ino);
447 if (!inode)
448 return ERR_PTR(-ENOMEM);
449 if (!(inode->i_state & I_NEW))
450 return inode;
451 /*
452 * initialize the inode with the stat info
453 * FIXME!! we may need support for stale inodes
454 * later.
455 */
427 umode = p9mode2unixmode(v9ses, st->mode); 456 umode = p9mode2unixmode(v9ses, st->mode);
428 ret = v9fs_get_inode(sb, umode); 457 retval = v9fs_init_inode(v9ses, inode, umode);
429 if (IS_ERR(ret)) { 458 if (retval)
430 err = PTR_ERR(ret);
431 goto error; 459 goto error;
432 }
433
434 v9fs_stat2inode(st, ret, sb);
435 ret->i_ino = v9fs_qid2ino(&st->qid);
436 460
461 v9fs_stat2inode(st, inode, sb);
437#ifdef CONFIG_9P_FSCACHE 462#ifdef CONFIG_9P_FSCACHE
438 v9fs_vcookie_set_qid(ret, &st->qid); 463 v9fs_fscache_set_key(inode, &st->qid);
439 v9fs_cache_inode_get_cookie(ret); 464 v9fs_cache_inode_get_cookie(inode);
440#endif 465#endif
441 p9stat_free(st); 466 unlock_new_inode(inode);
442 kfree(st); 467 return inode;
443 return ret;
444error: 468error:
469 unlock_new_inode(inode);
470 iput(inode);
471 return ERR_PTR(retval);
472
473}
474
475struct inode *
476v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
477 struct super_block *sb)
478{
479 struct p9_wstat *st;
480 struct inode *inode = NULL;
481
482 st = p9_client_stat(fid);
483 if (IS_ERR(st))
484 return ERR_CAST(st);
485
486 inode = v9fs_qid_iget(sb, &st->qid, st);
445 p9stat_free(st); 487 p9stat_free(st);
446 kfree(st); 488 kfree(st);
447 return ERR_PTR(err); 489 return inode;
448} 490}
449 491
450/** 492/**
@@ -458,8 +500,8 @@ error:
458static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 500static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
459{ 501{
460 int retval; 502 int retval;
461 struct inode *file_inode;
462 struct p9_fid *v9fid; 503 struct p9_fid *v9fid;
504 struct inode *file_inode;
463 505
464 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 506 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
465 rmdir); 507 rmdir);
@@ -470,8 +512,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
470 return PTR_ERR(v9fid); 512 return PTR_ERR(v9fid);
471 513
472 retval = p9_client_remove(v9fid); 514 retval = p9_client_remove(v9fid);
473 if (!retval) 515 if (!retval) {
474 drop_nlink(file_inode); 516 /*
517 * directories on unlink should have zero
518 * link count
519 */
520 if (rmdir) {
521 clear_nlink(file_inode);
522 drop_nlink(dir);
523 } else
524 drop_nlink(file_inode);
525
526 v9fs_invalidate_inode_attr(file_inode);
527 v9fs_invalidate_inode_attr(dir);
528 }
475 return retval; 529 return retval;
476} 530}
477 531
@@ -531,7 +585,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
531 } 585 }
532 586
533 /* instantiate inode and assign the unopened fid to the dentry */ 587 /* instantiate inode and assign the unopened fid to the dentry */
534 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 588 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
535 if (IS_ERR(inode)) { 589 if (IS_ERR(inode)) {
536 err = PTR_ERR(inode); 590 err = PTR_ERR(inode);
537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +624,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
570 int err; 624 int err;
571 u32 perm; 625 u32 perm;
572 int flags; 626 int flags;
573 struct v9fs_session_info *v9ses;
574 struct p9_fid *fid;
575 struct file *filp; 627 struct file *filp;
628 struct v9fs_inode *v9inode;
629 struct v9fs_session_info *v9ses;
630 struct p9_fid *fid, *inode_fid;
576 631
577 err = 0; 632 err = 0;
578 fid = NULL; 633 fid = NULL;
@@ -592,8 +647,29 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
592 goto error; 647 goto error;
593 } 648 }
594 649
650 v9fs_invalidate_inode_attr(dir);
595 /* if we are opening a file, assign the open fid to the file */ 651 /* if we are opening a file, assign the open fid to the file */
596 if (nd && nd->flags & LOOKUP_OPEN) { 652 if (nd && nd->flags & LOOKUP_OPEN) {
653 v9inode = V9FS_I(dentry->d_inode);
654 mutex_lock(&v9inode->v_mutex);
655 if (v9ses->cache && !v9inode->writeback_fid &&
656 ((flags & O_ACCMODE) != O_RDONLY)) {
657 /*
658 * clone a fid and add it to writeback_fid
659 * we do it during open time instead of
660 * page dirty time via write_begin/page_mkwrite
661 * because we want write after unlink usecase
662 * to work.
663 */
664 inode_fid = v9fs_writeback_fid(dentry);
665 if (IS_ERR(inode_fid)) {
666 err = PTR_ERR(inode_fid);
667 mutex_unlock(&v9inode->v_mutex);
668 goto error;
669 }
670 v9inode->writeback_fid = (void *) inode_fid;
671 }
672 mutex_unlock(&v9inode->v_mutex);
597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 673 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
598 if (IS_ERR(filp)) { 674 if (IS_ERR(filp)) {
599 err = PTR_ERR(filp); 675 err = PTR_ERR(filp);
@@ -601,6 +677,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 } 677 }
602 678
603 filp->private_data = fid; 679 filp->private_data = fid;
680#ifdef CONFIG_9P_FSCACHE
681 if (v9ses->cache)
682 v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
683#endif
604 } else 684 } else
605 p9_client_clunk(fid); 685 p9_client_clunk(fid);
606 686
@@ -625,8 +705,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
625{ 705{
626 int err; 706 int err;
627 u32 perm; 707 u32 perm;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid; 708 struct p9_fid *fid;
709 struct v9fs_session_info *v9ses;
630 710
631 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 711 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
632 err = 0; 712 err = 0;
@@ -636,6 +716,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
636 if (IS_ERR(fid)) { 716 if (IS_ERR(fid)) {
637 err = PTR_ERR(fid); 717 err = PTR_ERR(fid);
638 fid = NULL; 718 fid = NULL;
719 } else {
720 inc_nlink(dir);
721 v9fs_invalidate_inode_attr(dir);
639 } 722 }
640 723
641 if (fid) 724 if (fid)
@@ -687,7 +770,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
687 return ERR_PTR(result); 770 return ERR_PTR(result);
688 } 771 }
689 772
690 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 773 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
691 if (IS_ERR(inode)) { 774 if (IS_ERR(inode)) {
692 result = PTR_ERR(inode); 775 result = PTR_ERR(inode);
693 inode = NULL; 776 inode = NULL;
@@ -747,17 +830,19 @@ int
747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 830v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
748 struct inode *new_dir, struct dentry *new_dentry) 831 struct inode *new_dir, struct dentry *new_dentry)
749{ 832{
833 int retval;
750 struct inode *old_inode; 834 struct inode *old_inode;
835 struct inode *new_inode;
751 struct v9fs_session_info *v9ses; 836 struct v9fs_session_info *v9ses;
752 struct p9_fid *oldfid; 837 struct p9_fid *oldfid;
753 struct p9_fid *olddirfid; 838 struct p9_fid *olddirfid;
754 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
755 struct p9_wstat wstat; 840 struct p9_wstat wstat;
756 int retval;
757 841
758 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
759 retval = 0; 843 retval = 0;
760 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
845 new_inode = new_dentry->d_inode;
761 v9ses = v9fs_inode2v9ses(old_inode); 846 v9ses = v9fs_inode2v9ses(old_inode);
762 oldfid = v9fs_fid_lookup(old_dentry); 847 oldfid = v9fs_fid_lookup(old_dentry);
763 if (IS_ERR(oldfid)) 848 if (IS_ERR(oldfid))
@@ -798,9 +883,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
798 retval = p9_client_wstat(oldfid, &wstat); 883 retval = p9_client_wstat(oldfid, &wstat);
799 884
800clunk_newdir: 885clunk_newdir:
801 if (!retval) 886 if (!retval) {
887 if (new_inode) {
888 if (S_ISDIR(new_inode->i_mode))
889 clear_nlink(new_inode);
890 else
891 drop_nlink(new_inode);
892 /*
893 * Work around vfs rename rehash bug with
894 * FS_RENAME_DOES_D_MOVE
895 */
896 v9fs_invalidate_inode_attr(new_inode);
897 }
898 if (S_ISDIR(old_inode->i_mode)) {
899 if (!new_inode)
900 inc_nlink(new_dir);
901 drop_nlink(old_dir);
902 }
903 v9fs_invalidate_inode_attr(old_inode);
904 v9fs_invalidate_inode_attr(old_dir);
905 v9fs_invalidate_inode_attr(new_dir);
906
802 /* successful rename */ 907 /* successful rename */
803 d_move(old_dentry, new_dentry); 908 d_move(old_dentry, new_dentry);
909 }
804 up_write(&v9ses->rename_sem); 910 up_write(&v9ses->rename_sem);
805 p9_client_clunk(newdirfid); 911 p9_client_clunk(newdirfid);
806 912
@@ -830,10 +936,11 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
830 936
831 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 937 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
832 err = -EPERM; 938 err = -EPERM;
833 v9ses = v9fs_inode2v9ses(dentry->d_inode); 939 v9ses = v9fs_dentry2v9ses(dentry);
834 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 940 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
835 return simple_getattr(mnt, dentry, stat); 941 generic_fillattr(dentry->d_inode, stat);
836 942 return 0;
943 }
837 fid = v9fs_fid_lookup(dentry); 944 fid = v9fs_fid_lookup(dentry);
838 if (IS_ERR(fid)) 945 if (IS_ERR(fid))
839 return PTR_ERR(fid); 946 return PTR_ERR(fid);
@@ -865,8 +972,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
865 struct p9_wstat wstat; 972 struct p9_wstat wstat;
866 973
867 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 974 P9_DPRINTK(P9_DEBUG_VFS, "\n");
975 retval = inode_change_ok(dentry->d_inode, iattr);
976 if (retval)
977 return retval;
978
868 retval = -EPERM; 979 retval = -EPERM;
869 v9ses = v9fs_inode2v9ses(dentry->d_inode); 980 v9ses = v9fs_dentry2v9ses(dentry);
870 fid = v9fs_fid_lookup(dentry); 981 fid = v9fs_fid_lookup(dentry);
871 if(IS_ERR(fid)) 982 if(IS_ERR(fid))
872 return PTR_ERR(fid); 983 return PTR_ERR(fid);
@@ -892,16 +1003,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
892 wstat.n_gid = iattr->ia_gid; 1003 wstat.n_gid = iattr->ia_gid;
893 } 1004 }
894 1005
1006 /* Write all dirty data */
1007 if (S_ISREG(dentry->d_inode->i_mode))
1008 filemap_write_and_wait(dentry->d_inode->i_mapping);
1009
895 retval = p9_client_wstat(fid, &wstat); 1010 retval = p9_client_wstat(fid, &wstat);
896 if (retval < 0) 1011 if (retval < 0)
897 return retval; 1012 return retval;
898 1013
899 if ((iattr->ia_valid & ATTR_SIZE) && 1014 if ((iattr->ia_valid & ATTR_SIZE) &&
900 iattr->ia_size != i_size_read(dentry->d_inode)) { 1015 iattr->ia_size != i_size_read(dentry->d_inode))
901 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 1016 truncate_setsize(dentry->d_inode, iattr->ia_size);
902 if (retval) 1017
903 return retval; 1018 v9fs_invalidate_inode_attr(dentry->d_inode);
904 }
905 1019
906 setattr_copy(dentry->d_inode, iattr); 1020 setattr_copy(dentry->d_inode, iattr);
907 mark_inode_dirty(dentry->d_inode); 1021 mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1038,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
924 char tag_name[14]; 1038 char tag_name[14];
925 unsigned int i_nlink; 1039 unsigned int i_nlink;
926 struct v9fs_session_info *v9ses = sb->s_fs_info; 1040 struct v9fs_session_info *v9ses = sb->s_fs_info;
1041 struct v9fs_inode *v9inode = V9FS_I(inode);
927 1042
928 inode->i_nlink = 1; 1043 inode->i_nlink = 1;
929 1044
@@ -983,6 +1098,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
983 1098
984 /* not real number of blocks, but 512 byte ones ... */ 1099 /* not real number of blocks, but 512 byte ones ... */
985 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 1100 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
1101 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
986} 1102}
987 1103
988/** 1104/**
@@ -1023,7 +1139,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1023 1139
1024 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); 1140 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
1025 retval = -EPERM; 1141 retval = -EPERM;
1026 v9ses = v9fs_inode2v9ses(dentry->d_inode); 1142 v9ses = v9fs_dentry2v9ses(dentry);
1027 fid = v9fs_fid_lookup(dentry); 1143 fid = v9fs_fid_lookup(dentry);
1028 if (IS_ERR(fid)) 1144 if (IS_ERR(fid))
1029 return PTR_ERR(fid); 1145 return PTR_ERR(fid);
@@ -1115,8 +1231,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1115 int mode, const char *extension) 1231 int mode, const char *extension)
1116{ 1232{
1117 u32 perm; 1233 u32 perm;
1118 struct v9fs_session_info *v9ses;
1119 struct p9_fid *fid; 1234 struct p9_fid *fid;
1235 struct v9fs_session_info *v9ses;
1120 1236
1121 v9ses = v9fs_inode2v9ses(dir); 1237 v9ses = v9fs_inode2v9ses(dir);
1122 if (!v9fs_proto_dotu(v9ses)) { 1238 if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1246,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1130 if (IS_ERR(fid)) 1246 if (IS_ERR(fid))
1131 return PTR_ERR(fid); 1247 return PTR_ERR(fid);
1132 1248
1249 v9fs_invalidate_inode_attr(dir);
1133 p9_client_clunk(fid); 1250 p9_client_clunk(fid);
1134 return 0; 1251 return 0;
1135} 1252}
@@ -1166,8 +1283,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1166 struct dentry *dentry) 1283 struct dentry *dentry)
1167{ 1284{
1168 int retval; 1285 int retval;
1169 struct p9_fid *oldfid;
1170 char *name; 1286 char *name;
1287 struct p9_fid *oldfid;
1171 1288
1172 P9_DPRINTK(P9_DEBUG_VFS, 1289 P9_DPRINTK(P9_DEBUG_VFS,
1173 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1290 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1303,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1186 sprintf(name, "%d\n", oldfid->fid); 1303 sprintf(name, "%d\n", oldfid->fid);
1187 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); 1304 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1188 __putname(name); 1305 __putname(name);
1189 1306 if (!retval) {
1307 v9fs_refresh_inode(oldfid, old_dentry->d_inode);
1308 v9fs_invalidate_inode_attr(dir);
1309 }
1190clunk_fid: 1310clunk_fid:
1191 p9_client_clunk(oldfid); 1311 p9_client_clunk(oldfid);
1192 return retval; 1312 return retval;
@@ -1237,6 +1357,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1237 return retval; 1357 return retval;
1238} 1358}
1239 1359
1360int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
1361{
1362 loff_t i_size;
1363 struct p9_wstat *st;
1364 struct v9fs_session_info *v9ses;
1365
1366 v9ses = v9fs_inode2v9ses(inode);
1367 st = p9_client_stat(fid);
1368 if (IS_ERR(st))
1369 return PTR_ERR(st);
1370
1371 spin_lock(&inode->i_lock);
1372 /*
1373 * We don't want to refresh inode->i_size,
1374 * because we may have cached data
1375 */
1376 i_size = inode->i_size;
1377 v9fs_stat2inode(st, inode, inode->i_sb);
1378 if (v9ses->cache)
1379 inode->i_size = i_size;
1380 spin_unlock(&inode->i_lock);
1381 p9stat_free(st);
1382 kfree(st);
1383 return 0;
1384}
1385
1240static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1386static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1241 .create = v9fs_vfs_create, 1387 .create = v9fs_vfs_create,
1242 .lookup = v9fs_vfs_lookup, 1388 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace4..ffbb113d5f33 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
86 return dentry; 86 return dentry;
87} 87}
88 88
89static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
90 struct p9_qid *qid,
91 struct p9_fid *fid,
92 struct p9_stat_dotl *st)
93{
94 int retval;
95 unsigned long i_ino;
96 struct inode *inode;
97 struct v9fs_session_info *v9ses = sb->s_fs_info;
98
99 i_ino = v9fs_qid2ino(qid);
100 inode = iget_locked(sb, i_ino);
101 if (!inode)
102 return ERR_PTR(-ENOMEM);
103 if (!(inode->i_state & I_NEW))
104 return inode;
105 /*
106 * initialize the inode with the stat info
107 * FIXME!! we may need support for stale inodes
108 * later.
109 */
110 retval = v9fs_init_inode(v9ses, inode, st->st_mode);
111 if (retval)
112 goto error;
113
114 v9fs_stat2inode_dotl(st, inode);
115#ifdef CONFIG_9P_FSCACHE
116 v9fs_fscache_set_key(inode, &st->qid);
117 v9fs_cache_inode_get_cookie(inode);
118#endif
119 retval = v9fs_get_acl(inode, fid);
120 if (retval)
121 goto error;
122
123 unlock_new_inode(inode);
124 return inode;
125error:
126 unlock_new_inode(inode);
127 iput(inode);
128 return ERR_PTR(retval);
129
130}
131
89struct inode * 132struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, 133v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb) 134 struct super_block *sb)
92{ 135{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st; 136 struct p9_stat_dotl *st;
137 struct inode *inode = NULL;
96 138
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 139 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st)) 140 if (IS_ERR(st))
99 return ERR_CAST(st); 141 return ERR_CAST(st);
100 142
101 ret = v9fs_get_inode(sb, st->st_mode); 143 inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st); 144 kfree(st);
119 return ret; 145 return inode;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123} 146}
124 147
125/** 148/**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd) 159 struct nameidata *nd)
137{ 160{
138 int err = 0; 161 int err = 0;
139 char *name = NULL;
140 gid_t gid; 162 gid_t gid;
141 int flags; 163 int flags;
142 mode_t mode; 164 mode_t mode;
143 struct v9fs_session_info *v9ses; 165 char *name = NULL;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp; 166 struct file *filp;
147 struct p9_qid qid; 167 struct p9_qid qid;
148 struct inode *inode; 168 struct inode *inode;
169 struct p9_fid *fid = NULL;
170 struct v9fs_inode *v9inode;
171 struct p9_fid *dfid, *ofid, *inode_fid;
172 struct v9fs_session_info *v9ses;
149 struct posix_acl *pacl = NULL, *dacl = NULL; 173 struct posix_acl *pacl = NULL, *dacl = NULL;
150 174
151 v9ses = v9fs_inode2v9ses(dir); 175 v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
196 err); 220 err);
197 goto error; 221 goto error;
198 } 222 }
223 v9fs_invalidate_inode_attr(dir);
199 224
200 /* instantiate inode and assign the unopened fid to the dentry */ 225 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1); 226 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
205 fid = NULL; 230 fid = NULL;
206 goto error; 231 goto error;
207 } 232 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 233 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) { 234 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode); 235 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 236 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,26 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
219 /* Now set the ACL based on the default value */ 244 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl); 245 v9fs_set_create_acl(dentry, dacl, pacl);
221 246
247 v9inode = V9FS_I(inode);
248 mutex_lock(&v9inode->v_mutex);
249 if (v9ses->cache && !v9inode->writeback_fid &&
250 ((flags & O_ACCMODE) != O_RDONLY)) {
251 /*
252 * clone a fid and add it to writeback_fid
253 * we do it during open time instead of
254 * page dirty time via write_begin/page_mkwrite
255 * because we want write after unlink usecase
256 * to work.
257 */
258 inode_fid = v9fs_writeback_fid(dentry);
259 if (IS_ERR(inode_fid)) {
260 err = PTR_ERR(inode_fid);
261 mutex_unlock(&v9inode->v_mutex);
262 goto error;
263 }
264 v9inode->writeback_fid = (void *) inode_fid;
265 }
266 mutex_unlock(&v9inode->v_mutex);
222 /* Since we are opening a file, assign the open fid to the file */ 267 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) { 269 if (IS_ERR(filp)) {
@@ -226,6 +271,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
226 return PTR_ERR(filp); 271 return PTR_ERR(filp);
227 } 272 }
228 filp->private_data = ofid; 273 filp->private_data = ofid;
274#ifdef CONFIG_9P_FSCACHE
275 if (v9ses->cache)
276 v9fs_cache_inode_set_cookie(inode, filp);
277#endif
229 return 0; 278 return 0;
230 279
231error: 280error:
@@ -300,7 +349,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
300 goto error; 349 goto error;
301 } 350 }
302 351
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 352 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) { 353 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode); 354 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 355 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +376,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
327 } 376 }
328 /* Now set the ACL based on the default value */ 377 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl); 378 v9fs_set_create_acl(dentry, dacl, pacl);
330 379 inc_nlink(dir);
380 v9fs_invalidate_inode_attr(dir);
331error: 381error:
332 if (fid) 382 if (fid)
333 p9_client_clunk(fid); 383 p9_client_clunk(fid);
@@ -345,10 +395,11 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
345 395
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 396 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM; 397 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode); 398 v9ses = v9fs_dentry2v9ses(dentry);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 399 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
350 return simple_getattr(mnt, dentry, stat); 400 generic_fillattr(dentry->d_inode, stat);
351 401 return 0;
402 }
352 fid = v9fs_fid_lookup(dentry); 403 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid)) 404 if (IS_ERR(fid))
354 return PTR_ERR(fid); 405 return PTR_ERR(fid);
@@ -401,22 +452,24 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; 452 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402 453
403 retval = -EPERM; 454 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode); 455 v9ses = v9fs_dentry2v9ses(dentry);
405 fid = v9fs_fid_lookup(dentry); 456 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid)) 457 if (IS_ERR(fid))
407 return PTR_ERR(fid); 458 return PTR_ERR(fid);
408 459
460 /* Write all dirty data */
461 if (S_ISREG(dentry->d_inode->i_mode))
462 filemap_write_and_wait(dentry->d_inode->i_mapping);
463
409 retval = p9_client_setattr(fid, &p9attr); 464 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0) 465 if (retval < 0)
411 return retval; 466 return retval;
412 467
413 if ((iattr->ia_valid & ATTR_SIZE) && 468 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) { 469 iattr->ia_size != i_size_read(dentry->d_inode))
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 470 truncate_setsize(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419 471
472 v9fs_invalidate_inode_attr(dentry->d_inode);
420 setattr_copy(dentry->d_inode, iattr); 473 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode); 474 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) { 475 if (iattr->ia_valid & ATTR_MODE) {
@@ -439,6 +492,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
439void 492void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) 493v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{ 494{
495 struct v9fs_inode *v9inode = V9FS_I(inode);
442 496
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { 497 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec; 498 inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +551,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION 551 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them. 552 * because the inode structure does not have fields for them.
499 */ 553 */
554 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
500} 555}
501 556
502static int 557static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, 558v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname) 559 const char *symname)
505{ 560{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err; 561 int err;
513 gid_t gid; 562 gid_t gid;
563 char *name;
564 struct p9_qid qid;
565 struct inode *inode;
566 struct p9_fid *dfid;
567 struct p9_fid *fid = NULL;
568 struct v9fs_session_info *v9ses;
514 569
515 name = (char *) dentry->d_name.name; 570 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 571 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +589,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
534 goto error; 589 goto error;
535 } 590 }
536 591
592 v9fs_invalidate_inode_attr(dir);
537 if (v9ses->cache) { 593 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */ 594 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1); 595 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +602,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
546 } 602 }
547 603
548 /* instantiate inode and assign the unopened fid to dentry */ 604 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 605 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) { 606 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode); 607 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 608 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +644,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry) 644 struct dentry *dentry)
589{ 645{
590 int err; 646 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name; 647 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry; 648 struct dentry *dir_dentry;
649 struct p9_fid *dfid, *oldfid;
650 struct v9fs_session_info *v9ses;
595 651
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 652 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name, 653 dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +672,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
616 return err; 672 return err;
617 } 673 }
618 674
675 v9fs_invalidate_inode_attr(dir);
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 676 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */ 677 /* Get the latest stat info from server. */
621 struct p9_fid *fid; 678 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry); 679 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid)) 680 if (IS_ERR(fid))
626 return PTR_ERR(fid); 681 return PTR_ERR(fid);
627 682
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 683 v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 } 684 }
685 ihold(old_dentry->d_inode);
642 d_instantiate(dentry, old_dentry->d_inode); 686 d_instantiate(dentry, old_dentry->d_inode);
643 687
644 return err; 688 return err;
@@ -657,12 +701,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev) 701 dev_t rdev)
658{ 702{
659 int err; 703 int err;
704 gid_t gid;
660 char *name; 705 char *name;
661 mode_t mode; 706 mode_t mode;
662 struct v9fs_session_info *v9ses; 707 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL; 708 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode; 709 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid; 710 struct p9_qid qid;
667 struct dentry *dir_dentry; 711 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL; 712 struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +743,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
699 if (err < 0) 743 if (err < 0)
700 goto error; 744 goto error;
701 745
746 v9fs_invalidate_inode_attr(dir);
702 /* instantiate inode and assign the unopened fid to the dentry */ 747 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 748 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1); 749 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +755,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
710 goto error; 755 goto error;
711 } 756 }
712 757
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 758 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) { 759 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode); 760 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 761 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -782,6 +827,31 @@ ndset:
782 return NULL; 827 return NULL;
783} 828}
784 829
830int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
831{
832 loff_t i_size;
833 struct p9_stat_dotl *st;
834 struct v9fs_session_info *v9ses;
835
836 v9ses = v9fs_inode2v9ses(inode);
837 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
838 if (IS_ERR(st))
839 return PTR_ERR(st);
840
841 spin_lock(&inode->i_lock);
842 /*
843 * We don't want to refresh inode->i_size,
844 * because we may have cached data
845 */
846 i_size = inode->i_size;
847 v9fs_stat2inode_dotl(st, inode);
848 if (v9ses->cache)
849 inode->i_size = i_size;
850 spin_unlock(&inode->i_lock);
851 kfree(st);
852 return 0;
853}
854
785const struct inode_operations v9fs_dir_inode_operations_dotl = { 855const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl, 856 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup, 857 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b8131..f3eed3383e4f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
86 } else 86 } else
87 sb->s_op = &v9fs_super_ops; 87 sb->s_op = &v9fs_super_ops;
88 sb->s_bdi = &v9ses->bdi; 88 sb->s_bdi = &v9ses->bdi;
89 if (v9ses->cache)
90 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
89 91
90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
91 MS_NOATIME; 93 if (!v9ses->cache)
94 sb->s_flags |= MS_SYNCHRONOUS;
92 95
93#ifdef CONFIG_9P_FS_POSIX_ACL 96#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) 97 if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
95 sb->s_flags |= MS_POSIXACL; 98 sb->s_flags |= MS_POSIXACL;
96#endif 99#endif
97 100
@@ -151,7 +154,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
151 retval = PTR_ERR(inode); 154 retval = PTR_ERR(inode);
152 goto release_sb; 155 goto release_sb;
153 } 156 }
154
155 root = d_alloc_root(inode); 157 root = d_alloc_root(inode);
156 if (!root) { 158 if (!root) {
157 iput(inode); 159 iput(inode);
@@ -166,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
166 retval = PTR_ERR(st); 168 retval = PTR_ERR(st);
167 goto release_sb; 169 goto release_sb;
168 } 170 }
169 171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
170 v9fs_stat2inode_dotl(st, root->d_inode); 172 v9fs_stat2inode_dotl(st, root->d_inode);
171 kfree(st); 173 kfree(st);
172 } else { 174 } else {
@@ -183,10 +185,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
183 p9stat_free(st); 185 p9stat_free(st);
184 kfree(st); 186 kfree(st);
185 } 187 }
188 v9fs_fid_add(root, fid);
186 retval = v9fs_get_acl(inode, fid); 189 retval = v9fs_get_acl(inode, fid);
187 if (retval) 190 if (retval)
188 goto release_sb; 191 goto release_sb;
189 v9fs_fid_add(root, fid); 192 /*
193 * Add the root fid to session info. This is used
194 * for file system sync. We want a cloned fid here
195 * so that we can do a sync_filesystem after a
196 * shrink_dcache_for_umount
197 */
198 v9ses->root_fid = v9fs_fid_clone(root);
199 if (IS_ERR(v9ses->root_fid)) {
200 retval = PTR_ERR(v9ses->root_fid);
201 goto release_sb;
202 }
190 203
191 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 204 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
192 return dget(sb->s_root); 205 return dget(sb->s_root);
@@ -197,15 +210,11 @@ close_session:
197 v9fs_session_close(v9ses); 210 v9fs_session_close(v9ses);
198 kfree(v9ses); 211 kfree(v9ses);
199 return ERR_PTR(retval); 212 return ERR_PTR(retval);
200
201release_sb: 213release_sb:
202 /* 214 /*
203 * we will do the session_close and root dentry release 215 * we will do the session_close and root dentry
204 * in the below call. But we need to clunk fid, because we haven't 216 * release in the below call.
205 * attached the fid to dentry so it won't get clunked
206 * automatically.
207 */ 217 */
208 p9_client_clunk(fid);
209 deactivate_locked_super(sb); 218 deactivate_locked_super(sb);
210 return ERR_PTR(retval); 219 return ERR_PTR(retval);
211} 220}
@@ -223,7 +232,7 @@ static void v9fs_kill_super(struct super_block *s)
223 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 232 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
224 233
225 kill_anon_super(s); 234 kill_anon_super(s);
226 235 p9_client_clunk(v9ses->root_fid);
227 v9fs_session_cancel(v9ses); 236 v9fs_session_cancel(v9ses);
228 v9fs_session_close(v9ses); 237 v9fs_session_close(v9ses);
229 kfree(v9ses); 238 kfree(v9ses);
@@ -253,7 +262,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
253 goto done; 262 goto done;
254 } 263 }
255 264
256 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
257 if (v9fs_proto_dotl(v9ses)) { 266 if (v9fs_proto_dotl(v9ses)) {
258 res = p9_client_statfs(fid, &rs); 267 res = p9_client_statfs(fid, &rs);
259 if (res == 0) { 268 if (res == 0) {
@@ -276,11 +285,31 @@ done:
276 return res; 285 return res;
277} 286}
278 287
288static int v9fs_sync_fs(struct super_block *sb, int wait)
289{
290 struct v9fs_session_info *v9ses = sb->s_fs_info;
291
292 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
293 return p9_client_sync_fs(v9ses->root_fid);
294}
295
296static int v9fs_drop_inode(struct inode *inode)
297{
298 struct v9fs_session_info *v9ses;
299 v9ses = v9fs_inode2v9ses(inode);
300 if (v9ses->cache)
301 return generic_drop_inode(inode);
302 /*
303 * in case of non cached mode always drop the
304 * the inode because we want the inode attribute
305 * to always match that on the server.
306 */
307 return 1;
308}
309
279static const struct super_operations v9fs_super_ops = { 310static const struct super_operations v9fs_super_ops = {
280#ifdef CONFIG_9P_FSCACHE
281 .alloc_inode = v9fs_alloc_inode, 311 .alloc_inode = v9fs_alloc_inode,
282 .destroy_inode = v9fs_destroy_inode, 312 .destroy_inode = v9fs_destroy_inode,
283#endif
284 .statfs = simple_statfs, 313 .statfs = simple_statfs,
285 .evict_inode = v9fs_evict_inode, 314 .evict_inode = v9fs_evict_inode,
286 .show_options = generic_show_options, 315 .show_options = generic_show_options,
@@ -288,11 +317,11 @@ static const struct super_operations v9fs_super_ops = {
288}; 317};
289 318
290static const struct super_operations v9fs_super_ops_dotl = { 319static const struct super_operations v9fs_super_ops_dotl = {
291#ifdef CONFIG_9P_FSCACHE
292 .alloc_inode = v9fs_alloc_inode, 320 .alloc_inode = v9fs_alloc_inode,
293 .destroy_inode = v9fs_destroy_inode, 321 .destroy_inode = v9fs_destroy_inode,
294#endif 322 .sync_fs = v9fs_sync_fs,
295 .statfs = v9fs_statfs, 323 .statfs = v9fs_statfs,
324 .drop_inode = v9fs_drop_inode,
296 .evict_inode = v9fs_evict_inode, 325 .evict_inode = v9fs_evict_inode,
297 .show_options = generic_show_options, 326 .show_options = generic_show_options,
298 .umount_begin = v9fs_umount_begin, 327 .umount_begin = v9fs_umount_begin,
@@ -303,5 +332,5 @@ struct file_system_type v9fs_fs_type = {
303 .mount = v9fs_mount, 332 .mount = v9fs_mount,
304 .kill_sb = v9fs_kill_super, 333 .kill_sb = v9fs_kill_super,
305 .owner = THIS_MODULE, 334 .owner = THIS_MODULE,
306 .fs_flags = FS_RENAME_DOES_D_MOVE, 335 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
307}; 336};
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..f3aa9b08b228 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
@@ -187,6 +187,7 @@ source "fs/omfs/Kconfig"
187source "fs/hpfs/Kconfig" 187source "fs/hpfs/Kconfig"
188source "fs/qnx4/Kconfig" 188source "fs/qnx4/Kconfig"
189source "fs/romfs/Kconfig" 189source "fs/romfs/Kconfig"
190source "fs/pstore/Kconfig"
190source "fs/sysv/Kconfig" 191source "fs/sysv/Kconfig"
191source "fs/ufs/Kconfig" 192source "fs/ufs/Kconfig"
192source "fs/exofs/Kconfig" 193source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..fb68c2b8cf8a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
@@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
121obj-$(CONFIG_GFS2_FS) += gfs2/ 123obj-$(CONFIG_GFS2_FS) += gfs2/
122obj-$(CONFIG_EXOFS_FS) += exofs/ 124obj-$(CONFIG_EXOFS_FS) += exofs/
123obj-$(CONFIG_CEPH_FS) += ceph/ 125obj-$(CONFIG_CEPH_FS) += ceph/
126obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf2..e55182a74605 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
5 help 4 help
6 The Acorn Disc Filing System is the standard file system of the 5 The Acorn Disc Filing System is the standard file system of the
7 RiscOS operating system which runs on Acorn's ARM-based Risc PC 6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 2ff622f6f547..718ac1f440c6 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -50,6 +50,7 @@ struct adfs_sb_info {
50 gid_t s_gid; /* owner gid */ 50 gid_t s_gid; /* owner gid */
51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ 51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
52 umode_t s_other_mask; /* ADFS other perm -> unix perm */ 52 umode_t s_other_mask; /* ADFS other perm -> unix perm */
53 int s_ftsuffix; /* ,xyz hex filetype suffix option */
53 54
54 __u32 s_ids_per_zone; /* max. no ids in one zone */ 55 __u32 s_ids_per_zone; /* max. no ids in one zone */
55 __u32 s_idlen; /* length of ID in map */ 56 __u32 s_idlen; /* length of ID in map */
@@ -79,6 +80,10 @@ struct adfs_dir {
79 80
80 int nr_buffers; 81 int nr_buffers;
81 struct buffer_head *bh[4]; 82 struct buffer_head *bh[4];
83
84 /* big directories need allocated buffers */
85 struct buffer_head **bh_fplus;
86
82 unsigned int pos; 87 unsigned int pos;
83 unsigned int parent_id; 88 unsigned int parent_id;
84 89
@@ -89,7 +94,7 @@ struct adfs_dir {
89/* 94/*
90 * This is the overall maximum name length 95 * This is the overall maximum name length
91 */ 96 */
92#define ADFS_MAX_NAME_LEN 256 97#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */
93struct object_info { 98struct object_info {
94 __u32 parent_id; /* parent object id */ 99 __u32 parent_id; /* parent object id */
95 __u32 file_id; /* object id */ 100 __u32 file_id; /* object id */
@@ -97,10 +102,26 @@ struct object_info {
97 __u32 execaddr; /* execution address */ 102 __u32 execaddr; /* execution address */
98 __u32 size; /* size */ 103 __u32 size; /* size */
99 __u8 attr; /* RISC OS attributes */ 104 __u8 attr; /* RISC OS attributes */
100 unsigned char name_len; /* name length */ 105 unsigned int name_len; /* name length */
101 char name[ADFS_MAX_NAME_LEN];/* file name */ 106 char name[ADFS_MAX_NAME_LEN];/* file name */
107
108 /* RISC OS file type (12-bit: derived from loadaddr) */
109 __u16 filetype;
102}; 110};
103 111
112/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
113static inline int append_filetype_suffix(char *buf, __u16 filetype)
114{
115 if (filetype == 0xffff) /* no explicit 12-bit file type was set */
116 return 0;
117
118 *buf++ = ',';
119 *buf++ = hex_asc_lo(filetype >> 8);
120 *buf++ = hex_asc_lo(filetype >> 4);
121 *buf++ = hex_asc_lo(filetype >> 0);
122 return 4;
123}
124
104struct adfs_dir_ops { 125struct adfs_dir_ops {
105 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); 126 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
106 int (*setpos)(struct adfs_dir *dir, unsigned int fpos); 127 int (*setpos)(struct adfs_dir *dir, unsigned int fpos);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed780..3d83075aaa2e 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
9 * 9 *
10 * Common directory handling for ADFS 10 * Common directory handling for ADFS
11 */ 11 */
12#include <linux/smp_lock.h>
13#include "adfs.h" 12#include "adfs.h"
14 13
15/* 14/*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
27 struct adfs_dir dir; 26 struct adfs_dir dir;
28 int ret = 0; 27 int ret = 0;
29 28
30 lock_kernel();
31
32 if (filp->f_pos >> 32) 29 if (filp->f_pos >> 32)
33 goto out; 30 goto out;
34 31
@@ -70,7 +67,6 @@ free_out:
70 ops->free(&dir); 67 ops->free(&dir);
71 68
72out: 69out:
73 unlock_kernel();
74 return ret; 70 return ret;
75} 71}
76 72
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 272 struct object_info obj;
277 int error; 273 int error;
278 274
279 lock_kernel();
280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 275 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
281 if (error == 0) { 276 if (error == 0) {
282 error = -EACCES; 277 error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
288 if (inode) 283 if (inode)
289 error = 0; 284 error = 0;
290 } 285 }
291 unlock_kernel();
292 d_add(dentry, inode); 286 d_add(dentry, inode);
293 return ERR_PTR(error); 287 return ERR_PTR(error);
294} 288}
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index bafc71222e25..4bbe853ee50a 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -52,7 +52,6 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
52 *buf++ = *ptr; 52 *buf++ = *ptr;
53 ptr++; 53 ptr++;
54 } 54 }
55 *buf = '\0';
56 55
57 return buf - old_buf; 56 return buf - old_buf;
58} 57}
@@ -208,7 +207,8 @@ release_buffers:
208 * convert a disk-based directory entry to a Linux ADFS directory entry 207 * convert a disk-based directory entry to a Linux ADFS directory entry
209 */ 208 */
210static inline void 209static inline void
211adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de) 210adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
211 struct adfs_direntry *de)
212{ 212{
213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); 213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
214 obj->file_id = adfs_readval(de->dirinddiscadd, 3); 214 obj->file_id = adfs_readval(de->dirinddiscadd, 3);
@@ -216,6 +216,23 @@ adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
216 obj->execaddr = adfs_readval(de->direxec, 4); 216 obj->execaddr = adfs_readval(de->direxec, 4);
217 obj->size = adfs_readval(de->dirlen, 4); 217 obj->size = adfs_readval(de->dirlen, 4);
218 obj->attr = de->newdiratts; 218 obj->attr = de->newdiratts;
219 obj->filetype = -1;
220
221 /*
222 * object is a file and is filetyped and timestamped?
223 * RISC OS 12-bit filetype is stored in load_address[19:8]
224 */
225 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
226 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
227 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
228
229 /* optionally append the ,xyz hex filetype suffix */
230 if (ADFS_SB(dir->sb)->s_ftsuffix)
231 obj->name_len +=
232 append_filetype_suffix(
233 &obj->name[obj->name_len],
234 obj->filetype);
235 }
219} 236}
220 237
221/* 238/*
@@ -260,7 +277,7 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
260 if (!de.dirobname[0]) 277 if (!de.dirobname[0])
261 return -ENOENT; 278 return -ENOENT;
262 279
263 adfs_dir2obj(obj, &de); 280 adfs_dir2obj(dir, obj, &de);
264 281
265 return 0; 282 return 0;
266} 283}
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1796bb352d05..d9e3bee4e653 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include "adfs.h" 12#include "adfs.h"
12#include "dir_fplus.h" 13#include "dir_fplus.h"
13 14
@@ -22,30 +23,53 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
22 23
23 dir->nr_buffers = 0; 24 dir->nr_buffers = 0;
24 25
26 /* start off using fixed bh set - only alloc for big dirs */
27 dir->bh_fplus = &dir->bh[0];
28
25 block = __adfs_block_map(sb, id, 0); 29 block = __adfs_block_map(sb, id, 0);
26 if (!block) { 30 if (!block) {
27 adfs_error(sb, "dir object %X has a hole at offset 0", id); 31 adfs_error(sb, "dir object %X has a hole at offset 0", id);
28 goto out; 32 goto out;
29 } 33 }
30 34
31 dir->bh[0] = sb_bread(sb, block); 35 dir->bh_fplus[0] = sb_bread(sb, block);
32 if (!dir->bh[0]) 36 if (!dir->bh_fplus[0])
33 goto out; 37 goto out;
34 dir->nr_buffers += 1; 38 dir->nr_buffers += 1;
35 39
36 h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 40 h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
37 size = le32_to_cpu(h->bigdirsize); 41 size = le32_to_cpu(h->bigdirsize);
38 if (size != sz) { 42 if (size != sz) {
39 printk(KERN_WARNING "adfs: adfs_fplus_read: directory header size\n" 43 printk(KERN_WARNING "adfs: adfs_fplus_read:"
40 " does not match directory size\n"); 44 " directory header size %X\n"
45 " does not match directory size %X\n",
46 size, sz);
41 } 47 }
42 48
43 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || 49 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
44 h->bigdirversion[2] != 0 || size & 2047 || 50 h->bigdirversion[2] != 0 || size & 2047 ||
45 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) 51 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
52 printk(KERN_WARNING "adfs: dir object %X has"
53 " malformed dir header\n", id);
46 goto out; 54 goto out;
55 }
47 56
48 size >>= sb->s_blocksize_bits; 57 size >>= sb->s_blocksize_bits;
58 if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
59 /* this directory is too big for fixed bh set, must allocate */
60 struct buffer_head **bh_fplus =
61 kzalloc(size * sizeof(struct buffer_head *),
62 GFP_KERNEL);
63 if (!bh_fplus) {
64 adfs_error(sb, "not enough memory for"
65 " dir object %X (%d blocks)", id, size);
66 goto out;
67 }
68 dir->bh_fplus = bh_fplus;
69 /* copy over the pointer to the block that we've already read */
70 dir->bh_fplus[0] = dir->bh[0];
71 }
72
49 for (blk = 1; blk < size; blk++) { 73 for (blk = 1; blk < size; blk++) {
50 block = __adfs_block_map(sb, id, blk); 74 block = __adfs_block_map(sb, id, blk);
51 if (!block) { 75 if (!block) {
@@ -53,25 +77,44 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
53 goto out; 77 goto out;
54 } 78 }
55 79
56 dir->bh[blk] = sb_bread(sb, block); 80 dir->bh_fplus[blk] = sb_bread(sb, block);
57 if (!dir->bh[blk]) 81 if (!dir->bh_fplus[blk]) {
82 adfs_error(sb, "dir object %X failed read for"
83 " offset %d, mapped block %X",
84 id, blk, block);
58 goto out; 85 goto out;
59 dir->nr_buffers = blk; 86 }
87
88 dir->nr_buffers += 1;
60 } 89 }
61 90
62 t = (struct adfs_bigdirtail *)(dir->bh[size - 1]->b_data + (sb->s_blocksize - 8)); 91 t = (struct adfs_bigdirtail *)
92 (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
63 93
64 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || 94 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
65 t->bigdirendmasseq != h->startmasseq || 95 t->bigdirendmasseq != h->startmasseq ||
66 t->reserved[0] != 0 || t->reserved[1] != 0) 96 t->reserved[0] != 0 || t->reserved[1] != 0) {
97 printk(KERN_WARNING "adfs: dir object %X has "
98 "malformed dir end\n", id);
67 goto out; 99 goto out;
100 }
68 101
69 dir->parent_id = le32_to_cpu(h->bigdirparent); 102 dir->parent_id = le32_to_cpu(h->bigdirparent);
70 dir->sb = sb; 103 dir->sb = sb;
71 return 0; 104 return 0;
105
72out: 106out:
73 for (i = 0; i < dir->nr_buffers; i++) 107 if (dir->bh_fplus) {
74 brelse(dir->bh[i]); 108 for (i = 0; i < dir->nr_buffers; i++)
109 brelse(dir->bh_fplus[i]);
110
111 if (&dir->bh[0] != dir->bh_fplus)
112 kfree(dir->bh_fplus);
113
114 dir->bh_fplus = NULL;
115 }
116
117 dir->nr_buffers = 0;
75 dir->sb = NULL; 118 dir->sb = NULL;
76 return ret; 119 return ret;
77} 120}
@@ -79,7 +122,8 @@ out:
79static int 122static int
80adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) 123adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
81{ 124{
82 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 125 struct adfs_bigdirheader *h =
126 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
83 int ret = -ENOENT; 127 int ret = -ENOENT;
84 128
85 if (fpos <= le32_to_cpu(h->bigdirentries)) { 129 if (fpos <= le32_to_cpu(h->bigdirentries)) {
@@ -102,21 +146,27 @@ dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
102 partial = sb->s_blocksize - offset; 146 partial = sb->s_blocksize - offset;
103 147
104 if (partial >= len) 148 if (partial >= len)
105 memcpy(to, dir->bh[buffer]->b_data + offset, len); 149 memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
106 else { 150 else {
107 char *c = (char *)to; 151 char *c = (char *)to;
108 152
109 remainder = len - partial; 153 remainder = len - partial;
110 154
111 memcpy(c, dir->bh[buffer]->b_data + offset, partial); 155 memcpy(c,
112 memcpy(c + partial, dir->bh[buffer + 1]->b_data, remainder); 156 dir->bh_fplus[buffer]->b_data + offset,
157 partial);
158
159 memcpy(c + partial,
160 dir->bh_fplus[buffer + 1]->b_data,
161 remainder);
113 } 162 }
114} 163}
115 164
116static int 165static int
117adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) 166adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
118{ 167{
119 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 168 struct adfs_bigdirheader *h =
169 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
120 struct adfs_bigdirentry bde; 170 struct adfs_bigdirentry bde;
121 unsigned int offset; 171 unsigned int offset;
122 int i, ret = -ENOENT; 172 int i, ret = -ENOENT;
@@ -147,6 +197,24 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
147 if (obj->name[i] == '/') 197 if (obj->name[i] == '/')
148 obj->name[i] = '.'; 198 obj->name[i] = '.';
149 199
200 obj->filetype = -1;
201
202 /*
203 * object is a file and is filetyped and timestamped?
204 * RISC OS 12-bit filetype is stored in load_address[19:8]
205 */
206 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
207 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
208 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
209
210 /* optionally append the ,xyz hex filetype suffix */
211 if (ADFS_SB(dir->sb)->s_ftsuffix)
212 obj->name_len +=
213 append_filetype_suffix(
214 &obj->name[obj->name_len],
215 obj->filetype);
216 }
217
150 dir->pos += 1; 218 dir->pos += 1;
151 ret = 0; 219 ret = 0;
152out: 220out:
@@ -160,7 +228,7 @@ adfs_fplus_sync(struct adfs_dir *dir)
160 int i; 228 int i;
161 229
162 for (i = dir->nr_buffers - 1; i >= 0; i--) { 230 for (i = dir->nr_buffers - 1; i >= 0; i--) {
163 struct buffer_head *bh = dir->bh[i]; 231 struct buffer_head *bh = dir->bh_fplus[i];
164 sync_dirty_buffer(bh); 232 sync_dirty_buffer(bh);
165 if (buffer_req(bh) && !buffer_uptodate(bh)) 233 if (buffer_req(bh) && !buffer_uptodate(bh))
166 err = -EIO; 234 err = -EIO;
@@ -174,8 +242,17 @@ adfs_fplus_free(struct adfs_dir *dir)
174{ 242{
175 int i; 243 int i;
176 244
177 for (i = 0; i < dir->nr_buffers; i++) 245 if (dir->bh_fplus) {
178 brelse(dir->bh[i]); 246 for (i = 0; i < dir->nr_buffers; i++)
247 brelse(dir->bh_fplus[i]);
248
249 if (&dir->bh[0] != dir->bh_fplus)
250 kfree(dir->bh_fplus);
251
252 dir->bh_fplus = NULL;
253 }
254
255 dir->nr_buffers = 0;
179 dir->sb = NULL; 256 dir->sb = NULL;
180} 257}
181 258
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..d5250c5aae21 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
12#include <linux/writeback.h> 11#include <linux/writeback.h>
13#include "adfs.h" 12#include "adfs.h"
@@ -73,32 +72,18 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
73static const struct address_space_operations adfs_aops = { 72static const struct address_space_operations adfs_aops = {
74 .readpage = adfs_readpage, 73 .readpage = adfs_readpage,
75 .writepage = adfs_writepage, 74 .writepage = adfs_writepage,
76 .sync_page = block_sync_page,
77 .write_begin = adfs_write_begin, 75 .write_begin = adfs_write_begin,
78 .write_end = generic_write_end, 76 .write_end = generic_write_end,
79 .bmap = _adfs_bmap 77 .bmap = _adfs_bmap
80}; 78};
81 79
82static inline unsigned int
83adfs_filetype(struct inode *inode)
84{
85 unsigned int type;
86
87 if (ADFS_I(inode)->stamped)
88 type = (ADFS_I(inode)->loadaddr >> 8) & 0xfff;
89 else
90 type = (unsigned int) -1;
91
92 return type;
93}
94
95/* 80/*
96 * Convert ADFS attributes and filetype to Linux permission. 81 * Convert ADFS attributes and filetype to Linux permission.
97 */ 82 */
98static umode_t 83static umode_t
99adfs_atts2mode(struct super_block *sb, struct inode *inode) 84adfs_atts2mode(struct super_block *sb, struct inode *inode)
100{ 85{
101 unsigned int filetype, attr = ADFS_I(inode)->attr; 86 unsigned int attr = ADFS_I(inode)->attr;
102 umode_t mode, rmask; 87 umode_t mode, rmask;
103 struct adfs_sb_info *asb = ADFS_SB(sb); 88 struct adfs_sb_info *asb = ADFS_SB(sb);
104 89
@@ -107,9 +92,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
107 return S_IFDIR | S_IXUGO | mode; 92 return S_IFDIR | S_IXUGO | mode;
108 } 93 }
109 94
110 filetype = adfs_filetype(inode); 95 switch (ADFS_I(inode)->filetype) {
111
112 switch (filetype) {
113 case 0xfc0: /* LinkFS */ 96 case 0xfc0: /* LinkFS */
114 return S_IFLNK|S_IRWXUGO; 97 return S_IFLNK|S_IRWXUGO;
115 98
@@ -175,50 +158,48 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
175 158
176/* 159/*
177 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time 160 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time
178 * referenced to 1 Jan 1900 (til 2248) 161 * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
162 * of time to convert from RISC OS epoch to Unix epoch.
179 */ 163 */
180static void 164static void
181adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) 165adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
182{ 166{
183 unsigned int high, low; 167 unsigned int high, low;
168 /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
169 * 01 Jan 1900 00:00:00 (RISC OS epoch)
170 */
171 static const s64 nsec_unix_epoch_diff_risc_os_epoch =
172 2208988800000000000LL;
173 s64 nsec;
184 174
185 if (ADFS_I(inode)->stamped == 0) 175 if (ADFS_I(inode)->stamped == 0)
186 goto cur_time; 176 goto cur_time;
187 177
188 high = ADFS_I(inode)->loadaddr << 24; 178 high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
189 low = ADFS_I(inode)->execaddr; 179 low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */
190 180
191 high |= low >> 8; 181 /* convert 40-bit centi-seconds to 32-bit seconds
192 low &= 255; 182 * going via nanoseconds to retain precision
183 */
184 nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
193 185
194 /* Files dated pre 01 Jan 1970 00:00:00. */ 186 /* Files dated pre 01 Jan 1970 00:00:00. */
195 if (high < 0x336e996a) 187 if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
196 goto too_early; 188 goto too_early;
197 189
198 /* Files dated post 18 Jan 2038 03:14:05. */ 190 /* convert from RISC OS to Unix epoch */
199 if (high >= 0x656e9969) 191 nsec -= nsec_unix_epoch_diff_risc_os_epoch;
200 goto too_late;
201
202 /* discard 2208988800 (0x336e996a00) seconds of time */
203 high -= 0x336e996a;
204 192
205 /* convert 40-bit centi-seconds to 32-bit seconds */ 193 *tv = ns_to_timespec(nsec);
206 tv->tv_sec = (((high % 100) << 8) + low) / 100 + (high / 100 << 8);
207 tv->tv_nsec = 0;
208 return; 194 return;
209 195
210 cur_time: 196 cur_time:
211 *tv = CURRENT_TIME_SEC; 197 *tv = CURRENT_TIME;
212 return; 198 return;
213 199
214 too_early: 200 too_early:
215 tv->tv_sec = tv->tv_nsec = 0; 201 tv->tv_sec = tv->tv_nsec = 0;
216 return; 202 return;
217
218 too_late:
219 tv->tv_sec = 0x7ffffffd;
220 tv->tv_nsec = 0;
221 return;
222} 203}
223 204
224/* 205/*
@@ -280,7 +261,8 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
280 ADFS_I(inode)->loadaddr = obj->loadaddr; 261 ADFS_I(inode)->loadaddr = obj->loadaddr;
281 ADFS_I(inode)->execaddr = obj->execaddr; 262 ADFS_I(inode)->execaddr = obj->execaddr;
282 ADFS_I(inode)->attr = obj->attr; 263 ADFS_I(inode)->attr = obj->attr;
283 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); 264 ADFS_I(inode)->filetype = obj->filetype;
265 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
284 266
285 inode->i_mode = adfs_atts2mode(sb, inode); 267 inode->i_mode = adfs_atts2mode(sb, inode);
286 adfs_adfs2unix_time(&inode->i_mtime, inode); 268 adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -316,8 +298,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
316 unsigned int ia_valid = attr->ia_valid; 298 unsigned int ia_valid = attr->ia_valid;
317 int error; 299 int error;
318 300
319 lock_kernel();
320
321 error = inode_change_ok(inode, attr); 301 error = inode_change_ok(inode, attr);
322 302
323 /* 303 /*
@@ -359,7 +339,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
359 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) 339 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
360 mark_inode_dirty(inode); 340 mark_inode_dirty(inode);
361out: 341out:
362 unlock_kernel();
363 return error; 342 return error;
364} 343}
365 344
@@ -374,7 +353,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
374 struct object_info obj; 353 struct object_info obj;
375 int ret; 354 int ret;
376 355
377 lock_kernel();
378 obj.file_id = inode->i_ino; 356 obj.file_id = inode->i_ino;
379 obj.name_len = 0; 357 obj.name_len = 0;
380 obj.parent_id = ADFS_I(inode)->parent_id; 358 obj.parent_id = ADFS_I(inode)->parent_id;
@@ -384,6 +362,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
384 obj.size = inode->i_size; 362 obj.size = inode->i_size;
385 363
386 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); 364 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
387 unlock_kernel();
388 return ret; 365 return ret;
389} 366}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fbe..c8bf36a1996a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/statfs.h> 17#include <linux/statfs.h>
19#include "adfs.h" 18#include "adfs.h"
20#include "dir_f.h" 19#include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
120 int i; 119 int i;
121 struct adfs_sb_info *asb = ADFS_SB(sb); 120 struct adfs_sb_info *asb = ADFS_SB(sb);
122 121
123 lock_kernel();
124
125 for (i = 0; i < asb->s_map_size; i++) 122 for (i = 0; i < asb->s_map_size; i++)
126 brelse(asb->s_map[i].dm_bh); 123 brelse(asb->s_map[i].dm_bh);
127 kfree(asb->s_map); 124 kfree(asb->s_map);
128 kfree(asb); 125 kfree(asb);
129 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
130
131 unlock_kernel();
132} 127}
133 128
134static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 129static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -143,17 +138,20 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
143 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); 138 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
144 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) 139 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
145 seq_printf(seq, ",othmask=%o", asb->s_other_mask); 140 seq_printf(seq, ",othmask=%o", asb->s_other_mask);
141 if (asb->s_ftsuffix != 0)
142 seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
146 143
147 return 0; 144 return 0;
148} 145}
149 146
150enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; 147enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
151 148
152static const match_table_t tokens = { 149static const match_table_t tokens = {
153 {Opt_uid, "uid=%u"}, 150 {Opt_uid, "uid=%u"},
154 {Opt_gid, "gid=%u"}, 151 {Opt_gid, "gid=%u"},
155 {Opt_ownmask, "ownmask=%o"}, 152 {Opt_ownmask, "ownmask=%o"},
156 {Opt_othmask, "othmask=%o"}, 153 {Opt_othmask, "othmask=%o"},
154 {Opt_ftsuffix, "ftsuffix=%u"},
157 {Opt_err, NULL} 155 {Opt_err, NULL}
158}; 156};
159 157
@@ -194,6 +192,11 @@ static int parse_options(struct super_block *sb, char *options)
194 return -EINVAL; 192 return -EINVAL;
195 asb->s_other_mask = option; 193 asb->s_other_mask = option;
196 break; 194 break;
195 case Opt_ftsuffix:
196 if (match_int(args, &option))
197 return -EINVAL;
198 asb->s_ftsuffix = option;
199 break;
197 default: 200 default:
198 printk("ADFS-fs: unrecognised mount option \"%s\" " 201 printk("ADFS-fs: unrecognised mount option \"%s\" "
199 "or missing value\n", p); 202 "or missing value\n", p);
@@ -359,15 +362,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
359 struct adfs_sb_info *asb; 362 struct adfs_sb_info *asb;
360 struct inode *root; 363 struct inode *root;
361 364
362 lock_kernel();
363
364 sb->s_flags |= MS_NODIRATIME; 365 sb->s_flags |= MS_NODIRATIME;
365 366
366 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 367 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
367 if (!asb) { 368 if (!asb)
368 unlock_kernel();
369 return -ENOMEM; 369 return -ENOMEM;
370 }
371 sb->s_fs_info = asb; 370 sb->s_fs_info = asb;
372 371
373 /* set default options */ 372 /* set default options */
@@ -375,6 +374,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
375 asb->s_gid = 0; 374 asb->s_gid = 0;
376 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; 375 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
377 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; 376 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
377 asb->s_ftsuffix = 0;
378 378
379 if (parse_options(sb, data)) 379 if (parse_options(sb, data))
380 goto error; 380 goto error;
@@ -454,11 +454,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
454 454
455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); 455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
456 root_obj.name_len = 0; 456 root_obj.name_len = 0;
457 root_obj.loadaddr = 0; 457 /* Set root object date as 01 Jan 1987 00:00:00 */
458 root_obj.execaddr = 0; 458 root_obj.loadaddr = 0xfff0003f;
459 root_obj.execaddr = 0xec22c000;
459 root_obj.size = ADFS_NEWDIR_SIZE; 460 root_obj.size = ADFS_NEWDIR_SIZE;
460 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | 461 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ |
461 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; 462 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
463 root_obj.filetype = -1;
462 464
463 /* 465 /*
464 * If this is a F+ disk with variable length directories, 466 * If this is a F+ disk with variable length directories,
@@ -472,6 +474,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 asb->s_dir = &adfs_f_dir_ops; 474 asb->s_dir = &adfs_f_dir_ops;
473 asb->s_namelen = ADFS_F_NAME_LEN; 475 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 476 }
477 /*
478 * ,xyz hex filetype suffix may be added by driver
479 * to files that have valid RISC OS filetype
480 */
481 if (asb->s_ftsuffix)
482 asb->s_namelen += 4;
475 483
476 sb->s_d_op = &adfs_dentry_operations; 484 sb->s_d_op = &adfs_dentry_operations;
477 root = adfs_iget(sb, &root_obj); 485 root = adfs_iget(sb, &root_obj);
@@ -485,7 +493,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
485 adfs_error(sb, "get root inode failed\n"); 493 adfs_error(sb, "get root inode failed\n");
486 goto error; 494 goto error;
487 } 495 }
488 unlock_kernel();
489 return 0; 496 return 0;
490 497
491error_free_bh: 498error_free_bh:
@@ -493,7 +500,6 @@ error_free_bh:
493error: 500error:
494 sb->s_fs_info = NULL; 501 sb->s_fs_info = NULL;
495 kfree(asb); 502 kfree(asb);
496 unlock_kernel();
497 return -EINVAL; 503 return -EINVAL;
498} 504}
499 505
diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index b2c4f54446f3..3988b4a78339 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux affs filesystem routines. 2# Makefile for the Linux affs filesystem routines.
3# 3#
4 4
5#EXTRA_CFLAGS=-DDEBUG=1 5#ccflags-y := -DDEBUG=1
6 6
7obj-$(CONFIG_AFFS_FS) += affs.o 7obj-$(CONFIG_AFFS_FS) += affs.o
8 8
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
429const struct address_space_operations affs_aops = { 429const struct address_space_operations affs_aops = {
430 .readpage = affs_readpage, 430 .readpage = affs_readpage,
431 .writepage = affs_writepage, 431 .writepage = affs_writepage,
432 .sync_page = block_sync_page,
433 .write_begin = affs_write_begin, 432 .write_begin = affs_write_begin,
434 .write_end = generic_write_end, 433 .write_end = generic_write_end,
435 .bmap = _affs_bmap 434 .bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
786const struct address_space_operations affs_aops_ofs = { 785const struct address_space_operations affs_aops_ofs = {
787 .readpage = affs_readpage_ofs, 786 .readpage = affs_readpage_ofs,
788 //.writepage = affs_writepage_ofs, 787 //.writepage = affs_writepage_ofs,
789 //.sync_page = affs_sync_page_ofs,
790 .write_begin = affs_write_begin_ofs, 788 .write_begin = affs_write_begin_ofs,
791 .write_end = affs_write_end_ofs 789 .write_end = affs_write_end_ofs
792}; 790};
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
140 candidate->first = candidate->last = index; 140 candidate->first = candidate->last = index;
141 candidate->offset_first = from; 141 candidate->offset_first = from;
142 candidate->to_last = to; 142 candidate->to_last = to;
143 INIT_LIST_HEAD(&candidate->link);
143 candidate->usage = 1; 144 candidate->usage = 1;
144 candidate->state = AFS_WBACK_PENDING; 145 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 146 init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..e29ec485af25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
39#include <linux/compat.h> 37#include <linux/compat.h>
40 38
41#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
65static DEFINE_SPINLOCK(fput_lock); 63static DEFINE_SPINLOCK(fput_lock);
66static LIST_HEAD(fput_head); 64static LIST_HEAD(fput_head);
67 65
68#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
69#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
70struct aio_batch_entry {
71 struct hlist_node list;
72 struct address_space *mapping;
73};
74mempool_t *abe_pool;
75
76static void aio_kick_handler(struct work_struct *); 66static void aio_kick_handler(struct work_struct *);
77static void aio_queue_work(struct kioctx *); 67static void aio_queue_work(struct kioctx *);
78 68
@@ -85,9 +75,8 @@ static int __init aio_setup(void)
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 75 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 76 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 77
88 aio_wq = create_workqueue("aio"); 78 aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 79 BUG_ON(!aio_wq);
90 BUG_ON(!aio_wq || !abe_pool);
91 80
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 81 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 82
@@ -239,15 +228,23 @@ static void __put_ioctx(struct kioctx *ctx)
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 228 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240} 229}
241 230
242#define get_ioctx(kioctx) do { \ 231static inline void get_ioctx(struct kioctx *kioctx)
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 232{
244 atomic_inc(&(kioctx)->users); \ 233 BUG_ON(atomic_read(&kioctx->users) <= 0);
245} while (0) 234 atomic_inc(&kioctx->users);
246#define put_ioctx(kioctx) do { \ 235}
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 236
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 237static inline int try_get_ioctx(struct kioctx *kioctx)
249 __put_ioctx(kioctx); \ 238{
250} while (0) 239 return atomic_inc_not_zero(&kioctx->users);
240}
241
242static inline void put_ioctx(struct kioctx *kioctx)
243{
244 BUG_ON(atomic_read(&kioctx->users) <= 0);
245 if (unlikely(atomic_dec_and_test(&kioctx->users)))
246 __put_ioctx(kioctx);
247}
251 248
252/* ioctx_alloc 249/* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 250 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
@@ -512,7 +509,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
512 ctx->reqs_active--; 509 ctx->reqs_active--;
513 510
514 if (unlikely(!ctx->reqs_active && ctx->dead)) 511 if (unlikely(!ctx->reqs_active && ctx->dead))
515 wake_up(&ctx->wait); 512 wake_up_all(&ctx->wait);
516} 513}
517 514
518static void aio_fput_routine(struct work_struct *data) 515static void aio_fput_routine(struct work_struct *data)
@@ -569,7 +566,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
569 spin_lock(&fput_lock); 566 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 567 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 568 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 569 schedule_work(&fput_work);
573 } else { 570 } else {
574 req->ki_filp = NULL; 571 req->ki_filp = NULL;
575 really_put_req(ctx, req); 572 really_put_req(ctx, req);
@@ -601,8 +598,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
601 rcu_read_lock(); 598 rcu_read_lock();
602 599
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 600 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 601 /*
605 get_ioctx(ctx); 602 * RCU protects us against accessing freed memory but
603 * we have to be careful not to get a reference when the
604 * reference count already dropped to 0 (ctx->dead test
605 * is unreliable because of races).
606 */
607 if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
606 ret = ctx; 608 ret = ctx;
607 break; 609 break;
608 } 610 }
@@ -1216,7 +1218,7 @@ static void io_destroy(struct kioctx *ioctx)
1216 * by other CPUs at this point. Right now, we rely on the 1218 * by other CPUs at this point. Right now, we rely on the
1217 * locking done by the above calls to ensure this consistency. 1219 * locking done by the above calls to ensure this consistency.
1218 */ 1220 */
1219 wake_up(&ioctx->wait); 1221 wake_up_all(&ioctx->wait);
1220 put_ioctx(ioctx); /* once for the lookup */ 1222 put_ioctx(ioctx); /* once for the lookup */
1221} 1223}
1222 1224
@@ -1512,57 +1514,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1512 return 0; 1514 return 0;
1513} 1515}
1514 1516
1515static void aio_batch_add(struct address_space *mapping,
1516 struct hlist_head *batch_hash)
1517{
1518 struct aio_batch_entry *abe;
1519 struct hlist_node *pos;
1520 unsigned bucket;
1521
1522 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1523 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1524 if (abe->mapping == mapping)
1525 return;
1526 }
1527
1528 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1529
1530 /*
1531 * we should be using igrab here, but
1532 * we don't want to hammer on the global
1533 * inode spinlock just to take an extra
1534 * reference on a file that we must already
1535 * have a reference to.
1536 *
1537 * When we're called, we always have a reference
1538 * on the file, so we must always have a reference
1539 * on the inode, so ihold() is safe here.
1540 */
1541 ihold(mapping->host);
1542 abe->mapping = mapping;
1543 hlist_add_head(&abe->list, &batch_hash[bucket]);
1544 return;
1545}
1546
1547static void aio_batch_free(struct hlist_head *batch_hash)
1548{
1549 struct aio_batch_entry *abe;
1550 struct hlist_node *pos, *n;
1551 int i;
1552
1553 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1554 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1555 blk_run_address_space(abe->mapping);
1556 iput(abe->mapping->host);
1557 hlist_del(&abe->list);
1558 mempool_free(abe, abe_pool);
1559 }
1560 }
1561}
1562
1563static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1564 struct iocb *iocb, struct hlist_head *batch_hash, 1518 struct iocb *iocb, bool compat)
1565 bool compat)
1566{ 1519{
1567 struct kiocb *req; 1520 struct kiocb *req;
1568 struct file *file; 1521 struct file *file;
@@ -1629,6 +1582,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1629 goto out_put_req; 1582 goto out_put_req;
1630 1583
1631 spin_lock_irq(&ctx->ctx_lock); 1584 spin_lock_irq(&ctx->ctx_lock);
1585 /*
1586 * We could have raced with io_destroy() and are currently holding a
1587 * reference to ctx which should be destroyed. We cannot submit IO
1588 * since ctx gets freed as soon as io_submit() puts its reference. The
1589 * check here is reliable: io_destroy() sets ctx->dead before waiting
1590 * for outstanding IO and the barrier between these two is realized by
1591 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
1592 * increment ctx->reqs_active before checking for ctx->dead and the
1593 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
1594 * don't see ctx->dead set here, io_destroy() waits for our IO to
1595 * finish.
1596 */
1597 if (ctx->dead) {
1598 spin_unlock_irq(&ctx->ctx_lock);
1599 ret = -EINVAL;
1600 goto out_put_req;
1601 }
1632 aio_run_iocb(req); 1602 aio_run_iocb(req);
1633 if (!list_empty(&ctx->run_list)) { 1603 if (!list_empty(&ctx->run_list)) {
1634 /* drain the run list */ 1604 /* drain the run list */
@@ -1636,11 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1636 ; 1606 ;
1637 } 1607 }
1638 spin_unlock_irq(&ctx->ctx_lock); 1608 spin_unlock_irq(&ctx->ctx_lock);
1639 if (req->ki_opcode == IOCB_CMD_PREAD ||
1640 req->ki_opcode == IOCB_CMD_PREADV ||
1641 req->ki_opcode == IOCB_CMD_PWRITE ||
1642 req->ki_opcode == IOCB_CMD_PWRITEV)
1643 aio_batch_add(file->f_mapping, batch_hash);
1644 1609
1645 aio_put_req(req); /* drop extra ref to req */ 1610 aio_put_req(req); /* drop extra ref to req */
1646 return 0; 1611 return 0;
@@ -1657,7 +1622,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1657 struct kioctx *ctx; 1622 struct kioctx *ctx;
1658 long ret = 0; 1623 long ret = 0;
1659 int i; 1624 int i;
1660 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1625 struct blk_plug plug;
1661 1626
1662 if (unlikely(nr < 0)) 1627 if (unlikely(nr < 0))
1663 return -EINVAL; 1628 return -EINVAL;
@@ -1674,6 +1639,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1674 return -EINVAL; 1639 return -EINVAL;
1675 } 1640 }
1676 1641
1642 blk_start_plug(&plug);
1643
1677 /* 1644 /*
1678 * AKPM: should this return a partial result if some of the IOs were 1645 * AKPM: should this return a partial result if some of the IOs were
1679 * successfully submitted? 1646 * successfully submitted?
@@ -1692,11 +1659,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1692 break; 1659 break;
1693 } 1660 }
1694 1661
1695 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); 1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat);
1696 if (ret) 1663 if (ret)
1697 break; 1664 break;
1698 } 1665 }
1699 aio_batch_free(batch_hash); 1666 blk_finish_plug(&plug);
1700 1667
1701 put_ioctx(ctx); 1668 put_ioctx(ctx);
1702 return i ? i : ret; 1669 return i ? i : ret;
diff --git a/fs/attr.c b/fs/attr.c
index 7ca41811afa1..1007ed616314 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
59 59
60 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
61 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
62 if (!is_owner_or_cap(inode)) 62 if (!inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
69 69
70 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
72 if (!is_owner_or_cap(inode)) 72 if (!inode_owner_or_capable(inode))
73 return -EPERM; 73 return -EPERM;
74 } 74 }
75 75
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
61 current->pid, __func__, ##args); \ 61 current->pid, __func__, ##args); \
62} while (0) 62} while (0)
63 63
64extern spinlock_t autofs4_lock;
65
66/* Unified info structure. This is pointed to by both the dentry and 64/* Unified info structure. This is pointed to by both the dentry and
67 inode structures. Each file in the filesystem has an instance of this 65 inode structures. Each file in the filesystem has an instance of this
68 structure. It holds a reference to the dentry, so dentries are never 66 structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
372 return -EBUSY; 372 return -EBUSY;
373 } else { 373 } else {
374 struct file *pipe = fget(pipefd); 374 struct file *pipe = fget(pipefd);
375 if (!pipe) {
376 err = -EBADF;
377 goto out;
378 }
375 if (!pipe->f_op || !pipe->f_op->write) { 379 if (!pipe->f_op || !pipe->f_op->write) {
376 err = -EPIPE; 380 err = -EPIPE;
377 fput(pipe); 381 fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
87} 87}
88 88
89/* 89/*
90 * Calculate and dget next entry in the subdirs list under root.
91 */
92static struct dentry *get_next_positive_subdir(struct dentry *prev,
93 struct dentry *root)
94{
95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
96 struct list_head *next;
97 struct dentry *p, *q;
98
99 spin_lock(&sbi->lookup_lock);
100
101 if (prev == NULL) {
102 spin_lock(&root->d_lock);
103 prev = dget_dlock(root);
104 next = prev->d_subdirs.next;
105 p = prev;
106 goto start;
107 }
108
109 p = prev;
110 spin_lock(&p->d_lock);
111again:
112 next = p->d_u.d_child.next;
113start:
114 if (next == &root->d_subdirs) {
115 spin_unlock(&p->d_lock);
116 spin_unlock(&sbi->lookup_lock);
117 dput(prev);
118 return NULL;
119 }
120
121 q = list_entry(next, struct dentry, d_u.d_child);
122
123 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
124 /* Negative dentry - try next */
125 if (!simple_positive(q)) {
126 spin_unlock(&p->d_lock);
127 p = q;
128 goto again;
129 }
130 dget_dlock(q);
131 spin_unlock(&q->d_lock);
132 spin_unlock(&p->d_lock);
133 spin_unlock(&sbi->lookup_lock);
134
135 dput(prev);
136
137 return q;
138}
139
140/*
90 * Calculate and dget next entry in top down tree traversal. 141 * Calculate and dget next entry in top down tree traversal.
91 */ 142 */
92static struct dentry *get_next_positive_dentry(struct dentry *prev, 143static struct dentry *get_next_positive_dentry(struct dentry *prev,
93 struct dentry *root) 144 struct dentry *root)
94{ 145{
146 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
95 struct list_head *next; 147 struct list_head *next;
96 struct dentry *p, *ret; 148 struct dentry *p, *ret;
97 149
98 if (prev == NULL) 150 if (prev == NULL)
99 return dget(root); 151 return dget(root);
100 152
101 spin_lock(&autofs4_lock); 153 spin_lock(&sbi->lookup_lock);
102relock: 154relock:
103 p = prev; 155 p = prev;
104 spin_lock(&p->d_lock); 156 spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
110 162
111 if (p == root) { 163 if (p == root) {
112 spin_unlock(&p->d_lock); 164 spin_unlock(&p->d_lock);
113 spin_unlock(&autofs4_lock); 165 spin_unlock(&sbi->lookup_lock);
114 dput(prev); 166 dput(prev);
115 return NULL; 167 return NULL;
116 } 168 }
@@ -140,7 +192,7 @@ again:
140 dget_dlock(ret); 192 dget_dlock(ret);
141 spin_unlock(&ret->d_lock); 193 spin_unlock(&ret->d_lock);
142 spin_unlock(&p->d_lock); 194 spin_unlock(&p->d_lock);
143 spin_unlock(&autofs4_lock); 195 spin_unlock(&sbi->lookup_lock);
144 196
145 dput(prev); 197 dput(prev);
146 198
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
290 spin_lock(&sbi->fs_lock); 342 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root); 343 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */ 344 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) { 345 if (ino->flags & AUTOFS_INF_PENDING)
294 spin_unlock(&sbi->fs_lock); 346 goto out;
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 347 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
299 struct autofs_info *ino = autofs4_dentry_ino(root); 348 struct autofs_info *ino = autofs4_dentry_ino(root);
300 ino->flags |= AUTOFS_INF_EXPIRING; 349 ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
302 spin_unlock(&sbi->fs_lock); 351 spin_unlock(&sbi->fs_lock);
303 return root; 352 return root;
304 } 353 }
305 managed_dentry_clear_transit(root); 354out:
306 spin_unlock(&sbi->fs_lock); 355 spin_unlock(&sbi->fs_lock);
307 dput(root); 356 dput(root);
308 357
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
336 timeout = sbi->exp_timeout; 385 timeout = sbi->exp_timeout;
337 386
338 dentry = NULL; 387 dentry = NULL;
339 while ((dentry = get_next_positive_dentry(dentry, root))) { 388 while ((dentry = get_next_positive_subdir(dentry, root))) {
340 spin_lock(&sbi->fs_lock); 389 spin_lock(&sbi->fs_lock);
341 ino = autofs4_dentry_ino(dentry); 390 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */ 391 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING) 392 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont; 393 goto next;
345 managed_dentry_set_transit(dentry);
346 394
347 /* 395 /*
348 * Case 1: (i) indirect mount or top level pseudo direct mount 396 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
402 } 450 }
403 } 451 }
404next: 452next:
405 managed_dentry_clear_transit(dentry);
406cont:
407 spin_unlock(&sbi->fs_lock); 453 spin_unlock(&sbi->fs_lock);
408 } 454 }
409 return NULL; 455 return NULL;
@@ -415,13 +461,13 @@ found:
415 ino->flags |= AUTOFS_INF_EXPIRING; 461 ino->flags |= AUTOFS_INF_EXPIRING;
416 init_completion(&ino->expire_complete); 462 init_completion(&ino->expire_complete);
417 spin_unlock(&sbi->fs_lock); 463 spin_unlock(&sbi->fs_lock);
418 spin_lock(&autofs4_lock); 464 spin_lock(&sbi->lookup_lock);
419 spin_lock(&expired->d_parent->d_lock); 465 spin_lock(&expired->d_parent->d_lock);
420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); 466 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 467 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
422 spin_unlock(&expired->d_lock); 468 spin_unlock(&expired->d_lock);
423 spin_unlock(&expired->d_parent->d_lock); 469 spin_unlock(&expired->d_parent->d_lock);
424 spin_unlock(&autofs4_lock); 470 spin_unlock(&sbi->lookup_lock);
425 return expired; 471 return expired;
426} 472}
427 473
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
484 spin_lock(&sbi->fs_lock); 530 spin_lock(&sbi->fs_lock);
485 ino = autofs4_dentry_ino(dentry); 531 ino = autofs4_dentry_ino(dentry);
486 ino->flags &= ~AUTOFS_INF_EXPIRING; 532 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
489 complete_all(&ino->expire_complete); 533 complete_all(&ino->expire_complete);
490 spin_unlock(&sbi->fs_lock); 534 spin_unlock(&sbi->fs_lock);
491 535
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
513 spin_lock(&sbi->fs_lock); 557 spin_lock(&sbi->fs_lock);
514 ino->flags &= ~AUTOFS_INF_EXPIRING; 558 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock); 559 spin_lock(&dentry->d_lock);
516 if (ret) 560 if (!ret) {
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) || 561 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) && 562 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) && 563 IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b08..96804a17bbd0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
29static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
30static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -36,7 +34,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
36static int autofs4_dir_open(struct inode *inode, struct file *file); 34static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static struct vfsmount *autofs4_d_automount(struct path *); 36static struct vfsmount *autofs4_d_automount(struct path *);
39static int autofs4_d_manage(struct dentry *, bool, bool); 37static int autofs4_d_manage(struct dentry *, bool);
40static void autofs4_dentry_release(struct dentry *); 38static void autofs4_dentry_release(struct dentry *);
41 39
42const struct file_operations autofs4_root_operations = { 40const struct file_operations autofs4_root_operations = {
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
125 * autofs file system so just let the libfs routines handle 123 * autofs file system so just let the libfs routines handle
126 * it. 124 * it.
127 */ 125 */
128 spin_lock(&autofs4_lock); 126 spin_lock(&sbi->lookup_lock);
129 spin_lock(&dentry->d_lock); 127 spin_lock(&dentry->d_lock);
130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
131 spin_unlock(&dentry->d_lock); 129 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock); 130 spin_unlock(&sbi->lookup_lock);
133 return -ENOENT; 131 return -ENOENT;
134 } 132 }
135 spin_unlock(&dentry->d_lock); 133 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock); 134 spin_unlock(&sbi->lookup_lock);
137 135
138out: 136out:
139 return dcache_dir_open(inode, file); 137 return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
171 const unsigned char *str = name->name; 169 const unsigned char *str = name->name;
172 struct list_head *p, *head; 170 struct list_head *p, *head;
173 171
174 spin_lock(&autofs4_lock);
175 spin_lock(&sbi->lookup_lock); 172 spin_lock(&sbi->lookup_lock);
176 head = &sbi->active_list; 173 head = &sbi->active_list;
177 list_for_each(p, head) { 174 list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
204 dget_dlock(active); 201 dget_dlock(active);
205 spin_unlock(&active->d_lock); 202 spin_unlock(&active->d_lock);
206 spin_unlock(&sbi->lookup_lock); 203 spin_unlock(&sbi->lookup_lock);
207 spin_unlock(&autofs4_lock);
208 return active; 204 return active;
209 } 205 }
210next: 206next:
211 spin_unlock(&active->d_lock); 207 spin_unlock(&active->d_lock);
212 } 208 }
213 spin_unlock(&sbi->lookup_lock); 209 spin_unlock(&sbi->lookup_lock);
214 spin_unlock(&autofs4_lock);
215 210
216 return NULL; 211 return NULL;
217} 212}
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
226 const unsigned char *str = name->name; 221 const unsigned char *str = name->name;
227 struct list_head *p, *head; 222 struct list_head *p, *head;
228 223
229 spin_lock(&autofs4_lock);
230 spin_lock(&sbi->lookup_lock); 224 spin_lock(&sbi->lookup_lock);
231 head = &sbi->expiring_list; 225 head = &sbi->expiring_list;
232 list_for_each(p, head) { 226 list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
259 dget_dlock(expiring); 253 dget_dlock(expiring);
260 spin_unlock(&expiring->d_lock); 254 spin_unlock(&expiring->d_lock);
261 spin_unlock(&sbi->lookup_lock); 255 spin_unlock(&sbi->lookup_lock);
262 spin_unlock(&autofs4_lock);
263 return expiring; 256 return expiring;
264 } 257 }
265next: 258next:
266 spin_unlock(&expiring->d_lock); 259 spin_unlock(&expiring->d_lock);
267 } 260 }
268 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
269 spin_unlock(&autofs4_lock);
270 262
271 return NULL; 263 return NULL;
272} 264}
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
275{ 267{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 268 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry); 269 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status; 270 int status = 0;
279 271
280 if (ino->flags & AUTOFS_INF_PENDING) { 272 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s", 273 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name); 274 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 275 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status); 276 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 } 277 }
288 return 0; 278 ino->last_used = jiffies;
279 return status;
289} 280}
290 281
291static int do_expire_wait(struct dentry *dentry) 282static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
319 */ 310 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 311 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent; 312 struct dentry *parent = dentry->d_parent;
313 struct autofs_info *ino;
322 struct dentry *new = d_lookup(parent, &dentry->d_name); 314 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new) 315 if (!new)
324 return NULL; 316 return NULL;
317 ino = autofs4_dentry_ino(new);
318 ino->last_used = jiffies;
325 dput(path->dentry); 319 dput(path->dentry);
326 path->dentry = new; 320 path->dentry = new;
327 } 321 }
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
338 DPRINTK("dentry=%p %.*s", 332 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name); 333 dentry, dentry->d_name.len, dentry->d_name.name);
340 334
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */ 335 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi)) 336 if (autofs4_oz_mode(sbi))
355 return NULL; 337 return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
418done: 400done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 401 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /* 402 /*
421 * Any needed mounting has been completed and the path updated 403 * Any needed mounting has been completed and the path
422 * so turn this into a normal dentry so we don't continually 404 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
423 * call ->d_automount() and ->d_manage(). 405 * call ->d_automount() on rootless multi-mounts since
424 */ 406 * it can lead to an incorrect ELOOP error return.
425 spin_lock(&dentry->d_lock); 407 *
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and 408 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by 409 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during 410 * an actual mount so ->d_automount() won't be called during
431 * the follow. 411 * the follow.
432 */ 412 */
413 spin_lock(&dentry->d_lock);
433 if ((!d_mountpoint(dentry) && 414 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) || 415 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) 416 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -446,7 +427,7 @@ done:
446 return NULL; 427 return NULL;
447} 428}
448 429
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) 430int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
450{ 431{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 432 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452 433
@@ -454,7 +435,9 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
454 dentry, dentry->d_name.len, dentry->d_name.name); 435 dentry, dentry->d_name.len, dentry->d_name.name);
455 436
456 /* The daemon never waits. */ 437 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) { 438 if (autofs4_oz_mode(sbi)) {
439 if (rcu_walk)
440 return 0;
458 if (!d_mountpoint(dentry)) 441 if (!d_mountpoint(dentry))
459 return -EISDIR; 442 return -EISDIR;
460 return 0; 443 return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
612 595
613 dir->i_mtime = CURRENT_TIME; 596 dir->i_mtime = CURRENT_TIME;
614 597
615 spin_lock(&autofs4_lock); 598 spin_lock(&sbi->lookup_lock);
616 autofs4_add_expiring(dentry); 599 __autofs4_add_expiring(dentry);
617 spin_lock(&dentry->d_lock); 600 spin_lock(&dentry->d_lock);
618 __d_drop(dentry); 601 __d_drop(dentry);
619 spin_unlock(&dentry->d_lock); 602 spin_unlock(&dentry->d_lock);
620 spin_unlock(&autofs4_lock); 603 spin_unlock(&sbi->lookup_lock);
621 604
622 return 0; 605 return 0;
623} 606}
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
686 if (!autofs4_oz_mode(sbi)) 669 if (!autofs4_oz_mode(sbi))
687 return -EACCES; 670 return -EACCES;
688 671
689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock); 672 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock); 673 spin_lock(&dentry->d_lock);
692 if (!list_empty(&dentry->d_subdirs)) { 674 if (!list_empty(&dentry->d_subdirs)) {
693 spin_unlock(&dentry->d_lock); 675 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock); 676 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
696 return -ENOTEMPTY; 677 return -ENOTEMPTY;
697 } 678 }
698 __autofs4_add_expiring(dentry); 679 __autofs4_add_expiring(dentry);
699 spin_unlock(&sbi->lookup_lock);
700 __d_drop(dentry); 680 __d_drop(dentry);
701 spin_unlock(&dentry->d_lock); 681 spin_unlock(&dentry->d_lock);
702 spin_unlock(&autofs4_lock); 682 spin_unlock(&sbi->lookup_lock);
703 683
704 if (sbi->version < 5) 684 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry); 685 autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
197 197
198 seq = read_seqbegin(&rename_lock); 198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock(); 199 rcu_read_lock();
200 spin_lock(&autofs4_lock); 200 spin_lock(&sbi->fs_lock);
201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
202 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
203 203
204 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
205 spin_unlock(&autofs4_lock); 205 spin_unlock(&sbi->fs_lock);
206 rcu_read_unlock(); 206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq)) 207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry; 208 goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
218 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
219 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
220 } 220 }
221 spin_unlock(&autofs4_lock); 221 spin_unlock(&sbi->fs_lock);
222 rcu_read_unlock(); 222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq)) 223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry; 224 goto rename_retry;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..06457ed8f3e7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
75 75
76static const struct address_space_operations befs_aops = { 76static const struct address_space_operations befs_aops = {
77 .readpage = befs_readpage, 77 .readpage = befs_readpage,
78 .sync_page = block_sync_page,
79 .bmap = befs_bmap, 78 .bmap = befs_bmap,
80}; 79};
81 80
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 685ecff3ab31..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
97 if (!inode) 97 if (!inode)
98 return -ENOSPC; 98 return -ENOSPC;
99 mutex_lock(&info->bfs_lock); 99 mutex_lock(&info->bfs_lock);
100 ino = find_first_zero_bit(info->si_imap, info->si_lasti); 100 ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
101 if (ino > info->si_lasti) { 101 if (ino > info->si_lasti) {
102 mutex_unlock(&info->bfs_lock); 102 mutex_unlock(&info->bfs_lock);
103 iput(inode); 103 iput(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
186const struct address_space_operations bfs_aops = { 186const struct address_space_operations bfs_aops = {
187 .readpage = bfs_readpage, 187 .readpage = bfs_readpage,
188 .writepage = bfs_writepage, 188 .writepage = bfs_writepage,
189 .sync_page = block_sync_page,
190 .write_begin = bfs_write_begin, 189 .write_begin = bfs_write_begin,
191 .write_end = generic_write_end, 190 .write_end = generic_write_end,
192 .bmap = bfs_bmap, 191 .bmap = bfs_bmap,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb1..f34078d702d3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -570,7 +570,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
570 unsigned long elf_entry; 570 unsigned long elf_entry;
571 unsigned long interp_load_addr = 0; 571 unsigned long interp_load_addr = 0;
572 unsigned long start_code, end_code, start_data, end_data; 572 unsigned long start_code, end_code, start_data, end_data;
573 unsigned long reloc_func_desc = 0; 573 unsigned long reloc_func_desc __maybe_unused = 0;
574 int executable_stack = EXSTACK_DEFAULT; 574 int executable_stack = EXSTACK_DEFAULT;
575 unsigned long def_flags = 0; 575 unsigned long def_flags = 0;
576 struct { 576 struct {
@@ -1906,7 +1906,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1906 segs = current->mm->map_count; 1906 segs = current->mm->map_count;
1907 segs += elf_core_extra_phdrs(); 1907 segs += elf_core_extra_phdrs();
1908 1908
1909 gate_vma = get_gate_vma(current); 1909 gate_vma = get_gate_vma(current->mm);
1910 if (gate_vma != NULL) 1910 if (gate_vma != NULL)
1911 segs++; 1911 segs++;
1912 1912
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
761{ 761{
762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); 762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
763 763
764 if (bs->bio_integrity_pool)
765 return 0;
766
764 bs->bio_integrity_pool = 767 bs->bio_integrity_pool =
765 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 768 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
766 769
diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844e..4d6d4b6c2bf1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
43 * unsigned short 43 * unsigned short
44 */ 44 */
45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
46struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 46static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
48}; 48};
49#undef BV 49#undef BV
@@ -111,7 +111,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
111 if (!slab) 111 if (!slab)
112 goto out_unlock; 112 goto out_unlock;
113 113
114 printk("bio: create slab <%s> at %d\n", bslab->name, entry); 114 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
115 bslab->slab = slab; 115 bslab->slab = slab;
116 bslab->slab_ref = 1; 116 bslab->slab_ref = 1;
117 bslab->slab_size = sz; 117 bslab->slab_size = sz;
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1636 if (!bs->bio_pool) 1636 if (!bs->bio_pool)
1637 goto bad; 1637 goto bad;
1638 1638
1639 if (bioset_integrity_create(bs, pool_size))
1640 goto bad;
1641
1642 if (!biovec_create_pools(bs, pool_size)) 1639 if (!biovec_create_pools(bs, pool_size))
1643 return bs; 1640 return bs;
1644 1641
@@ -1656,12 +1653,10 @@ static void __init biovec_init_slabs(void)
1656 int size; 1653 int size;
1657 struct biovec_slab *bvs = bvec_slabs + i; 1654 struct biovec_slab *bvs = bvec_slabs + i;
1658 1655
1659#ifndef CONFIG_BLK_DEV_INTEGRITY
1660 if (bvs->nr_vecs <= BIO_INLINE_VECS) { 1656 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1661 bvs->slab = NULL; 1657 bvs->slab = NULL;
1662 continue; 1658 continue;
1663 } 1659 }
1664#endif
1665 1660
1666 size = bvs->nr_vecs * sizeof(struct bio_vec); 1661 size = bvs->nr_vecs * sizeof(struct bio_vec);
1667 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1662 bvs->slab = kmem_cache_create(bvs->name, size, 0,
@@ -1684,6 +1679,9 @@ static int __init init_bio(void)
1684 if (!fs_bio_set) 1679 if (!fs_bio_set)
1685 panic("bio: can't allocate bios\n"); 1680 panic("bio: can't allocate bios\n");
1686 1681
1682 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
1683 panic("bio: can't create integrity pool\n");
1684
1687 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, 1685 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
1688 sizeof(struct bio_pair)); 1686 sizeof(struct bio_pair));
1689 if (!bio_split_pool) 1687 if (!bio_split_pool)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..c1511c674f53 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
55static void bdev_inode_switch_bdi(struct inode *inode, 55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 56 struct backing_dev_info *dst)
57{ 57{
58 spin_lock(&inode_lock); 58 spin_lock(&inode_wb_list_lock);
59 spin_lock(&inode->i_lock);
59 inode->i_data.backing_dev_info = dst; 60 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY) 61 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock); 63 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock);
63} 65}
64 66
65static sector_t max_block(struct block_device *bdev) 67static sector_t max_block(struct block_device *bdev)
@@ -873,6 +875,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 875 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret) 876 if (ret)
875 goto out_del; 877 goto out_del;
878 /*
879 * bdev could be deleted beneath us which would implicitly destroy
880 * the holder directory. Hold on to it.
881 */
882 kobject_get(bdev->bd_part->holder_dir);
876 883
877 list_add(&holder->list, &bdev->bd_holder_disks); 884 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock; 885 goto out_unlock;
@@ -909,6 +916,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 916 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir, 917 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj); 918 &disk_to_dev(disk)->kobj);
919 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 920 list_del_init(&holder->list);
913 kfree(holder); 921 kfree(holder);
914 } 922 }
@@ -922,14 +930,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
922 * flush_disk - invalidates all buffer-cache entries on a disk 930 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 931 *
924 * @bdev: struct block device to be flushed 932 * @bdev: struct block device to be flushed
933 * @kill_dirty: flag to guide handling of dirty inodes
925 * 934 *
926 * Invalidates all buffer-cache entries on a disk. It should be called 935 * Invalidates all buffer-cache entries on a disk. It should be called
927 * when a disk has been changed -- either by a media change or online 936 * when a disk has been changed -- either by a media change or online
928 * resize. 937 * resize.
929 */ 938 */
930static void flush_disk(struct block_device *bdev) 939static void flush_disk(struct block_device *bdev, bool kill_dirty)
931{ 940{
932 if (__invalidate_device(bdev)) { 941 if (__invalidate_device(bdev, kill_dirty)) {
933 char name[BDEVNAME_SIZE] = ""; 942 char name[BDEVNAME_SIZE] = "";
934 943
935 if (bdev->bd_disk) 944 if (bdev->bd_disk)
@@ -966,7 +975,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
966 "%s: detected capacity change from %lld to %lld\n", 975 "%s: detected capacity change from %lld to %lld\n",
967 name, bdev_size, disk_size); 976 name, bdev_size, disk_size);
968 i_size_write(bdev->bd_inode, disk_size); 977 i_size_write(bdev->bd_inode, disk_size);
969 flush_disk(bdev); 978 flush_disk(bdev, false);
970 } 979 }
971} 980}
972EXPORT_SYMBOL(check_disk_size_change); 981EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1028,7 @@ int check_disk_change(struct block_device *bdev)
1019 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1028 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1020 return 0; 1029 return 0;
1021 1030
1022 flush_disk(bdev); 1031 flush_disk(bdev, true);
1023 if (bdops->revalidate_disk) 1032 if (bdops->revalidate_disk)
1024 bdops->revalidate_disk(bdev->bd_disk); 1033 bdops->revalidate_disk(bdev->bd_disk);
1025 return 1; 1034 return 1;
@@ -1080,6 +1089,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1080 if (!disk) 1089 if (!disk)
1081 goto out; 1090 goto out;
1082 1091
1092 disk_block_events(disk);
1083 mutex_lock_nested(&bdev->bd_mutex, for_part); 1093 mutex_lock_nested(&bdev->bd_mutex, for_part);
1084 if (!bdev->bd_openers) { 1094 if (!bdev->bd_openers) {
1085 bdev->bd_disk = disk; 1095 bdev->bd_disk = disk;
@@ -1101,10 +1111,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1101 */ 1111 */
1102 disk_put_part(bdev->bd_part); 1112 disk_put_part(bdev->bd_part);
1103 bdev->bd_part = NULL; 1113 bdev->bd_part = NULL;
1104 module_put(disk->fops->owner);
1105 put_disk(disk);
1106 bdev->bd_disk = NULL; 1114 bdev->bd_disk = NULL;
1107 mutex_unlock(&bdev->bd_mutex); 1115 mutex_unlock(&bdev->bd_mutex);
1116 disk_unblock_events(disk);
1117 module_put(disk->fops->owner);
1118 put_disk(disk);
1108 goto restart; 1119 goto restart;
1109 } 1120 }
1110 if (ret) 1121 if (ret)
@@ -1141,9 +1152,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1141 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1152 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1142 } 1153 }
1143 } else { 1154 } else {
1144 module_put(disk->fops->owner);
1145 put_disk(disk);
1146 disk = NULL;
1147 if (bdev->bd_contains == bdev) { 1155 if (bdev->bd_contains == bdev) {
1148 if (bdev->bd_disk->fops->open) { 1156 if (bdev->bd_disk->fops->open) {
1149 ret = bdev->bd_disk->fops->open(bdev, mode); 1157 ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1153,11 +1161,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1153 if (bdev->bd_invalidated) 1161 if (bdev->bd_invalidated)
1154 rescan_partitions(bdev->bd_disk, bdev); 1162 rescan_partitions(bdev->bd_disk, bdev);
1155 } 1163 }
1164 /* only one opener holds refs to the module and disk */
1165 module_put(disk->fops->owner);
1166 put_disk(disk);
1156 } 1167 }
1157 bdev->bd_openers++; 1168 bdev->bd_openers++;
1158 if (for_part) 1169 if (for_part)
1159 bdev->bd_part_count++; 1170 bdev->bd_part_count++;
1160 mutex_unlock(&bdev->bd_mutex); 1171 mutex_unlock(&bdev->bd_mutex);
1172 disk_unblock_events(disk);
1161 return 0; 1173 return 0;
1162 1174
1163 out_clear: 1175 out_clear:
@@ -1170,10 +1182,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1170 bdev->bd_contains = NULL; 1182 bdev->bd_contains = NULL;
1171 out_unlock_bdev: 1183 out_unlock_bdev:
1172 mutex_unlock(&bdev->bd_mutex); 1184 mutex_unlock(&bdev->bd_mutex);
1173 out: 1185 disk_unblock_events(disk);
1174 if (disk) 1186 module_put(disk->fops->owner);
1175 module_put(disk->fops->owner);
1176 put_disk(disk); 1187 put_disk(disk);
1188 out:
1177 bdput(bdev); 1189 bdput(bdev);
1178 1190
1179 return ret; 1191 return ret;
@@ -1215,12 +1227,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1215 1227
1216 res = __blkdev_get(bdev, mode, 0); 1228 res = __blkdev_get(bdev, mode, 0);
1217 1229
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) { 1230 if (whole) {
1225 /* finish claiming */ 1231 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex); 1232 mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1304,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1298 if (err) 1304 if (err)
1299 return ERR_PTR(err); 1305 return ERR_PTR(err);
1300 1306
1307 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1308 blkdev_put(bdev, mode);
1309 return ERR_PTR(-EACCES);
1310 }
1311
1301 return bdev; 1312 return bdev;
1302} 1313}
1303EXPORT_SYMBOL(blkdev_get_by_path); 1314EXPORT_SYMBOL(blkdev_get_by_path);
@@ -1440,14 +1451,13 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
1440 if (bdev_free) { 1451 if (bdev_free) {
1441 if (bdev->bd_write_holder) { 1452 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk); 1453 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk); 1454 disk_check_events(bdev->bd_disk);
1455 bdev->bd_write_holder = false;
1456 }
1446 } 1457 }
1447 1458
1448 mutex_unlock(&bdev->bd_mutex); 1459 mutex_unlock(&bdev->bd_mutex);
1449 } else 1460 }
1450 disk_check_events(bdev->bd_disk);
1451 1461
1452 return __blkdev_put(bdev, mode, 0); 1462 return __blkdev_put(bdev, mode, 0);
1453} 1463}
@@ -1521,7 +1531,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
1521static const struct address_space_operations def_blk_aops = { 1531static const struct address_space_operations def_blk_aops = {
1522 .readpage = blkdev_readpage, 1532 .readpage = blkdev_readpage,
1523 .writepage = blkdev_writepage, 1533 .writepage = blkdev_writepage,
1524 .sync_page = block_sync_page,
1525 .write_begin = blkdev_write_begin, 1534 .write_begin = blkdev_write_begin,
1526 .write_end = blkdev_write_end, 1535 .write_end = blkdev_write_end,
1527 .writepages = generic_writepages, 1536 .writepages = generic_writepages,
@@ -1601,7 +1610,7 @@ fail:
1601} 1610}
1602EXPORT_SYMBOL(lookup_bdev); 1611EXPORT_SYMBOL(lookup_bdev);
1603 1612
1604int __invalidate_device(struct block_device *bdev) 1613int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1605{ 1614{
1606 struct super_block *sb = get_super(bdev); 1615 struct super_block *sb = get_super(bdev);
1607 int res = 0; 1616 int res = 0;
@@ -1614,7 +1623,7 @@ int __invalidate_device(struct block_device *bdev)
1614 * hold). 1623 * hold).
1615 */ 1624 */
1616 shrink_dcache_sb(sb); 1625 shrink_dcache_sb(sb);
1617 res = invalidate_inodes(sb); 1626 res = invalidate_inodes(sb, kill_dirty);
1618 drop_super(sb); 1627 drop_super(sb);
1619 } 1628 }
1620 invalidate_bdev(bdev); 1629 invalidate_bdev(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 15b5ca2a2606..de34bfad9ec3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
37 char *value = NULL; 37 char *value = NULL;
38 struct posix_acl *acl; 38 struct posix_acl *acl;
39 39
40 if (!IS_POSIXACL(inode))
41 return NULL;
42
40 acl = get_cached_acl(inode, type); 43 acl = get_cached_acl(inode, type);
41 if (acl != ACL_NOT_CACHED) 44 if (acl != ACL_NOT_CACHED)
42 return acl; 45 return acl;
@@ -84,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
84 struct posix_acl *acl; 87 struct posix_acl *acl;
85 int ret = 0; 88 int ret = 0;
86 89
90 if (!IS_POSIXACL(dentry->d_inode))
91 return -EOPNOTSUPP;
92
87 acl = btrfs_get_acl(dentry->d_inode, type); 93 acl = btrfs_get_acl(dentry->d_inode, type);
88 94
89 if (IS_ERR(acl)) 95 if (IS_ERR(acl))
@@ -164,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
164 int ret; 170 int ret;
165 struct posix_acl *acl = NULL; 171 struct posix_acl *acl = NULL;
166 172
167 if (!is_owner_or_cap(dentry->d_inode)) 173 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM; 174 return -EPERM;
169 175
170 if (!IS_POSIXACL(dentry->d_inode)) 176 if (!IS_POSIXACL(dentry->d_inode))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2e..4d2110eafe29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
562 u64 em_len; 562 u64 em_len;
563 u64 em_start; 563 u64 em_start;
564 struct extent_map *em; 564 struct extent_map *em;
565 int ret; 565 int ret = -ENOMEM;
566 u32 *sums; 566 u32 *sums;
567 567
568 tree = &BTRFS_I(inode)->io_tree; 568 tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 577
578 compressed_len = em->block_len; 578 compressed_len = em->block_len;
579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
580 if (!cb)
581 goto out;
582
580 atomic_set(&cb->pending_bios, 0); 583 atomic_set(&cb->pending_bios, 0);
581 cb->errors = 0; 584 cb->errors = 0;
582 cb->inode = inode; 585 cb->inode = inode;
@@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
597 600
598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 601 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
599 PAGE_CACHE_SIZE; 602 PAGE_CACHE_SIZE;
600 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, 603 cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
601 GFP_NOFS); 604 GFP_NOFS);
605 if (!cb->compressed_pages)
606 goto fail1;
607
602 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 608 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
603 609
604 for (page_index = 0; page_index < nr_pages; page_index++) { 610 for (page_index = 0; page_index < nr_pages; page_index++) {
605 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | 611 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
606 __GFP_HIGHMEM); 612 __GFP_HIGHMEM);
613 if (!cb->compressed_pages[page_index])
614 goto fail2;
607 } 615 }
608 cb->nr_pages = nr_pages; 616 cb->nr_pages = nr_pages;
609 617
@@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
614 cb->len = uncompressed_len; 622 cb->len = uncompressed_len;
615 623
616 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 624 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
625 if (!comp_bio)
626 goto fail2;
617 comp_bio->bi_private = cb; 627 comp_bio->bi_private = cb;
618 comp_bio->bi_end_io = end_compressed_bio_read; 628 comp_bio->bi_end_io = end_compressed_bio_read;
619 atomic_inc(&cb->pending_bios); 629 atomic_inc(&cb->pending_bios);
@@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
681 691
682 bio_put(comp_bio); 692 bio_put(comp_bio);
683 return 0; 693 return 0;
694
695fail2:
696 for (page_index = 0; page_index < nr_pages; page_index++)
697 free_page((unsigned long)cb->compressed_pages[page_index]);
698
699 kfree(cb->compressed_pages);
700fail1:
701 kfree(cb);
702out:
703 free_extent_map(em);
704 return ret;
684} 705}
685 706
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; 707static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
@@ -900,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
900 return ret; 921 return ret;
901} 922}
902 923
903void __exit btrfs_exit_compress(void) 924void btrfs_exit_compress(void)
904{ 925{
905 free_workspaces(); 926 free_workspaces();
906} 927}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..7f78cc78fdd0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 730 account */
731 731
732 /*
733 * we bump reservation progress every time we decrement
734 * bytes_reserved. This way people waiting for reservations
735 * know something good has happened and they can check
736 * for progress. The number here isn't to be trusted, it
737 * just shows reclaim activity
738 */
739 unsigned long reservation_progress;
740
732 int full; /* indicates that we cannot allocate any more 741 int full; /* indicates that we cannot allocate any more
733 chunks for this space */ 742 chunks for this space */
734 int force_alloc; /* set if we need to force a chunk alloc for 743 int force_alloc; /* set if we need to force a chunk alloc for
@@ -1254,6 +1263,7 @@ struct btrfs_root {
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) 1263#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1264#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1265#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1266#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1257 1267
1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1268#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1269#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end); 2228 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 2229int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes); 2230 u64 num_bytes);
2231int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2232 struct btrfs_root *root, u64 type);
2221 2233
2222/* ctree.c */ 2234/* ctree.c */
2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2235int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b531c36455d8..830d261d0e6b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
359 359
360 tree = &BTRFS_I(page->mapping->host)->io_tree; 360 tree = &BTRFS_I(page->mapping->host)->io_tree;
361 361
362 if (page->private == EXTENT_PAGE_PRIVATE) 362 if (page->private == EXTENT_PAGE_PRIVATE) {
363 WARN_ON(1);
363 goto out; 364 goto out;
364 if (!page->private) 365 }
366 if (!page->private) {
367 WARN_ON(1);
365 goto out; 368 goto out;
369 }
366 len = page->private >> 2; 370 len = page->private >> 2;
367 WARN_ON(len == 0); 371 WARN_ON(len == 0);
368 372
@@ -843,7 +847,6 @@ static const struct address_space_operations btree_aops = {
843 .writepages = btree_writepages, 847 .writepages = btree_writepages,
844 .releasepage = btree_releasepage, 848 .releasepage = btree_releasepage,
845 .invalidatepage = btree_invalidatepage, 849 .invalidatepage = btree_invalidatepage,
846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION 850#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage, 851 .migratepage = btree_migratepage,
849#endif 852#endif
@@ -1327,82 +1330,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1327} 1330}
1328 1331
1329/* 1332/*
1330 * this unplugs every device on the box, and it is only used when page
1331 * is null
1332 */
1333static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1334{
1335 struct btrfs_device *device;
1336 struct btrfs_fs_info *info;
1337
1338 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1339 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1340 if (!device->bdev)
1341 continue;
1342
1343 bdi = blk_get_backing_dev_info(device->bdev);
1344 if (bdi->unplug_io_fn)
1345 bdi->unplug_io_fn(bdi, page);
1346 }
1347}
1348
1349static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1350{
1351 struct inode *inode;
1352 struct extent_map_tree *em_tree;
1353 struct extent_map *em;
1354 struct address_space *mapping;
1355 u64 offset;
1356
1357 /* the generic O_DIRECT read code does this */
1358 if (1 || !page) {
1359 __unplug_io_fn(bdi, page);
1360 return;
1361 }
1362
1363 /*
1364 * page->mapping may change at any time. Get a consistent copy
1365 * and use that for everything below
1366 */
1367 smp_mb();
1368 mapping = page->mapping;
1369 if (!mapping)
1370 return;
1371
1372 inode = mapping->host;
1373
1374 /*
1375 * don't do the expensive searching for a small number of
1376 * devices
1377 */
1378 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1379 __unplug_io_fn(bdi, page);
1380 return;
1381 }
1382
1383 offset = page_offset(page);
1384
1385 em_tree = &BTRFS_I(inode)->extent_tree;
1386 read_lock(&em_tree->lock);
1387 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1388 read_unlock(&em_tree->lock);
1389 if (!em) {
1390 __unplug_io_fn(bdi, page);
1391 return;
1392 }
1393
1394 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1395 free_extent_map(em);
1396 __unplug_io_fn(bdi, page);
1397 return;
1398 }
1399 offset = offset - em->start;
1400 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1401 em->block_start + offset, page);
1402 free_extent_map(em);
1403}
1404
1405/*
1406 * If this fails, caller must call bdi_destroy() to get rid of the 1333 * If this fails, caller must call bdi_destroy() to get rid of the
1407 * bdi again. 1334 * bdi again.
1408 */ 1335 */
@@ -1416,8 +1343,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1416 return err; 1343 return err;
1417 1344
1418 bdi->ra_pages = default_backing_dev_info.ra_pages; 1345 bdi->ra_pages = default_backing_dev_info.ra_pages;
1419 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1420 bdi->unplug_io_data = info;
1421 bdi->congested_fn = btrfs_congested_fn; 1346 bdi->congested_fn = btrfs_congested_fn;
1422 bdi->congested_data = info; 1347 bdi->congested_data = info;
1423 return 0; 1348 return 0;
@@ -1550,6 +1475,7 @@ static int transaction_kthread(void *arg)
1550 spin_unlock(&root->fs_info->new_trans_lock); 1475 spin_unlock(&root->fs_info->new_trans_lock);
1551 1476
1552 trans = btrfs_join_transaction(root, 1); 1477 trans = btrfs_join_transaction(root, 1);
1478 BUG_ON(IS_ERR(trans));
1553 if (transid == trans->transid) { 1479 if (transid == trans->transid) {
1554 ret = btrfs_commit_transaction(trans, root); 1480 ret = btrfs_commit_transaction(trans, root);
1555 BUG_ON(ret); 1481 BUG_ON(ret);
@@ -2453,10 +2379,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2453 up_write(&root->fs_info->cleanup_work_sem); 2379 up_write(&root->fs_info->cleanup_work_sem);
2454 2380
2455 trans = btrfs_join_transaction(root, 1); 2381 trans = btrfs_join_transaction(root, 1);
2382 if (IS_ERR(trans))
2383 return PTR_ERR(trans);
2456 ret = btrfs_commit_transaction(trans, root); 2384 ret = btrfs_commit_transaction(trans, root);
2457 BUG_ON(ret); 2385 BUG_ON(ret);
2458 /* run commit again to drop the original snapshot */ 2386 /* run commit again to drop the original snapshot */
2459 trans = btrfs_join_transaction(root, 1); 2387 trans = btrfs_join_transaction(root, 1);
2388 if (IS_ERR(trans))
2389 return PTR_ERR(trans);
2460 btrfs_commit_transaction(trans, root); 2390 btrfs_commit_transaction(trans, root);
2461 ret = btrfs_write_and_wait_transaction(NULL, root); 2391 ret = btrfs_write_and_wait_transaction(NULL, root);
2462 BUG_ON(ret); 2392 BUG_ON(ret);
@@ -2484,7 +2414,7 @@ int close_ctree(struct btrfs_root *root)
2484 * ERROR state on disk. 2414 * ERROR state on disk.
2485 * 2415 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super, 2416 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super, 2417 * and in such case, btrfs cannot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, 2418 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then. 2419 * btrfs will cleanup all FS resources first and write sb then.
2490 */ 2420 */
@@ -2554,6 +2484,8 @@ int close_ctree(struct btrfs_root *root)
2554 kfree(fs_info->chunk_root); 2484 kfree(fs_info->chunk_root);
2555 kfree(fs_info->dev_root); 2485 kfree(fs_info->dev_root);
2556 kfree(fs_info->csum_root); 2486 kfree(fs_info->csum_root);
2487 kfree(fs_info);
2488
2557 return 0; 2489 return 0;
2558} 2490}
2559 2491
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
@@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
171 int ret; 175 int ret;
172 176
173 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
174 180
175 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
176 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b55269340cec..7b3089b5c2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,11 +320,6 @@ static int caching_kthread(void *data)
320 if (!path) 320 if (!path)
321 return -ENOMEM; 321 return -ENOMEM;
322 322
323 exclude_super_stripes(extent_root, block_group);
324 spin_lock(&block_group->space_info->lock);
325 block_group->space_info->bytes_readonly += block_group->bytes_super;
326 spin_unlock(&block_group->space_info->lock);
327
328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 323 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
329 324
330 /* 325 /*
@@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 cache->cached = BTRFS_CACHE_NO; 462 cache->cached = BTRFS_CACHE_NO;
468 } 463 }
469 spin_unlock(&cache->lock); 464 spin_unlock(&cache->lock);
470 if (ret == 1) 465 if (ret == 1) {
466 free_excluded_extents(fs_info->extent_root, cache);
471 return 0; 467 return 0;
468 }
472 } 469 }
473 470
474 if (load_cache_only) 471 if (load_cache_only)
@@ -3344,21 +3341,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3344 u64 reserved; 3341 u64 reserved;
3345 u64 max_reclaim; 3342 u64 max_reclaim;
3346 u64 reclaimed = 0; 3343 u64 reclaimed = 0;
3347 int pause = 1; 3344 long time_left;
3348 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3346 int loops = 0;
3347 unsigned long progress;
3349 3348
3350 block_rsv = &root->fs_info->delalloc_block_rsv; 3349 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info; 3350 space_info = block_rsv->space_info;
3352 3351
3353 smp_mb(); 3352 smp_mb();
3354 reserved = space_info->bytes_reserved; 3353 reserved = space_info->bytes_reserved;
3354 progress = space_info->reservation_progress;
3355 3355
3356 if (reserved == 0) 3356 if (reserved == 0)
3357 return 0; 3357 return 0;
3358 3358
3359 max_reclaim = min(reserved, to_reclaim); 3359 max_reclaim = min(reserved, to_reclaim);
3360 3360
3361 while (1) { 3361 while (loops < 1024) {
3362 /* have the flusher threads jump in and do some IO */ 3362 /* have the flusher threads jump in and do some IO */
3363 smp_mb(); 3363 smp_mb();
3364 nr_pages = min_t(unsigned long, nr_pages, 3364 nr_pages = min_t(unsigned long, nr_pages,
@@ -3371,17 +3371,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3371 reserved = space_info->bytes_reserved; 3371 reserved = space_info->bytes_reserved;
3372 spin_unlock(&space_info->lock); 3372 spin_unlock(&space_info->lock);
3373 3373
3374 loops++;
3375
3374 if (reserved == 0 || reclaimed >= max_reclaim) 3376 if (reserved == 0 || reclaimed >= max_reclaim)
3375 break; 3377 break;
3376 3378
3377 if (trans && trans->transaction->blocked) 3379 if (trans && trans->transaction->blocked)
3378 return -EAGAIN; 3380 return -EAGAIN;
3379 3381
3380 __set_current_state(TASK_INTERRUPTIBLE); 3382 time_left = schedule_timeout_interruptible(1);
3381 schedule_timeout(pause); 3383
3382 pause <<= 1; 3384 /* We were interrupted, exit */
3383 if (pause > HZ / 10) 3385 if (time_left)
3384 pause = HZ / 10; 3386 break;
3387
3388 /* we've kicked the IO a few times, if anything has been freed,
3389 * exit. There is no sense in looping here for a long time
3390 * when we really need to commit the transaction, or there are
3391 * just too many writers without enough free space
3392 */
3393
3394 if (loops > 3) {
3395 smp_mb();
3396 if (progress != space_info->reservation_progress)
3397 break;
3398 }
3385 3399
3386 } 3400 }
3387 return reclaimed >= to_reclaim; 3401 return reclaimed >= to_reclaim;
@@ -3588,10 +3602,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3588 3602
3589 if (num_bytes > 0) { 3603 if (num_bytes > 0) {
3590 if (dest) { 3604 if (dest) {
3591 block_rsv_add_bytes(dest, num_bytes, 0); 3605 spin_lock(&dest->lock);
3592 } else { 3606 if (!dest->full) {
3607 u64 bytes_to_add;
3608
3609 bytes_to_add = dest->size - dest->reserved;
3610 bytes_to_add = min(num_bytes, bytes_to_add);
3611 dest->reserved += bytes_to_add;
3612 if (dest->reserved >= dest->size)
3613 dest->full = 1;
3614 num_bytes -= bytes_to_add;
3615 }
3616 spin_unlock(&dest->lock);
3617 }
3618 if (num_bytes) {
3593 spin_lock(&space_info->lock); 3619 spin_lock(&space_info->lock);
3594 space_info->bytes_reserved -= num_bytes; 3620 space_info->bytes_reserved -= num_bytes;
3621 space_info->reservation_progress++;
3595 spin_unlock(&space_info->lock); 3622 spin_unlock(&space_info->lock);
3596 } 3623 }
3597 } 3624 }
@@ -3824,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3824 if (block_rsv->reserved >= block_rsv->size) { 3851 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size; 3852 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes; 3853 sinfo->bytes_reserved -= num_bytes;
3854 sinfo->reservation_progress++;
3827 block_rsv->reserved = block_rsv->size; 3855 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1; 3856 block_rsv->full = 1;
3829 } 3857 }
@@ -3985,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3985 to_reserve = 0; 4013 to_reserve = 0;
3986 } 4014 }
3987 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4015 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3988
3989 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4016 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3991 if (ret) 4018 if (ret)
@@ -4012,6 +4039,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4012 4039
4013 num_bytes = ALIGN(num_bytes, root->sectorsize); 4040 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4041 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4042 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4015 4043
4016 spin_lock(&BTRFS_I(inode)->accounting_lock); 4044 spin_lock(&BTRFS_I(inode)->accounting_lock);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4045 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
@@ -4112,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4112 btrfs_set_block_group_used(&cache->item, old_val); 4140 btrfs_set_block_group_used(&cache->item, old_val);
4113 cache->reserved -= num_bytes; 4141 cache->reserved -= num_bytes;
4114 cache->space_info->bytes_reserved -= num_bytes; 4142 cache->space_info->bytes_reserved -= num_bytes;
4143 cache->space_info->reservation_progress++;
4115 cache->space_info->bytes_used += num_bytes; 4144 cache->space_info->bytes_used += num_bytes;
4116 cache->space_info->disk_used += num_bytes * factor; 4145 cache->space_info->disk_used += num_bytes * factor;
4117 spin_unlock(&cache->lock); 4146 spin_unlock(&cache->lock);
@@ -4163,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root,
4163 if (reserved) { 4192 if (reserved) {
4164 cache->reserved -= num_bytes; 4193 cache->reserved -= num_bytes;
4165 cache->space_info->bytes_reserved -= num_bytes; 4194 cache->space_info->bytes_reserved -= num_bytes;
4195 cache->space_info->reservation_progress++;
4166 } 4196 }
4167 spin_unlock(&cache->lock); 4197 spin_unlock(&cache->lock);
4168 spin_unlock(&cache->space_info->lock); 4198 spin_unlock(&cache->space_info->lock);
@@ -4213,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4213 space_info->bytes_readonly += num_bytes; 4243 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes; 4244 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes; 4245 space_info->bytes_reserved -= num_bytes;
4246 space_info->reservation_progress++;
4216 } 4247 }
4217 spin_unlock(&cache->lock); 4248 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock); 4249 spin_unlock(&space_info->lock);
@@ -4691,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4691 if (ret) { 4722 if (ret) {
4692 spin_lock(&cache->space_info->lock); 4723 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len; 4724 cache->space_info->bytes_reserved -= buf->len;
4725 cache->space_info->reservation_progress++;
4694 spin_unlock(&cache->space_info->lock); 4726 spin_unlock(&cache->space_info->lock);
4695 } 4727 }
4696 goto out; 4728 goto out;
@@ -5355,7 +5387,7 @@ again:
5355 num_bytes, data, 1); 5387 num_bytes, data, 1);
5356 goto again; 5388 goto again;
5357 } 5389 }
5358 if (ret == -ENOSPC) { 5390 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5359 struct btrfs_space_info *sinfo; 5391 struct btrfs_space_info *sinfo;
5360 5392
5361 sinfo = __find_space_info(root->fs_info, data); 5393 sinfo = __find_space_info(root->fs_info, data);
@@ -5633,6 +5665,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize) 5665 struct btrfs_root *root, u32 blocksize)
5634{ 5666{
5635 struct btrfs_block_rsv *block_rsv; 5667 struct btrfs_block_rsv *block_rsv;
5668 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5636 int ret; 5669 int ret;
5637 5670
5638 block_rsv = get_block_rsv(trans, root); 5671 block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5673,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5640 if (block_rsv->size == 0) { 5673 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv, 5674 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0); 5675 blocksize, 0);
5643 if (ret) 5676 /*
5677 * If we couldn't reserve metadata bytes try and use some from
5678 * the global reserve.
5679 */
5680 if (ret && block_rsv != global_rsv) {
5681 ret = block_rsv_use_bytes(global_rsv, blocksize);
5682 if (!ret)
5683 return global_rsv;
5684 return ERR_PTR(ret);
5685 } else if (ret) {
5644 return ERR_PTR(ret); 5686 return ERR_PTR(ret);
5687 }
5645 return block_rsv; 5688 return block_rsv;
5646 } 5689 }
5647 5690
5648 ret = block_rsv_use_bytes(block_rsv, blocksize); 5691 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret) 5692 if (!ret)
5650 return block_rsv; 5693 return block_rsv;
5694 if (ret) {
5695 WARN_ON(1);
5696 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5697 0);
5698 if (!ret) {
5699 spin_lock(&block_rsv->lock);
5700 block_rsv->size += blocksize;
5701 spin_unlock(&block_rsv->lock);
5702 return block_rsv;
5703 } else if (ret && block_rsv != global_rsv) {
5704 ret = block_rsv_use_bytes(global_rsv, blocksize);
5705 if (!ret)
5706 return global_rsv;
5707 }
5708 }
5651 5709
5652 return ERR_PTR(-ENOSPC); 5710 return ERR_PTR(-ENOSPC);
5653} 5711}
@@ -6221,6 +6279,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6221 BUG_ON(!wc); 6279 BUG_ON(!wc);
6222 6280
6223 trans = btrfs_start_transaction(tree_root, 0); 6281 trans = btrfs_start_transaction(tree_root, 0);
6282 BUG_ON(IS_ERR(trans));
6283
6224 if (block_rsv) 6284 if (block_rsv)
6225 trans->block_rsv = block_rsv; 6285 trans->block_rsv = block_rsv;
6226 6286
@@ -6318,6 +6378,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 6378
6319 btrfs_end_transaction_throttle(trans, tree_root); 6379 btrfs_end_transaction_throttle(trans, tree_root);
6320 trans = btrfs_start_transaction(tree_root, 0); 6380 trans = btrfs_start_transaction(tree_root, 0);
6381 BUG_ON(IS_ERR(trans));
6321 if (block_rsv) 6382 if (block_rsv)
6322 trans->block_rsv = block_rsv; 6383 trans->block_rsv = block_rsv;
6323 } 6384 }
@@ -6446,6 +6507,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6446 int ret = 0; 6507 int ret = 0;
6447 6508
6448 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6509 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6510 if (!ra)
6511 return -ENOMEM;
6449 6512
6450 mutex_lock(&inode->i_mutex); 6513 mutex_lock(&inode->i_mutex);
6451 first_index = start >> PAGE_CACHE_SHIFT; 6514 first_index = start >> PAGE_CACHE_SHIFT;
@@ -6531,7 +6594,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
6531 u64 end = start + extent_key->offset - 1; 6594 u64 end = start + extent_key->offset - 1;
6532 6595
6533 em = alloc_extent_map(GFP_NOFS); 6596 em = alloc_extent_map(GFP_NOFS);
6534 BUG_ON(!em || IS_ERR(em)); 6597 BUG_ON(!em);
6535 6598
6536 em->start = start; 6599 em->start = start;
6537 em->len = extent_key->offset; 6600 em->len = extent_key->offset;
@@ -7477,7 +7540,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7477 BUG_ON(reloc_root->commit_root != NULL); 7540 BUG_ON(reloc_root->commit_root != NULL);
7478 while (1) { 7541 while (1) {
7479 trans = btrfs_join_transaction(root, 1); 7542 trans = btrfs_join_transaction(root, 1);
7480 BUG_ON(!trans); 7543 BUG_ON(IS_ERR(trans));
7481 7544
7482 mutex_lock(&root->fs_info->drop_mutex); 7545 mutex_lock(&root->fs_info->drop_mutex);
7483 ret = btrfs_drop_snapshot(trans, reloc_root); 7546 ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7598,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7535 7598
7536 if (found) { 7599 if (found) {
7537 trans = btrfs_start_transaction(root, 1); 7600 trans = btrfs_start_transaction(root, 1);
7538 BUG_ON(!trans); 7601 BUG_ON(IS_ERR(trans));
7539 ret = btrfs_commit_transaction(trans, root); 7602 ret = btrfs_commit_transaction(trans, root);
7540 BUG_ON(ret); 7603 BUG_ON(ret);
7541 } 7604 }
@@ -7779,7 +7842,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7779 7842
7780 7843
7781 trans = btrfs_start_transaction(extent_root, 1); 7844 trans = btrfs_start_transaction(extent_root, 1);
7782 BUG_ON(!trans); 7845 BUG_ON(IS_ERR(trans));
7783 7846
7784 if (extent_key->objectid == 0) { 7847 if (extent_key->objectid == 0) {
7785 ret = del_extent_zero(trans, extent_root, path, extent_key); 7848 ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -8013,6 +8076,13 @@ out:
8013 return ret; 8076 return ret;
8014} 8077}
8015 8078
8079int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8080 struct btrfs_root *root, u64 type)
8081{
8082 u64 alloc_flags = get_alloc_profile(root, type);
8083 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8084}
8085
8016/* 8086/*
8017 * helper to account the unused space of all the readonly block group in the 8087 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account. 8088 * list. takes mirrors into account.
@@ -8270,6 +8340,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8270 if (block_group->cached == BTRFS_CACHE_STARTED) 8340 if (block_group->cached == BTRFS_CACHE_STARTED)
8271 wait_block_group_cache_done(block_group); 8341 wait_block_group_cache_done(block_group);
8272 8342
8343 /*
8344 * We haven't cached this block group, which means we could
8345 * possibly have excluded extents on this block group.
8346 */
8347 if (block_group->cached == BTRFS_CACHE_NO)
8348 free_excluded_extents(info->extent_root, block_group);
8349
8273 btrfs_remove_free_space_cache(block_group); 8350 btrfs_remove_free_space_cache(block_group);
8274 btrfs_put_block_group(block_group); 8351 btrfs_put_block_group(block_group);
8275 8352
@@ -8385,6 +8462,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8385 cache->sectorsize = root->sectorsize; 8462 cache->sectorsize = root->sectorsize;
8386 8463
8387 /* 8464 /*
8465 * We need to exclude the super stripes now so that the space
8466 * info has super bytes accounted for, otherwise we'll think
8467 * we have more space than we actually do.
8468 */
8469 exclude_super_stripes(root, cache);
8470
8471 /*
8388 * check for two cases, either we are full, and therefore 8472 * check for two cases, either we are full, and therefore
8389 * don't need to bother with the caching work since we won't 8473 * don't need to bother with the caching work since we won't
8390 * find any space, or we are empty, and we can just add all 8474 * find any space, or we are empty, and we can just add all
@@ -8392,12 +8476,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8392 * time, particularly in the full case. 8476 * time, particularly in the full case.
8393 */ 8477 */
8394 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8478 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8395 exclude_super_stripes(root, cache);
8396 cache->last_byte_to_unpin = (u64)-1; 8479 cache->last_byte_to_unpin = (u64)-1;
8397 cache->cached = BTRFS_CACHE_FINISHED; 8480 cache->cached = BTRFS_CACHE_FINISHED;
8398 free_excluded_extents(root, cache); 8481 free_excluded_extents(root, cache);
8399 } else if (btrfs_block_group_used(&cache->item) == 0) { 8482 } else if (btrfs_block_group_used(&cache->item) == 0) {
8400 exclude_super_stripes(root, cache);
8401 cache->last_byte_to_unpin = (u64)-1; 8483 cache->last_byte_to_unpin = (u64)-1;
8402 cache->cached = BTRFS_CACHE_FINISHED; 8484 cache->cached = BTRFS_CACHE_FINISHED;
8403 add_new_free_space(cache, root->fs_info, 8485 add_new_free_space(cache, root->fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e993cf1766e..b5b92824a271 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1433 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1434u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1435 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1436 unsigned long bits, int contig)
1437{ 1437{
1438 struct rb_node *node; 1438 struct rb_node *node;
1439 struct extent_state *state; 1439 struct extent_state *state;
1440 u64 cur_start = *start; 1440 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1441 u64 total_bytes = 0;
1442 u64 last = 0;
1442 int found = 0; 1443 int found = 0;
1443 1444
1444 if (search_end <= cur_start) { 1445 if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1464 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1465 if (state->start > search_end)
1465 break; 1466 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1467 if (contig && found && state->start > last + 1)
1468 break;
1469 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1470 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1471 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1472 if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1475 *start = state->start;
1473 found = 1; 1476 found = 1;
1474 } 1477 }
1478 last = state->end;
1479 } else if (contig && found) {
1480 break;
1475 } 1481 }
1476 node = rb_next(node); 1482 node = rb_next(node);
1477 if (!node) 1483 if (!node)
@@ -1865,7 +1871,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1865 bio_get(bio); 1871 bio_get(bio);
1866 1872
1867 if (tree->ops && tree->ops->submit_bio_hook) 1873 if (tree->ops && tree->ops->submit_bio_hook)
1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1874 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1869 mirror_num, bio_flags, start); 1875 mirror_num, bio_flags, start);
1870 else 1876 else
1871 submit_bio(rw, bio); 1877 submit_bio(rw, bio);
@@ -1920,6 +1926,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1920 nr = bio_get_nr_vecs(bdev); 1926 nr = bio_get_nr_vecs(bdev);
1921 1927
1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1928 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1929 if (!bio)
1930 return -ENOMEM;
1923 1931
1924 bio_add_page(bio, page, page_size, offset); 1932 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1933 bio->bi_end_io = end_io_func;
@@ -1944,6 +1952,7 @@ void set_page_extent_mapped(struct page *page)
1944 1952
1945static void set_page_extent_head(struct page *page, unsigned long len) 1953static void set_page_extent_head(struct page *page, unsigned long len)
1946{ 1954{
1955 WARN_ON(!PagePrivate(page));
1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1956 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1948} 1957}
1949 1958
@@ -2126,7 +2135,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2126 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2135 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2127 &bio_flags); 2136 &bio_flags);
2128 if (bio) 2137 if (bio)
2129 submit_one_bio(READ, bio, 0, bio_flags); 2138 ret = submit_one_bio(READ, bio, 0, bio_flags);
2130 return ret; 2139 return ret;
2131} 2140}
2132 2141
@@ -2179,7 +2188,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2179 unsigned long nr_written = 0; 2188 unsigned long nr_written = 0;
2180 2189
2181 if (wbc->sync_mode == WB_SYNC_ALL) 2190 if (wbc->sync_mode == WB_SYNC_ALL)
2182 write_flags = WRITE_SYNC_PLUG; 2191 write_flags = WRITE_SYNC;
2183 else 2192 else
2184 write_flags = WRITE; 2193 write_flags = WRITE;
2185 2194
@@ -2819,9 +2828,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2819 * at this point we can safely clear everything except the 2828 * at this point we can safely clear everything except the
2820 * locked bit and the nodatasum bit 2829 * locked bit and the nodatasum bit
2821 */ 2830 */
2822 clear_extent_bit(tree, start, end, 2831 ret = clear_extent_bit(tree, start, end,
2823 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2832 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2824 0, 0, NULL, mask); 2833 0, 0, NULL, mask);
2834
2835 /* if clear_extent_bit failed for enomem reasons,
2836 * we can't allow the release to continue.
2837 */
2838 if (ret < 0)
2839 ret = 0;
2840 else
2841 ret = 1;
2825 } 2842 }
2826 return ret; 2843 return ret;
2827} 2844}
@@ -2901,6 +2918,46 @@ out:
2901 return sector; 2918 return sector;
2902} 2919}
2903 2920
2921/*
2922 * helper function for fiemap, which doesn't want to see any holes.
2923 * This maps until we find something past 'last'
2924 */
2925static struct extent_map *get_extent_skip_holes(struct inode *inode,
2926 u64 offset,
2927 u64 last,
2928 get_extent_t *get_extent)
2929{
2930 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2931 struct extent_map *em;
2932 u64 len;
2933
2934 if (offset >= last)
2935 return NULL;
2936
2937 while(1) {
2938 len = last - offset;
2939 if (len == 0)
2940 break;
2941 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2942 em = get_extent(inode, NULL, 0, offset, len, 0);
2943 if (!em || IS_ERR(em))
2944 return em;
2945
2946 /* if this isn't a hole return it */
2947 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2948 em->block_start != EXTENT_MAP_HOLE) {
2949 return em;
2950 }
2951
2952 /* this is a hole, advance to the next extent */
2953 offset = extent_map_end(em);
2954 free_extent_map(em);
2955 if (offset >= last)
2956 break;
2957 }
2958 return NULL;
2959}
2960
2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2961int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2905 __u64 start, __u64 len, get_extent_t *get_extent) 2962 __u64 start, __u64 len, get_extent_t *get_extent)
2906{ 2963{
@@ -2910,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2910 u32 flags = 0; 2967 u32 flags = 0;
2911 u32 found_type; 2968 u32 found_type;
2912 u64 last; 2969 u64 last;
2970 u64 last_for_get_extent = 0;
2913 u64 disko = 0; 2971 u64 disko = 0;
2972 u64 isize = i_size_read(inode);
2914 struct btrfs_key found_key; 2973 struct btrfs_key found_key;
2915 struct extent_map *em = NULL; 2974 struct extent_map *em = NULL;
2916 struct extent_state *cached_state = NULL; 2975 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path; 2976 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item; 2977 struct btrfs_file_extent_item *item;
2919 int end = 0; 2978 int end = 0;
2920 u64 em_start = 0, em_len = 0; 2979 u64 em_start = 0;
2980 u64 em_len = 0;
2981 u64 em_end = 0;
2921 unsigned long emflags; 2982 unsigned long emflags;
2922 int hole = 0;
2923 2983
2924 if (len == 0) 2984 if (len == 0)
2925 return -EINVAL; 2985 return -EINVAL;
@@ -2929,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2929 return -ENOMEM; 2989 return -ENOMEM;
2930 path->leave_spinning = 1; 2990 path->leave_spinning = 1;
2931 2991
2992 /*
2993 * lookup the last file extent. We're not using i_size here
2994 * because there might be preallocation past i_size
2995 */
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2996 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0); 2997 path, inode->i_ino, -1, 0);
2934 if (ret < 0) { 2998 if (ret < 0) {
@@ -2942,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3006 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key); 3007 found_type = btrfs_key_type(&found_key);
2944 3008
2945 /* No extents, just return */ 3009 /* No extents, but there might be delalloc bits */
2946 if (found_key.objectid != inode->i_ino || 3010 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) { 3011 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path); 3012 /* have to trust i_size as the end */
2949 return 0; 3013 last = (u64)-1;
3014 last_for_get_extent = isize;
3015 } else {
3016 /*
3017 * remember the start of the last extent. There are a
3018 * bunch of different factors that go into the length of the
3019 * extent, so its much less complex to remember where it started
3020 */
3021 last = found_key.offset;
3022 last_for_get_extent = last + 1;
2950 } 3023 }
2951 last = found_key.offset;
2952 btrfs_free_path(path); 3024 btrfs_free_path(path);
2953 3025
3026 /*
3027 * we might have some extents allocated but more delalloc past those
3028 * extents. so, we trust isize unless the start of the last extent is
3029 * beyond isize
3030 */
3031 if (last < isize) {
3032 last = (u64)-1;
3033 last_for_get_extent = isize;
3034 }
3035
2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3036 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2955 &cached_state, GFP_NOFS); 3037 &cached_state, GFP_NOFS);
2956 em = get_extent(inode, NULL, 0, off, max - off, 0); 3038
3039 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3040 get_extent);
2957 if (!em) 3041 if (!em)
2958 goto out; 3042 goto out;
2959 if (IS_ERR(em)) { 3043 if (IS_ERR(em)) {
@@ -2962,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2962 } 3046 }
2963 3047
2964 while (!end) { 3048 while (!end) {
2965 hole = 0; 3049 u64 offset_in_extent;
2966 off = em->start + em->len;
2967 if (off >= max)
2968 end = 1;
2969 3050
2970 if (em->block_start == EXTENT_MAP_HOLE) { 3051 /* break if the extent we found is outside the range */
2971 hole = 1; 3052 if (em->start >= max || extent_map_end(em) < off)
2972 goto next; 3053 break;
2973 }
2974 3054
2975 em_start = em->start; 3055 /*
2976 em_len = em->len; 3056 * get_extent may return an extent that starts before our
3057 * requested range. We have to make sure the ranges
3058 * we return to fiemap always move forward and don't
3059 * overlap, so adjust the offsets here
3060 */
3061 em_start = max(em->start, off);
2977 3062
3063 /*
3064 * record the offset from the start of the extent
3065 * for adjusting the disk offset below
3066 */
3067 offset_in_extent = em_start - em->start;
3068 em_end = extent_map_end(em);
3069 em_len = em_end - em_start;
3070 emflags = em->flags;
2978 disko = 0; 3071 disko = 0;
2979 flags = 0; 3072 flags = 0;
2980 3073
3074 /*
3075 * bump off for our next call to get_extent
3076 */
3077 off = extent_map_end(em);
3078 if (off >= max)
3079 end = 1;
3080
2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2982 end = 1; 3082 end = 1;
2983 flags |= FIEMAP_EXTENT_LAST; 3083 flags |= FIEMAP_EXTENT_LAST;
@@ -2988,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2988 flags |= (FIEMAP_EXTENT_DELALLOC | 3088 flags |= (FIEMAP_EXTENT_DELALLOC |
2989 FIEMAP_EXTENT_UNKNOWN); 3089 FIEMAP_EXTENT_UNKNOWN);
2990 } else { 3090 } else {
2991 disko = em->block_start; 3091 disko = em->block_start + offset_in_extent;
2992 } 3092 }
2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2994 flags |= FIEMAP_EXTENT_ENCODED; 3094 flags |= FIEMAP_EXTENT_ENCODED;
2995 3095
2996next:
2997 emflags = em->flags;
2998 free_extent_map(em); 3096 free_extent_map(em);
2999 em = NULL; 3097 em = NULL;
3000 if (!end) { 3098 if ((em_start >= last) || em_len == (u64)-1 ||
3001 em = get_extent(inode, NULL, 0, off, max - off, 0); 3099 (last == (u64)-1 && isize <= em_end)) {
3002 if (!em)
3003 goto out;
3004 if (IS_ERR(em)) {
3005 ret = PTR_ERR(em);
3006 goto out;
3007 }
3008 emflags = em->flags;
3009 }
3010
3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3012 flags |= FIEMAP_EXTENT_LAST; 3100 flags |= FIEMAP_EXTENT_LAST;
3013 end = 1; 3101 end = 1;
3014 } 3102 }
3015 3103
3016 if (em_start == last) { 3104 /* now scan forward to see if this is really the last extent. */
3105 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3106 get_extent);
3107 if (IS_ERR(em)) {
3108 ret = PTR_ERR(em);
3109 goto out;
3110 }
3111 if (!em) {
3017 flags |= FIEMAP_EXTENT_LAST; 3112 flags |= FIEMAP_EXTENT_LAST;
3018 end = 1; 3113 end = 1;
3019 } 3114 }
3020 3115 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3021 if (!hole) { 3116 em_len, flags);
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3117 if (ret)
3023 em_len, flags); 3118 goto out_free;
3024 if (ret)
3025 goto out_free;
3026 }
3027 } 3119 }
3028out_free: 3120out_free:
3029 free_extent_map(em); 3121 free_extent_map(em);
@@ -3192,7 +3284,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3192 } 3284 }
3193 if (!PageUptodate(p)) 3285 if (!PageUptodate(p))
3194 uptodate = 0; 3286 uptodate = 0;
3195 unlock_page(p); 3287
3288 /*
3289 * see below about how we avoid a nasty race with release page
3290 * and why we unlock later
3291 */
3292 if (i != 0)
3293 unlock_page(p);
3196 } 3294 }
3197 if (uptodate) 3295 if (uptodate)
3198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3296 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3216,9 +3314,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3216 atomic_inc(&eb->refs); 3314 atomic_inc(&eb->refs);
3217 spin_unlock(&tree->buffer_lock); 3315 spin_unlock(&tree->buffer_lock);
3218 radix_tree_preload_end(); 3316 radix_tree_preload_end();
3317
3318 /*
3319 * there is a race where release page may have
3320 * tried to find this extent buffer in the radix
3321 * but failed. It will tell the VM it is safe to
3322 * reclaim the, and it will clear the page private bit.
3323 * We must make sure to set the page private bit properly
3324 * after the extent buffer is in the radix tree so
3325 * it doesn't get lost
3326 */
3327 set_page_extent_mapped(eb->first_page);
3328 set_page_extent_head(eb->first_page, eb->len);
3329 if (!page0)
3330 unlock_page(eb->first_page);
3219 return eb; 3331 return eb;
3220 3332
3221free_eb: 3333free_eb:
3334 if (eb->first_page && !page0)
3335 unlock_page(eb->first_page);
3336
3222 if (!atomic_dec_and_test(&eb->refs)) 3337 if (!atomic_dec_and_test(&eb->refs))
3223 return exists; 3338 return exists;
3224 btrfs_release_extent_buffer(eb); 3339 btrfs_release_extent_buffer(eb);
@@ -3269,10 +3384,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3269 continue; 3384 continue;
3270 3385
3271 lock_page(page); 3386 lock_page(page);
3387 WARN_ON(!PagePrivate(page));
3388
3389 set_page_extent_mapped(page);
3272 if (i == 0) 3390 if (i == 0)
3273 set_page_extent_head(page, eb->len); 3391 set_page_extent_head(page, eb->len);
3274 else
3275 set_page_private(page, EXTENT_PAGE_PRIVATE);
3276 3392
3277 clear_page_dirty_for_io(page); 3393 clear_page_dirty_for_io(page);
3278 spin_lock_irq(&page->mapping->tree_lock); 3394 spin_lock_irq(&page->mapping->tree_lock);
@@ -3462,6 +3578,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3462 3578
3463 for (i = start_i; i < num_pages; i++) { 3579 for (i = start_i; i < num_pages; i++) {
3464 page = extent_buffer_page(eb, i); 3580 page = extent_buffer_page(eb, i);
3581
3582 WARN_ON(!PagePrivate(page));
3583
3584 set_page_extent_mapped(page);
3585 if (i == 0)
3586 set_page_extent_head(page, eb->len);
3587
3465 if (inc_all_pages) 3588 if (inc_all_pages)
3466 page_cache_get(page); 3589 page_cache_get(page);
3467 if (!PageUptodate(page)) { 3590 if (!PageUptodate(page)) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..9318dfefd59c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
191 191
192u64 count_range_bits(struct extent_io_tree *tree, 192u64 count_range_bits(struct extent_io_tree *tree,
193 u64 *start, u64 search_end, 193 u64 *start, u64 search_end,
194 u64 max_bytes, unsigned long bits); 194 u64 max_bytes, unsigned long bits, int contig);
195 195
196void free_extent_state(struct extent_state *state); 196void free_extent_state(struct extent_state *state);
197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..2b6c12e983b3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
51{ 51{
52 struct extent_map *em; 52 struct extent_map *em;
53 em = kmem_cache_alloc(extent_map_cache, mask); 53 em = kmem_cache_alloc(extent_map_cache, mask);
54 if (!em || IS_ERR(em)) 54 if (!em)
55 return em; 55 return NULL;
56 em->in_tree = 0; 56 em->in_tree = 0;
57 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE; 58 em->compress_type = BTRFS_COMPRESS_NONE;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..4f19a3e1bf32 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 536 root = root->fs_info->csum_root;
537 537
538 path = btrfs_alloc_path(); 538 path = btrfs_alloc_path();
539 if (!path)
540 return -ENOMEM;
539 541
540 while (1) { 542 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 543 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
548 if (path->slots[0] == 0) 550 if (path->slots[0] == 0)
549 goto out; 551 goto out;
550 path->slots[0]--; 552 path->slots[0]--;
553 } else if (ret < 0) {
554 goto out;
551 } 555 }
556
552 leaf = path->nodes[0]; 557 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 558 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 559
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c800d58f3013..f447b783bb84 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
70 70
71 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
72 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
73 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
74 write_bytes -= copied; 87 write_bytes -= copied;
75 total_copied += copied; 88 total_copied += copied;
@@ -186,6 +199,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
186 split = alloc_extent_map(GFP_NOFS); 199 split = alloc_extent_map(GFP_NOFS);
187 if (!split2) 200 if (!split2)
188 split2 = alloc_extent_map(GFP_NOFS); 201 split2 = alloc_extent_map(GFP_NOFS);
202 BUG_ON(!split || !split2);
189 203
190 write_lock(&em_tree->lock); 204 write_lock(&em_tree->lock);
191 em = lookup_extent_mapping(em_tree, start, len); 205 em = lookup_extent_mapping(em_tree, start, len);
@@ -762,6 +776,27 @@ out:
762} 776}
763 777
764/* 778/*
779 * on error we return an unlocked page and the error value
780 * on success we return a locked page and 0
781 */
782static int prepare_uptodate_page(struct page *page, u64 pos)
783{
784 int ret = 0;
785
786 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
787 ret = btrfs_readpage(NULL, page);
788 if (ret)
789 return ret;
790 lock_page(page);
791 if (!PageUptodate(page)) {
792 unlock_page(page);
793 return -EIO;
794 }
795 }
796 return 0;
797}
798
799/*
765 * this gets pages into the page cache and locks them down, it also properly 800 * this gets pages into the page cache and locks them down, it also properly
766 * waits for data=ordered extents to finish before allowing the pages to be 801 * waits for data=ordered extents to finish before allowing the pages to be
767 * modified. 802 * modified.
@@ -776,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 unsigned long index = pos >> PAGE_CACHE_SHIFT; 811 unsigned long index = pos >> PAGE_CACHE_SHIFT;
777 struct inode *inode = fdentry(file)->d_inode; 812 struct inode *inode = fdentry(file)->d_inode;
778 int err = 0; 813 int err = 0;
814 int faili = 0;
779 u64 start_pos; 815 u64 start_pos;
780 u64 last_pos; 816 u64 last_pos;
781 817
@@ -793,11 +829,24 @@ again:
793 for (i = 0; i < num_pages; i++) { 829 for (i = 0; i < num_pages; i++) {
794 pages[i] = grab_cache_page(inode->i_mapping, index + i); 830 pages[i] = grab_cache_page(inode->i_mapping, index + i);
795 if (!pages[i]) { 831 if (!pages[i]) {
832 faili = i - 1;
796 err = -ENOMEM; 833 err = -ENOMEM;
797 BUG_ON(1); 834 goto fail;
835 }
836
837 if (i == 0)
838 err = prepare_uptodate_page(pages[i], pos);
839 if (i == num_pages - 1)
840 err = prepare_uptodate_page(pages[i],
841 pos + write_bytes);
842 if (err) {
843 page_cache_release(pages[i]);
844 faili = i - 1;
845 goto fail;
798 } 846 }
799 wait_on_page_writeback(pages[i]); 847 wait_on_page_writeback(pages[i]);
800 } 848 }
849 err = 0;
801 if (start_pos < inode->i_size) { 850 if (start_pos < inode->i_size) {
802 struct btrfs_ordered_extent *ordered; 851 struct btrfs_ordered_extent *ordered;
803 lock_extent_bits(&BTRFS_I(inode)->io_tree, 852 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -837,6 +886,14 @@ again:
837 WARN_ON(!PageLocked(pages[i])); 886 WARN_ON(!PageLocked(pages[i]));
838 } 887 }
839 return 0; 888 return 0;
889fail:
890 while (faili >= 0) {
891 unlock_page(pages[faili]);
892 page_cache_release(pages[faili]);
893 faili--;
894 }
895 return err;
896
840} 897}
841 898
842static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 899static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -846,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
846 struct file *file = iocb->ki_filp; 903 struct file *file = iocb->ki_filp;
847 struct inode *inode = fdentry(file)->d_inode; 904 struct inode *inode = fdentry(file)->d_inode;
848 struct btrfs_root *root = BTRFS_I(inode)->root; 905 struct btrfs_root *root = BTRFS_I(inode)->root;
849 struct page *pinned[2];
850 struct page **pages = NULL; 906 struct page **pages = NULL;
851 struct iov_iter i; 907 struct iov_iter i;
852 loff_t *ppos = &iocb->ki_pos; 908 loff_t *ppos = &iocb->ki_pos;
@@ -867,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
868 (file->f_flags & O_DIRECT)); 924 (file->f_flags & O_DIRECT));
869 925
870 pinned[0] = NULL;
871 pinned[1] = NULL;
872
873 start_pos = pos; 926 start_pos = pos;
874 927
875 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -946,6 +999,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
946 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 999 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
947 (sizeof(struct page *))); 1000 (sizeof(struct page *)));
948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1001 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1002 if (!pages) {
1003 ret = -ENOMEM;
1004 goto out;
1005 }
949 1006
950 /* generic_write_checks can change our pos */ 1007 /* generic_write_checks can change our pos */
951 start_pos = pos; 1008 start_pos = pos;
@@ -953,39 +1010,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
953 first_index = pos >> PAGE_CACHE_SHIFT; 1010 first_index = pos >> PAGE_CACHE_SHIFT;
954 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
955 1012
956 /*
957 * there are lots of better ways to do this, but this code
958 * makes sure the first and last page in the file range are
959 * up to date and ready for cow
960 */
961 if ((pos & (PAGE_CACHE_SIZE - 1))) {
962 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
963 if (!PageUptodate(pinned[0])) {
964 ret = btrfs_readpage(NULL, pinned[0]);
965 BUG_ON(ret);
966 wait_on_page_locked(pinned[0]);
967 } else {
968 unlock_page(pinned[0]);
969 }
970 }
971 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
972 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
973 if (!PageUptodate(pinned[1])) {
974 ret = btrfs_readpage(NULL, pinned[1]);
975 BUG_ON(ret);
976 wait_on_page_locked(pinned[1]);
977 } else {
978 unlock_page(pinned[1]);
979 }
980 }
981
982 while (iov_iter_count(&i) > 0) { 1013 while (iov_iter_count(&i) > 0) {
983 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
984 size_t write_bytes = min(iov_iter_count(&i), 1015 size_t write_bytes = min(iov_iter_count(&i),
985 nrptrs * (size_t)PAGE_CACHE_SIZE - 1016 nrptrs * (size_t)PAGE_CACHE_SIZE -
986 offset); 1017 offset);
987 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 1018 size_t num_pages = (write_bytes + offset +
988 PAGE_CACHE_SHIFT; 1019 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
989 1020
990 WARN_ON(num_pages > nrptrs); 1021 WARN_ON(num_pages > nrptrs);
991 memset(pages, 0, sizeof(struct page *) * nrptrs); 1022 memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1015,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1015 1046
1016 copied = btrfs_copy_from_user(pos, num_pages, 1047 copied = btrfs_copy_from_user(pos, num_pages,
1017 write_bytes, pages, &i); 1048 write_bytes, pages, &i);
1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> 1049
1019 PAGE_CACHE_SHIFT; 1050 /*
1051 * if we have trouble faulting in the pages, fall
1052 * back to one page at a time
1053 */
1054 if (copied < write_bytes)
1055 nrptrs = 1;
1056
1057 if (copied == 0)
1058 dirty_pages = 0;
1059 else
1060 dirty_pages = (copied + offset +
1061 PAGE_CACHE_SIZE - 1) >>
1062 PAGE_CACHE_SHIFT;
1020 1063
1021 if (num_pages > dirty_pages) { 1064 if (num_pages > dirty_pages) {
1022 if (copied > 0) 1065 if (copied > 0)
@@ -1060,10 +1103,6 @@ out:
1060 err = ret; 1103 err = ret;
1061 1104
1062 kfree(pages); 1105 kfree(pages);
1063 if (pinned[0])
1064 page_cache_release(pinned[0]);
1065 if (pinned[1])
1066 page_cache_release(pinned[1]);
1067 *ppos = pos; 1106 *ppos = pos;
1068 1107
1069 /* 1108 /*
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..a0390657451b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 987 return entry;
988} 988}
989 989
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 990static inline void
991 struct btrfs_free_space *info) 991__unlink_free_space(struct btrfs_block_group_cache *block_group,
992 struct btrfs_free_space *info)
992{ 993{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 994 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 995 block_group->free_extents--;
996}
997
998static void unlink_free_space(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_space *info)
1000{
1001 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1002 block_group->free_space -= info->bytes;
996} 1003}
997 1004
@@ -1016,14 +1023,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1023 u64 max_bytes;
1017 u64 bitmap_bytes; 1024 u64 bitmap_bytes;
1018 u64 extent_bytes; 1025 u64 extent_bytes;
1026 u64 size = block_group->key.offset;
1019 1027
1020 /* 1028 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1029 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1030 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1031 * used by extent based free space tracking
1024 */ 1032 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1033 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1034 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1035 else
1036 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1037 div64_u64(size, 1024 * 1024 * 1024);
1027 1038
1028 /* 1039 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1040 * we want to account for 1 more bitmap than what we have so we can make
@@ -1171,6 +1182,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1182 recalculate_thresholds(block_group);
1172} 1183}
1173 1184
1185static void free_bitmap(struct btrfs_block_group_cache *block_group,
1186 struct btrfs_free_space *bitmap_info)
1187{
1188 unlink_free_space(block_group, bitmap_info);
1189 kfree(bitmap_info->bitmap);
1190 kfree(bitmap_info);
1191 block_group->total_bitmaps--;
1192 recalculate_thresholds(block_group);
1193}
1194
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1195static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1196 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1197 u64 *offset, u64 *bytes)
@@ -1195,6 +1216,7 @@ again:
1195 */ 1216 */
1196 search_start = *offset; 1217 search_start = *offset;
1197 search_bytes = *bytes; 1218 search_bytes = *bytes;
1219 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1220 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1221 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1222 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1233,8 @@ again:
1211 1233
1212 if (*bytes) { 1234 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1235 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1236 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1237 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1238
1222 /* 1239 /*
1223 * no entry after this bitmap, but we still have bytes to 1240 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1267,8 @@ again:
1250 return -EAGAIN; 1267 return -EAGAIN;
1251 1268
1252 goto again; 1269 goto again;
1253 } else if (!bitmap_info->bytes) { 1270 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1271 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1272
1261 return 0; 1273 return 0;
1262} 1274}
@@ -1359,22 +1371,14 @@ out:
1359 return ret; 1371 return ret;
1360} 1372}
1361 1373
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1374bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1375 struct btrfs_free_space *info, bool update_stat)
1364{ 1376{
1365 struct btrfs_free_space *right_info = NULL; 1377 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1378 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1379 bool merged = false;
1368 int ret = 0; 1380 u64 offset = info->offset;
1369 1381 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1382
1379 /* 1383 /*
1380 * first we want to see if there is free space adjacent to the range we 1384 * first we want to see if there is free space adjacent to the range we
@@ -1388,37 +1392,62 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1392 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1393 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1394
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1395 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1396 if (update_stat)
1397 unlink_free_space(block_group, right_info);
1398 else
1399 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1400 info->bytes += right_info->bytes;
1411 kfree(right_info); 1401 kfree(right_info);
1402 merged = true;
1412 } 1403 }
1413 1404
1414 if (left_info && !left_info->bitmap && 1405 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1406 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1407 if (update_stat)
1408 unlink_free_space(block_group, left_info);
1409 else
1410 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1411 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1412 info->bytes += left_info->bytes;
1419 kfree(left_info); 1413 kfree(left_info);
1414 merged = true;
1420 } 1415 }
1421 1416
1417 return merged;
1418}
1419
1420int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1421 u64 offset, u64 bytes)
1422{
1423 struct btrfs_free_space *info;
1424 int ret = 0;
1425
1426 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1427 if (!info)
1428 return -ENOMEM;
1429
1430 info->offset = offset;
1431 info->bytes = bytes;
1432
1433 spin_lock(&block_group->tree_lock);
1434
1435 if (try_merge_free_space(block_group, info, true))
1436 goto link;
1437
1438 /*
1439 * There was no extent directly to the left or right of this new
1440 * extent then we know we're going to have to allocate a new extent, so
1441 * before we do that see if we need to drop this into a bitmap
1442 */
1443 ret = insert_into_bitmap(block_group, info);
1444 if (ret < 0) {
1445 goto out;
1446 } else if (ret) {
1447 ret = 0;
1448 goto out;
1449 }
1450link:
1422 ret = link_free_space(block_group, info); 1451 ret = link_free_space(block_group, info);
1423 if (ret) 1452 if (ret)
1424 kfree(info); 1453 kfree(info);
@@ -1621,6 +1650,7 @@ __btrfs_return_cluster_to_free_space(
1621 node = rb_next(&entry->offset_index); 1650 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1651 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1652 BUG_ON(entry->bitmap);
1653 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1654 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1655 entry->offset, &entry->offset_index, 0);
1626 } 1656 }
@@ -1685,13 +1715,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1715 ret = offset;
1686 if (entry->bitmap) { 1716 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1717 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1718 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1719 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1720 } else {
1696 unlink_free_space(block_group, entry); 1721 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1722 entry->offset += bytes;
@@ -1789,6 +1814,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1789 1814
1790 ret = search_start; 1815 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1816 bitmap_clear_bits(block_group, entry, ret, bytes);
1817 if (entry->bytes == 0)
1818 free_bitmap(block_group, entry);
1792out: 1819out:
1793 spin_unlock(&cluster->lock); 1820 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock); 1821 spin_unlock(&block_group->tree_lock);
@@ -1842,15 +1869,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1842 entry->offset += bytes; 1869 entry->offset += bytes;
1843 entry->bytes -= bytes; 1870 entry->bytes -= bytes;
1844 1871
1845 if (entry->bytes == 0) { 1872 if (entry->bytes == 0)
1846 rb_erase(&entry->offset_index, &cluster->root); 1873 rb_erase(&entry->offset_index, &cluster->root);
1847 kfree(entry);
1848 }
1849 break; 1874 break;
1850 } 1875 }
1851out: 1876out:
1852 spin_unlock(&cluster->lock); 1877 spin_unlock(&cluster->lock);
1853 1878
1879 if (!ret)
1880 return 0;
1881
1882 spin_lock(&block_group->tree_lock);
1883
1884 block_group->free_space -= bytes;
1885 if (entry->bytes == 0) {
1886 block_group->free_extents--;
1887 kfree(entry);
1888 }
1889
1890 spin_unlock(&block_group->tree_lock);
1891
1854 return ret; 1892 return ret;
1855} 1893}
1856 1894
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 160b55b3e132..119520bdb9a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 90 unsigned long *nr_written, int unlock);
91 91
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 93 struct inode *inode, struct inode *dir,
94 const struct qstr *qstr)
94{ 95{
95 int err; 96 int err;
96 97
97 err = btrfs_init_acl(trans, inode, dir); 98 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 99 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 100 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 101 return err;
101} 102}
102 103
@@ -416,7 +417,7 @@ again:
416 } 417 }
417 if (start == 0) { 418 if (start == 0) {
418 trans = btrfs_join_transaction(root, 1); 419 trans = btrfs_join_transaction(root, 1);
419 BUG_ON(!trans); 420 BUG_ON(IS_ERR(trans));
420 btrfs_set_trans_block_group(trans, inode); 421 btrfs_set_trans_block_group(trans, inode);
421 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 422 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
422 423
@@ -612,6 +613,7 @@ retry:
612 GFP_NOFS); 613 GFP_NOFS);
613 614
614 trans = btrfs_join_transaction(root, 1); 615 trans = btrfs_join_transaction(root, 1);
616 BUG_ON(IS_ERR(trans));
615 ret = btrfs_reserve_extent(trans, root, 617 ret = btrfs_reserve_extent(trans, root,
616 async_extent->compressed_size, 618 async_extent->compressed_size,
617 async_extent->compressed_size, 619 async_extent->compressed_size,
@@ -643,6 +645,7 @@ retry:
643 async_extent->ram_size - 1, 0); 645 async_extent->ram_size - 1, 0);
644 646
645 em = alloc_extent_map(GFP_NOFS); 647 em = alloc_extent_map(GFP_NOFS);
648 BUG_ON(!em);
646 em->start = async_extent->start; 649 em->start = async_extent->start;
647 em->len = async_extent->ram_size; 650 em->len = async_extent->ram_size;
648 em->orig_start = em->start; 651 em->orig_start = em->start;
@@ -771,7 +774,7 @@ static noinline int cow_file_range(struct inode *inode,
771 774
772 BUG_ON(root == root->fs_info->tree_root); 775 BUG_ON(root == root->fs_info->tree_root);
773 trans = btrfs_join_transaction(root, 1); 776 trans = btrfs_join_transaction(root, 1);
774 BUG_ON(!trans); 777 BUG_ON(IS_ERR(trans));
775 btrfs_set_trans_block_group(trans, inode); 778 btrfs_set_trans_block_group(trans, inode);
776 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 779 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
777 780
@@ -819,6 +822,7 @@ static noinline int cow_file_range(struct inode *inode,
819 BUG_ON(ret); 822 BUG_ON(ret);
820 823
821 em = alloc_extent_map(GFP_NOFS); 824 em = alloc_extent_map(GFP_NOFS);
825 BUG_ON(!em);
822 em->start = start; 826 em->start = start;
823 em->orig_start = em->start; 827 em->orig_start = em->start;
824 ram_size = ins.offset; 828 ram_size = ins.offset;
@@ -1049,7 +1053,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1049 } else { 1053 } else {
1050 trans = btrfs_join_transaction(root, 1); 1054 trans = btrfs_join_transaction(root, 1);
1051 } 1055 }
1052 BUG_ON(!trans); 1056 BUG_ON(IS_ERR(trans));
1053 1057
1054 cow_start = (u64)-1; 1058 cow_start = (u64)-1;
1055 cur_offset = start; 1059 cur_offset = start;
@@ -1168,6 +1172,7 @@ out_check:
1168 struct extent_map_tree *em_tree; 1172 struct extent_map_tree *em_tree;
1169 em_tree = &BTRFS_I(inode)->extent_tree; 1173 em_tree = &BTRFS_I(inode)->extent_tree;
1170 em = alloc_extent_map(GFP_NOFS); 1174 em = alloc_extent_map(GFP_NOFS);
1175 BUG_ON(!em);
1171 em->start = cur_offset; 1176 em->start = cur_offset;
1172 em->orig_start = em->start; 1177 em->orig_start = em->start;
1173 em->len = num_bytes; 1178 em->len = num_bytes;
@@ -1557,6 +1562,7 @@ out:
1557out_page: 1562out_page:
1558 unlock_page(page); 1563 unlock_page(page);
1559 page_cache_release(page); 1564 page_cache_release(page);
1565 kfree(fixup);
1560} 1566}
1561 1567
1562/* 1568/*
@@ -1703,7 +1709,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1703 trans = btrfs_join_transaction_nolock(root, 1); 1709 trans = btrfs_join_transaction_nolock(root, 1);
1704 else 1710 else
1705 trans = btrfs_join_transaction(root, 1); 1711 trans = btrfs_join_transaction(root, 1);
1706 BUG_ON(!trans); 1712 BUG_ON(IS_ERR(trans));
1707 btrfs_set_trans_block_group(trans, inode); 1713 btrfs_set_trans_block_group(trans, inode);
1708 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1714 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1709 ret = btrfs_update_inode(trans, root, inode); 1715 ret = btrfs_update_inode(trans, root, inode);
@@ -1720,6 +1726,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 trans = btrfs_join_transaction_nolock(root, 1); 1726 trans = btrfs_join_transaction_nolock(root, 1);
1721 else 1727 else
1722 trans = btrfs_join_transaction(root, 1); 1728 trans = btrfs_join_transaction(root, 1);
1729 BUG_ON(IS_ERR(trans));
1723 btrfs_set_trans_block_group(trans, inode); 1730 btrfs_set_trans_block_group(trans, inode);
1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1731 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1725 1732
@@ -1907,7 +1914,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1907 1914
1908 private = 0; 1915 private = 0;
1909 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1916 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1910 (u64)-1, 1, EXTENT_DIRTY)) { 1917 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1911 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1918 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1912 start, &private_failure); 1919 start, &private_failure);
1913 if (ret == 0) { 1920 if (ret == 0) {
@@ -2354,6 +2361,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2354 */ 2361 */
2355 if (is_bad_inode(inode)) { 2362 if (is_bad_inode(inode)) {
2356 trans = btrfs_start_transaction(root, 0); 2363 trans = btrfs_start_transaction(root, 0);
2364 BUG_ON(IS_ERR(trans));
2357 btrfs_orphan_del(trans, inode); 2365 btrfs_orphan_del(trans, inode);
2358 btrfs_end_transaction(trans, root); 2366 btrfs_end_transaction(trans, root);
2359 iput(inode); 2367 iput(inode);
@@ -2381,6 +2389,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2381 2389
2382 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2390 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2383 trans = btrfs_join_transaction(root, 1); 2391 trans = btrfs_join_transaction(root, 1);
2392 BUG_ON(IS_ERR(trans));
2384 btrfs_end_transaction(trans, root); 2393 btrfs_end_transaction(trans, root);
2385 } 2394 }
2386 2395
@@ -2641,7 +2650,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2641 path = btrfs_alloc_path(); 2650 path = btrfs_alloc_path();
2642 if (!path) { 2651 if (!path) {
2643 ret = -ENOMEM; 2652 ret = -ENOMEM;
2644 goto err; 2653 goto out;
2645 } 2654 }
2646 2655
2647 path->leave_spinning = 1; 2656 path->leave_spinning = 1;
@@ -2714,9 +2723,10 @@ static int check_path_shared(struct btrfs_root *root,
2714 struct extent_buffer *eb; 2723 struct extent_buffer *eb;
2715 int level; 2724 int level;
2716 u64 refs = 1; 2725 u64 refs = 1;
2717 int uninitialized_var(ret);
2718 2726
2719 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2727 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2728 int ret;
2729
2720 if (!path->nodes[level]) 2730 if (!path->nodes[level])
2721 break; 2731 break;
2722 eb = path->nodes[level]; 2732 eb = path->nodes[level];
@@ -2727,7 +2737,7 @@ static int check_path_shared(struct btrfs_root *root,
2727 if (refs > 1) 2737 if (refs > 1)
2728 return 1; 2738 return 1;
2729 } 2739 }
2730 return ret; /* XXX callers? */ 2740 return 0;
2731} 2741}
2732 2742
2733/* 2743/*
@@ -4134,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4134 } 4144 }
4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4145 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4136 4146
4137 if (root != sub_root) { 4147 if (!IS_ERR(inode) && root != sub_root) {
4138 down_read(&root->fs_info->cleanup_work_sem); 4148 down_read(&root->fs_info->cleanup_work_sem);
4139 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4149 if (!(inode->i_sb->s_flags & MS_RDONLY))
4140 btrfs_orphan_cleanup(sub_root); 4150 btrfs_orphan_cleanup(sub_root);
@@ -4347,6 +4357,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4347 trans = btrfs_join_transaction_nolock(root, 1); 4357 trans = btrfs_join_transaction_nolock(root, 1);
4348 else 4358 else
4349 trans = btrfs_join_transaction(root, 1); 4359 trans = btrfs_join_transaction(root, 1);
4360 if (IS_ERR(trans))
4361 return PTR_ERR(trans);
4350 btrfs_set_trans_block_group(trans, inode); 4362 btrfs_set_trans_block_group(trans, inode);
4351 if (nolock) 4363 if (nolock)
4352 ret = btrfs_end_transaction_nolock(trans, root); 4364 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4372,6 +4384,7 @@ void btrfs_dirty_inode(struct inode *inode)
4372 return; 4384 return;
4373 4385
4374 trans = btrfs_join_transaction(root, 1); 4386 trans = btrfs_join_transaction(root, 1);
4387 BUG_ON(IS_ERR(trans));
4375 btrfs_set_trans_block_group(trans, inode); 4388 btrfs_set_trans_block_group(trans, inode);
4376 4389
4377 ret = btrfs_update_inode(trans, root, inode); 4390 ret = btrfs_update_inode(trans, root, inode);
@@ -4692,7 +4705,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4692 if (IS_ERR(inode)) 4705 if (IS_ERR(inode))
4693 goto out_unlock; 4706 goto out_unlock;
4694 4707
4695 err = btrfs_init_inode_security(trans, inode, dir); 4708 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4696 if (err) { 4709 if (err) {
4697 drop_inode = 1; 4710 drop_inode = 1;
4698 goto out_unlock; 4711 goto out_unlock;
@@ -4753,7 +4766,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4753 if (IS_ERR(inode)) 4766 if (IS_ERR(inode))
4754 goto out_unlock; 4767 goto out_unlock;
4755 4768
4756 err = btrfs_init_inode_security(trans, inode, dir); 4769 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4757 if (err) { 4770 if (err) {
4758 drop_inode = 1; 4771 drop_inode = 1;
4759 goto out_unlock; 4772 goto out_unlock;
@@ -4794,9 +4807,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4794 int err; 4807 int err;
4795 int drop_inode = 0; 4808 int drop_inode = 0;
4796 4809
4797 if (inode->i_nlink == 0)
4798 return -ENOENT;
4799
4800 /* do not allow sys_link's with other subvols of the same device */ 4810 /* do not allow sys_link's with other subvols of the same device */
4801 if (root->objectid != BTRFS_I(inode)->root->objectid) 4811 if (root->objectid != BTRFS_I(inode)->root->objectid)
4802 return -EPERM; 4812 return -EPERM;
@@ -4809,10 +4819,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4809 goto fail; 4819 goto fail;
4810 4820
4811 /* 4821 /*
4812 * 1 item for inode ref 4822 * 2 items for inode and inode ref
4813 * 2 items for dir items 4823 * 2 items for dir items
4824 * 1 item for parent inode
4814 */ 4825 */
4815 trans = btrfs_start_transaction(root, 3); 4826 trans = btrfs_start_transaction(root, 5);
4816 if (IS_ERR(trans)) { 4827 if (IS_ERR(trans)) {
4817 err = PTR_ERR(trans); 4828 err = PTR_ERR(trans);
4818 goto fail; 4829 goto fail;
@@ -4881,7 +4892,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4881 4892
4882 drop_on_err = 1; 4893 drop_on_err = 1;
4883 4894
4884 err = btrfs_init_inode_security(trans, inode, dir); 4895 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4885 if (err) 4896 if (err)
4886 goto out_fail; 4897 goto out_fail;
4887 4898
@@ -5176,6 +5187,8 @@ again:
5176 em = NULL; 5187 em = NULL;
5177 btrfs_release_path(root, path); 5188 btrfs_release_path(root, path);
5178 trans = btrfs_join_transaction(root, 1); 5189 trans = btrfs_join_transaction(root, 1);
5190 if (IS_ERR(trans))
5191 return ERR_CAST(trans);
5179 goto again; 5192 goto again;
5180 } 5193 }
5181 map = kmap(page); 5194 map = kmap(page);
@@ -5266,6 +5279,128 @@ out:
5266 return em; 5279 return em;
5267} 5280}
5268 5281
/*
 * fiemap-oriented wrapper around btrfs_get_extent() (wired up as the
 * get_extent callback in btrfs_fiemap() elsewhere in this patch).
 *
 * btrfs_get_extent() reports a hole for ranges whose data is still only
 * delalloc (dirty in memory, no allocated extent yet).  For fiemap we do
 * want to surface those ranges, so when the mapping comes back as a hole
 * this routine scans the inode's io_tree for EXTENT_DELALLOC bits inside
 * [start, start+len) and, if any overlap, synthesizes an extent_map with
 * block_start == EXTENT_MAP_DELALLOC, clipped against the hole mapping
 * that btrfs_get_extent() returned.
 *
 * Returns an extent_map the caller must free with free_extent_map(), or
 * an ERR_PTR on failure (only -ENOMEM is generated locally; other errors
 * are passed through from btrfs_get_extent()).
 */
5282struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5283 size_t pg_offset, u64 start, u64 len,
5284 int create)
5285{
5286 struct extent_map *em;
5287 struct extent_map *hole_em = NULL;
5288 u64 range_start = start;
5289 u64 end;
5290 u64 found;
5291 u64 found_end;
5292 int err = 0;
5293
 /* first ask the regular extent lookup; errors pass straight through */
5294 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5295 if (IS_ERR(em))
5296 return em;
5297 if (em) {
5298 /*
5299 * if our em maps to a hole, there might
5300 * actually be delalloc bytes behind it
5301 */
5302 if (em->block_start != EXTENT_MAP_HOLE)
5303 return em;
5304 else
5305 hole_em = em;
5306 }
5307
5308 /* check to see if we've wrapped (len == -1 or similar) */
5309 end = start + len;
5310 if (end < start)
5311 end = (u64)-1;
5312 else
5313 end -= 1;
5314
5315 em = NULL;
5316
5317 /* ok, we didn't find anything, lets look for delalloc */
 /*
 * count_range_bits() reports how many EXTENT_DELALLOC bytes lie in
 * [range_start, end] and, judging by the use below, advances
 * range_start to where the first delalloc byte was found.
 */
5318 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5319 end, len, EXTENT_DELALLOC, 1);
5320 found_end = range_start + found;
 /* found_end wrapped past u64 max: clamp to the end of the address space */
5321 if (found_end < range_start)
5322 found_end = (u64)-1;
5323
5324 /*
5325 * we didn't find anything useful, return
5326 * the original results from get_extent()
5327 */
5328 if (range_start > end || found_end <= start) {
 /* hand ownership of the hole mapping back to the caller */
5329 em = hole_em;
5330 hole_em = NULL;
5331 goto out;
5332 }
5333
5334 /* adjust the range_start to make sure it doesn't
5335 * go backwards from the start they passed in
5336 */
5337 range_start = max(start,range_start);
5338 found = found_end - range_start;
5339
5340 if (found > 0) {
5341 u64 hole_start = start;
5342 u64 hole_len = len;
5343
5344 em = alloc_extent_map(GFP_NOFS);
5345 if (!em) {
5346 err = -ENOMEM;
5347 goto out;
5348 }
5349 /*
5350 * when btrfs_get_extent can't find anything it
5351 * returns one huge hole
5352 *
5353 * make sure what it found really fits our range, and
5354 * adjust to make sure it is based on the start from
5355 * the caller
5356 */
5357 if (hole_em) {
5358 u64 calc_end = extent_map_end(hole_em);
5359
 /* hole entirely outside [start, end]: discard it */
5360 if (calc_end <= start || (hole_em->start > end)) {
5361 free_extent_map(hole_em);
5362 hole_em = NULL;
5363 } else {
5364 hole_start = max(hole_em->start, start);
5365 hole_len = calc_end - hole_start;
5366 }
5367 }
5368 em->bdev = NULL;
5369 if (hole_em && range_start > hole_start) {
5370 /* our hole starts before our delalloc, so we
5371 * have to return just the parts of the hole
5372 * that go until the delalloc starts
5373 */
5374 em->len = min(hole_len,
5375 range_start - hole_start);
5376 em->start = hole_start;
5377 em->orig_start = hole_start;
5378 /*
5379 * don't adjust block start at all,
5380 * it is fixed at EXTENT_MAP_HOLE
5381 */
5382 em->block_start = hole_em->block_start;
5383 em->block_len = hole_len;
5384 } else {
 /* delalloc comes first: report the delalloc range itself */
5385 em->start = range_start;
5386 em->len = found;
5387 em->orig_start = range_start;
5388 em->block_start = EXTENT_MAP_DELALLOC;
5389 em->block_len = found;
5390 }
5391 } else if (hole_em) {
 /*
 * nothing left after clipping: return the original hole.
 * Deliberately bypasses out: so hole_em isn't freed here.
 */
5392 return hole_em;
5393 }
5394out:
5395
 /*
 * hole_em may be NULL or already consumed on several paths —
 * presumably free_extent_map() tolerates NULL; verify against
 * its definition in extent_map.c.
 */
5396 free_extent_map(hole_em);
 /* err is only ever -ENOMEM from the alloc_extent_map() failure above */
5397 if (err) {
5398 free_extent_map(em);
5399 return ERR_PTR(err);
5400 }
5401 return em;
5402}
5403
5269static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5404static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5270 u64 start, u64 len) 5405 u64 start, u64 len)
5271{ 5406{
@@ -5280,8 +5415,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5415 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5281 5416
5282 trans = btrfs_join_transaction(root, 0); 5417 trans = btrfs_join_transaction(root, 0);
5283 if (!trans) 5418 if (IS_ERR(trans))
5284 return ERR_PTR(-ENOMEM); 5419 return ERR_CAST(trans);
5285 5420
5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5421 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5287 5422
@@ -5505,7 +5640,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5505 * while we look for nocow cross refs 5640 * while we look for nocow cross refs
5506 */ 5641 */
5507 trans = btrfs_join_transaction(root, 0); 5642 trans = btrfs_join_transaction(root, 0);
5508 if (!trans) 5643 if (IS_ERR(trans))
5509 goto must_cow; 5644 goto must_cow;
5510 5645
5511 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5646 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5640,7 +5775,7 @@ again:
5640 BUG_ON(!ordered); 5775 BUG_ON(!ordered);
5641 5776
5642 trans = btrfs_join_transaction(root, 1); 5777 trans = btrfs_join_transaction(root, 1);
5643 if (!trans) { 5778 if (IS_ERR(trans)) {
5644 err = -ENOMEM; 5779 err = -ENOMEM;
5645 goto out; 5780 goto out;
5646 } 5781 }
@@ -5920,6 +6055,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5920 if (!skip_sum) { 6055 if (!skip_sum) {
5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6056 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5922 if (!dip->csums) { 6057 if (!dip->csums) {
6058 kfree(dip);
5923 ret = -ENOMEM; 6059 ret = -ENOMEM;
5924 goto free_ordered; 6060 goto free_ordered;
5925 } 6061 }
@@ -6088,7 +6224,7 @@ out:
6088static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6224static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6089 __u64 start, __u64 len) 6225 __u64 start, __u64 len)
6090{ 6226{
6091 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6227 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6092} 6228}
6093 6229
6094int btrfs_readpage(struct file *file, struct page *page) 6230int btrfs_readpage(struct file *file, struct page *page)
@@ -6968,7 +7104,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6968 if (IS_ERR(inode)) 7104 if (IS_ERR(inode))
6969 goto out_unlock; 7105 goto out_unlock;
6970 7106
6971 err = btrfs_init_inode_security(trans, inode, dir); 7107 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6972 if (err) { 7108 if (err) {
6973 drop_inode = 1; 7109 drop_inode = 1;
6974 goto out_unlock; 7110 goto out_unlock;
@@ -7204,7 +7340,6 @@ static const struct address_space_operations btrfs_aops = {
7204 .writepage = btrfs_writepage, 7340 .writepage = btrfs_writepage,
7205 .writepages = btrfs_writepages, 7341 .writepages = btrfs_writepages,
7206 .readpages = btrfs_readpages, 7342 .readpages = btrfs_readpages,
7207 .sync_page = block_sync_page,
7208 .direct_IO = btrfs_direct_IO, 7343 .direct_IO = btrfs_direct_IO,
7209 .invalidatepage = btrfs_invalidatepage, 7344 .invalidatepage = btrfs_invalidatepage,
7210 .releasepage = btrfs_releasepage, 7345 .releasepage = btrfs_releasepage,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a506a22b522a..d1bace3df9b6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -158,7 +158,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
158 FS_SYNC_FL | FS_DIRSYNC_FL)) 158 FS_SYNC_FL | FS_DIRSYNC_FL))
159 return -EOPNOTSUPP; 159 return -EOPNOTSUPP;
160 160
161 if (!is_owner_or_cap(inode)) 161 if (!inode_owner_or_capable(inode))
162 return -EACCES; 162 return -EACCES;
163 163
164 mutex_lock(&inode->i_mutex); 164 mutex_lock(&inode->i_mutex);
@@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
203 203
204 204
205 trans = btrfs_join_transaction(root, 1); 205 trans = btrfs_join_transaction(root, 1);
206 BUG_ON(!trans); 206 BUG_ON(IS_ERR(trans));
207 207
208 ret = btrfs_update_inode(trans, root, inode); 208 ret = btrfs_update_inode(trans, root, inode);
209 BUG_ON(ret); 209 BUG_ON(ret);
@@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
907 907
908 if (new_size > old_size) { 908 if (new_size > old_size) {
909 trans = btrfs_start_transaction(root, 0); 909 trans = btrfs_start_transaction(root, 0);
910 if (IS_ERR(trans)) {
911 ret = PTR_ERR(trans);
912 goto out_unlock;
913 }
910 ret = btrfs_grow_device(trans, device, new_size); 914 ret = btrfs_grow_device(trans, device, new_size);
911 btrfs_commit_transaction(trans, root); 915 btrfs_commit_transaction(trans, root);
912 } else { 916 } else {
@@ -1067,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1067 if (copy_from_user(&flags, arg, sizeof(flags))) 1071 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT; 1072 return -EFAULT;
1069 1073
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) 1074 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL; 1075 return -EINVAL;
1072 1076
1073 if (flags & ~BTRFS_SUBVOL_RDONLY) 1077 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP; 1078 return -EOPNOTSUPP;
1075 1079
1080 if (!inode_owner_or_capable(inode))
1081 return -EACCES;
1082
1076 down_write(&root->fs_info->subvol_sem); 1083 down_write(&root->fs_info->subvol_sem);
1077 1084
1078 /* nothing to do */ 1085 /* nothing to do */
@@ -1093,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1093 goto out_reset; 1100 goto out_reset;
1094 } 1101 }
1095 1102
1096 ret = btrfs_update_root(trans, root, 1103 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1097 &root->root_key, &root->root_item); 1104 &root->root_key, &root->root_item);
1098 1105
1099 btrfs_commit_transaction(trans, root); 1106 btrfs_commit_transaction(trans, root);
@@ -1898,7 +1905,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1898 1905
1899 memcpy(&new_key, &key, sizeof(new_key)); 1906 memcpy(&new_key, &key, sizeof(new_key));
1900 new_key.objectid = inode->i_ino; 1907 new_key.objectid = inode->i_ino;
1901 new_key.offset = key.offset + destoff - off; 1908 if (off <= key.offset)
1909 new_key.offset = key.offset + destoff - off;
1910 else
1911 new_key.offset = destoff;
1902 1912
1903 trans = btrfs_start_transaction(root, 1); 1913 trans = btrfs_start_transaction(root, 1);
1904 if (IS_ERR(trans)) { 1914 if (IS_ERR(trans)) {
@@ -2082,7 +2092,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2082 2092
2083 ret = -ENOMEM; 2093 ret = -ENOMEM;
2084 trans = btrfs_start_ioctl_transaction(root, 0); 2094 trans = btrfs_start_ioctl_transaction(root, 0);
2085 if (!trans) 2095 if (IS_ERR(trans))
2086 goto out_drop; 2096 goto out_drop;
2087 2097
2088 file->private_data = trans; 2098 file->private_data = trans;
@@ -2138,9 +2148,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2138 path->leave_spinning = 1; 2148 path->leave_spinning = 1;
2139 2149
2140 trans = btrfs_start_transaction(root, 1); 2150 trans = btrfs_start_transaction(root, 1);
2141 if (!trans) { 2151 if (IS_ERR(trans)) {
2142 btrfs_free_path(path); 2152 btrfs_free_path(path);
2143 return -ENOMEM; 2153 return PTR_ERR(trans);
2144 } 2154 }
2145 2155
2146 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2156 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2201,7 +2211,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2201 int num_types = 4; 2211 int num_types = 4;
2202 int alloc_size; 2212 int alloc_size;
2203 int ret = 0; 2213 int ret = 0;
2204 int slot_count = 0; 2214 u64 slot_count = 0;
2205 int i, c; 2215 int i, c;
2206 2216
2207 if (copy_from_user(&space_args, 2217 if (copy_from_user(&space_args,
@@ -2240,7 +2250,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2240 goto out; 2250 goto out;
2241 } 2251 }
2242 2252
2243 slot_count = min_t(int, space_args.space_slots, slot_count); 2253 slot_count = min_t(u64, space_args.space_slots, slot_count);
2244 2254
2245 alloc_size = sizeof(*dest) * slot_count; 2255 alloc_size = sizeof(*dest) * slot_count;
2246 2256
@@ -2260,6 +2270,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2260 for (i = 0; i < num_types; i++) { 2270 for (i = 0; i < num_types; i++) {
2261 struct btrfs_space_info *tmp; 2271 struct btrfs_space_info *tmp;
2262 2272
2273 if (!slot_count)
2274 break;
2275
2263 info = NULL; 2276 info = NULL;
2264 rcu_read_lock(); 2277 rcu_read_lock();
2265 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 2278 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2281,7 +2294,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2281 memcpy(dest, &space, sizeof(space)); 2294 memcpy(dest, &space, sizeof(space));
2282 dest++; 2295 dest++;
2283 space_args.total_spaces++; 2296 space_args.total_spaces++;
2297 slot_count--;
2284 } 2298 }
2299 if (!slot_count)
2300 break;
2285 } 2301 }
2286 up_read(&info->groups_sem); 2302 up_read(&info->groups_sem);
2287 } 2303 }
@@ -2334,6 +2350,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2334 u64 transid; 2350 u64 transid;
2335 2351
2336 trans = btrfs_start_transaction(root, 0); 2352 trans = btrfs_start_transaction(root, 0);
2353 if (IS_ERR(trans))
2354 return PTR_ERR(trans);
2337 transid = trans->transid; 2355 transid = trans->transid;
2338 btrfs_commit_transaction_async(trans, root, 0); 2356 btrfs_commit_transaction_async(trans, root, 0);
2339 2357
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
280 unsigned long tot_out; 280 unsigned long tot_out;
281 unsigned long tot_len; 281 unsigned long tot_len;
282 char *buf; 282 char *buf;
283 bool may_late_unmap, need_unmap;
283 284
284 data_in = kmap(pages_in[0]); 285 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in); 286 tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
300 301
301 tot_in += in_len; 302 tot_in += in_len;
302 working_bytes = in_len; 303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
303 305
304 /* fast path: avoid using the working buffer */ 306 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) { 307 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset; 308 buf = data_in + in_offset;
307 bytes = in_len; 309 bytes = in_len;
310 may_late_unmap = true;
308 goto cont; 311 goto cont;
309 } 312 }
310 313
@@ -329,14 +332,17 @@ cont:
329 if (working_bytes == 0 && tot_in >= tot_len) 332 if (working_bytes == 0 && tot_in >= tot_len)
330 break; 333 break;
331 334
332 kunmap(pages_in[page_in_index]); 335 if (page_in_index + 1 >= total_pages_in) {
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1; 336 ret = -1;
336 data_in = NULL;
337 goto done; 337 goto done;
338 } 338 }
339 data_in = kmap(pages_in[page_in_index]); 339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
340 346
341 in_page_bytes_left = PAGE_CACHE_SIZE; 347 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0; 348 in_offset = 0;
@@ -346,6 +352,8 @@ cont:
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); 352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, 353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len); 354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
349 if (ret != LZO_E_OK) { 357 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n"); 358 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1; 359 ret = -1;
@@ -363,8 +371,7 @@ cont:
363 break; 371 break;
364 } 372 }
365done: 373done:
366 if (data_in) 374 kunmap(pages_in[page_in_index]);
367 kunmap(pages_in[page_in_index]);
368 return ret; 375 return ret;
369} 376}
370 377
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd99..083a55477375 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..31ade5802ae8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1157 new_node->bytenr = dest->node->start; 1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level; 1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest; 1159 new_node->lowest = node->lowest;
1160 new_node->checked = 1;
1160 new_node->root = dest; 1161 new_node->root = dest;
1161 1162
1162 if (!node->lowest) { 1163 if (!node->lowest) {
@@ -2028,6 +2029,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2029
2029 while (1) { 2030 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2031 trans = btrfs_start_transaction(root, 0);
2032 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2033 trans->block_rsv = rc->block_rsv;
2032 2034
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2035 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2149,12 @@ again:
2147 } 2149 }
2148 2150
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152 if (IS_ERR(trans)) {
2153 if (!err)
2154 btrfs_block_rsv_release(rc->extent_root,
2155 rc->block_rsv, num_bytes);
2156 return PTR_ERR(trans);
2157 }
2150 2158
2151 if (!err) { 2159 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2160 if (num_bytes != rc->merging_rsv_size) {
@@ -3222,6 +3230,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3230 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3231 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3232 btrfs_free_path(path);
3233 ret = PTR_ERR(trans);
3225 goto out; 3234 goto out;
3226 } 3235 }
3227 3236
@@ -3628,6 +3637,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3637 set_reloc_control(rc);
3629 3638
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3639 trans = btrfs_join_transaction(rc->extent_root, 1);
3640 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3641 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3642 return 0;
3633} 3643}
@@ -3644,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3644 u32 item_size; 3654 u32 item_size;
3645 int ret; 3655 int ret;
3646 int err = 0; 3656 int err = 0;
3657 int progress = 0;
3647 3658
3648 path = btrfs_alloc_path(); 3659 path = btrfs_alloc_path();
3649 if (!path) 3660 if (!path)
@@ -3656,8 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3656 } 3667 }
3657 3668
3658 while (1) { 3669 while (1) {
3670 progress++;
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3671 trans = btrfs_start_transaction(rc->extent_root, 0);
3660 3672 BUG_ON(IS_ERR(trans));
3673restart:
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3674 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3675 btrfs_end_transaction(trans, rc->extent_root);
3663 continue; 3676 continue;
@@ -3770,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3770 } 3783 }
3771 } 3784 }
3772 } 3785 }
3786 if (trans && progress && err == -ENOSPC) {
3787 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3788 rc->block_group->flags);
3789 if (ret == 0) {
3790 err = 0;
3791 progress = 0;
3792 goto restart;
3793 }
3794 }
3773 3795
3774 btrfs_release_path(rc->extent_root, path); 3796 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3797 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
@@ -3804,7 +3826,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3826
3805 /* get rid of pinned extents */ 3827 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3828 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3829 if (IS_ERR(trans))
3830 err = PTR_ERR(trans);
3831 else
3832 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3833out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3834 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3835 btrfs_free_path(path);
@@ -4022,6 +4047,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4047 int ret;
4023 4048
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4049 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4050 BUG_ON(IS_ERR(trans));
4025 4051
4026 memset(&root->root_item.drop_progress, 0, 4052 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4053 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4151,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4151 set_reloc_control(rc);
4126 4152
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4153 trans = btrfs_join_transaction(rc->extent_root, 1);
4154 if (IS_ERR(trans)) {
4155 unset_reloc_control(rc);
4156 err = PTR_ERR(trans);
4157 goto out_free;
4158 }
4128 4159
4129 rc->merge_reloc_tree = 1; 4160 rc->merge_reloc_tree = 1;
4130 4161
@@ -4154,9 +4185,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4185 unset_reloc_control(rc);
4155 4186
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4187 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4188 if (IS_ERR(trans))
4158out: 4189 err = PTR_ERR(trans);
4190 else
4191 btrfs_commit_transaction(trans, rc->extent_root);
4192out_free:
4159 kfree(rc); 4193 kfree(rc);
4194out:
4160 while (!list_empty(&reloc_roots)) { 4195 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4196 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4197 struct btrfs_root, root_list);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b2130c46fdb5..d39a9895d932 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
159 Opt_enospc_debug, Opt_err,
159}; 160};
160 161
161static match_table_t tokens = { 162static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
184 {Opt_space_cache, "space_cache"}, 185 {Opt_space_cache, "space_cache"},
185 {Opt_clear_cache, "clear_cache"}, 186 {Opt_clear_cache, "clear_cache"},
186 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 187 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
188 {Opt_enospc_debug, "enospc_debug"},
187 {Opt_err, NULL}, 189 {Opt_err, NULL},
188}; 190};
189 191
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
358 case Opt_user_subvol_rm_allowed: 360 case Opt_user_subvol_rm_allowed:
359 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 361 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
360 break; 362 break;
363 case Opt_enospc_debug:
364 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
365 break;
361 case Opt_err: 366 case Opt_err:
362 printk(KERN_INFO "btrfs: unrecognized mount option " 367 printk(KERN_INFO "btrfs: unrecognized mount option "
363 "'%s'\n", p); 368 "'%s'\n", p);
@@ -383,7 +388,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
383 struct btrfs_fs_devices **fs_devices) 388 struct btrfs_fs_devices **fs_devices)
384{ 389{
385 substring_t args[MAX_OPT_ARGS]; 390 substring_t args[MAX_OPT_ARGS];
386 char *opts, *p; 391 char *opts, *orig, *p;
387 int error = 0; 392 int error = 0;
388 int intarg; 393 int intarg;
389 394
@@ -397,6 +402,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
397 opts = kstrdup(options, GFP_KERNEL); 402 opts = kstrdup(options, GFP_KERNEL);
398 if (!opts) 403 if (!opts)
399 return -ENOMEM; 404 return -ENOMEM;
405 orig = opts;
400 406
401 while ((p = strsep(&opts, ",")) != NULL) { 407 while ((p = strsep(&opts, ",")) != NULL) {
402 int token; 408 int token;
@@ -432,7 +438,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
432 } 438 }
433 439
434 out_free_opts: 440 out_free_opts:
435 kfree(opts); 441 kfree(orig);
436 out: 442 out:
437 /* 443 /*
438 * If no subvolume name is specified we use the default one. Allocate 444 * If no subvolume name is specified we use the default one. Allocate
@@ -623,6 +629,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
623 btrfs_wait_ordered_extents(root, 0, 0); 629 btrfs_wait_ordered_extents(root, 0, 0);
624 630
625 trans = btrfs_start_transaction(root, 0); 631 trans = btrfs_start_transaction(root, 0);
632 if (IS_ERR(trans))
633 return PTR_ERR(trans);
626 ret = btrfs_commit_transaction(trans, root); 634 ret = btrfs_commit_transaction(trans, root);
627 return ret; 635 return ret;
628} 636}
@@ -761,6 +769,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
761 } 769 }
762 770
763 btrfs_close_devices(fs_devices); 771 btrfs_close_devices(fs_devices);
772 kfree(fs_info);
773 kfree(tree_root);
764 } else { 774 } else {
765 char b[BDEVNAME_SIZE]; 775 char b[BDEVNAME_SIZE];
766 776
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe2..3d73c8d93bbb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1161 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root; 1162 ac->root = root;
1163 ac->newtrans = btrfs_join_transaction(root, 0); 1163 ac->newtrans = btrfs_join_transaction(root, 0);
1164 if (IS_ERR(ac->newtrans)) {
1165 int err = PTR_ERR(ac->newtrans);
1166 kfree(ac);
1167 return err;
1168 }
1164 1169
1165 /* take transaction reference */ 1170 /* take transaction reference */
1166 mutex_lock(&root->fs_info->trans_mutex); 1171 mutex_lock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..a4bbb854dfd2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 979 key.offset = (u64)-1;
968 980
969 path = btrfs_alloc_path(); 981 path = btrfs_alloc_path();
982 if (!path)
983 return -ENOMEM;
970 984
971 while (1) { 985 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 986 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1192
1179 name_len = btrfs_dir_name_len(eb, di); 1193 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1194 name = kmalloc(name_len, GFP_NOFS);
1195 if (!name)
1196 return -ENOMEM;
1197
1181 log_type = btrfs_dir_type(eb, di); 1198 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1200 name_len);
@@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1709 root_owner = btrfs_header_owner(parent);
1693 1710
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1711 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1712 if (!next)
1713 return -ENOMEM;
1695 1714
1696 if (*level == 1) { 1715 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1716 wc->process_func(root, next, wc, ptr_gen);
@@ -2032,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2051 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2052 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2053 mutex_unlock(&log_root_tree->log_mutex);
2054 ret = 0;
2035 goto out; 2055 goto out;
2036 } 2056 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2057 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2116,7 @@ out:
2096 smp_mb(); 2116 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2117 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2118 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2119 return ret;
2100} 2120}
2101 2121
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2122static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2214,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2214
2195 log = root->log_root; 2215 log = root->log_root;
2196 path = btrfs_alloc_path(); 2216 path = btrfs_alloc_path();
2217 if (!path)
2218 return -ENOMEM;
2219
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2220 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2221 name, name_len, -1);
2199 if (IS_ERR(di)) { 2222 if (IS_ERR(di)) {
@@ -2594,6 +2617,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2617
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2618 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2619 nr * sizeof(u32), GFP_NOFS);
2620 if (!ins_data)
2621 return -ENOMEM;
2622
2597 ins_sizes = (u32 *)ins_data; 2623 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2624 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2625
@@ -2725,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2751 log = root->log_root;
2726 2752
2727 path = btrfs_alloc_path(); 2753 path = btrfs_alloc_path();
2754 if (!path)
2755 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2756 dst_path = btrfs_alloc_path();
2757 if (!dst_path) {
2758 btrfs_free_path(path);
2759 return -ENOMEM;
2760 }
2729 2761
2730 min_key.objectid = inode->i_ino; 2762 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2763 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3080,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3080 BUG_ON(!path); 3112 BUG_ON(!path);
3081 3113
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3114 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3115 BUG_ON(IS_ERR(trans));
3083 3116
3084 wc.trans = trans; 3117 wc.trans = trans;
3085 wc.pin = 1; 3118 wc.pin = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d158530233b7..9d554e8e6583 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,7 +162,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 162 struct bio *cur;
163 int again = 0; 163 int again = 0;
164 unsigned long num_run; 164 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 165 unsigned long batch_run = 0;
167 unsigned long limit; 166 unsigned long limit;
168 unsigned long last_waited = 0; 167 unsigned long last_waited = 0;
@@ -173,11 +172,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 172 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 173 limit = limit * 2 / 3;
175 174
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 175loop:
182 spin_lock(&device->io_lock); 176 spin_lock(&device->io_lock);
183 177
@@ -223,15 +217,6 @@ loop_lock:
223 217
224 spin_unlock(&device->io_lock); 218 spin_unlock(&device->io_lock);
225 219
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 220 while (pending) {
236 221
237 rmb(); 222 rmb();
@@ -259,19 +244,11 @@ loop_lock:
259 244
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 245 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 246
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 247 submit_bio(cur->bi_rw, cur);
266 num_run++; 248 num_run++;
267 batch_run++; 249 batch_run++;
268 if (need_resched()) { 250 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 251 cond_resched();
274 }
275 252
276 /* 253 /*
277 * we made progress, there is more work to do and the bdi 254 * we made progress, there is more work to do and the bdi
@@ -304,13 +281,8 @@ loop_lock:
304 * against it before looping 281 * against it before looping
305 */ 282 */
306 last_waited = ioc->last_waited; 283 last_waited = ioc->last_waited;
307 if (need_resched()) { 284 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 285 cond_resched();
313 }
314 continue; 286 continue;
315 } 287 }
316 spin_lock(&device->io_lock); 288 spin_lock(&device->io_lock);
@@ -323,22 +295,6 @@ loop_lock:
323 } 295 }
324 } 296 }
325 297
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 298 cond_resched();
343 if (again) 299 if (again)
344 goto loop; 300 goto loop;
@@ -1213,6 +1169,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1213 return -ENOMEM; 1169 return -ENOMEM;
1214 1170
1215 trans = btrfs_start_transaction(root, 0); 1171 trans = btrfs_start_transaction(root, 0);
1172 if (IS_ERR(trans)) {
1173 btrfs_free_path(path);
1174 return PTR_ERR(trans);
1175 }
1216 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1176 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1217 key.type = BTRFS_DEV_ITEM_KEY; 1177 key.type = BTRFS_DEV_ITEM_KEY;
1218 key.offset = device->devid; 1178 key.offset = device->devid;
@@ -1334,11 +1294,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1334 1294
1335 ret = btrfs_shrink_device(device, 0); 1295 ret = btrfs_shrink_device(device, 0);
1336 if (ret) 1296 if (ret)
1337 goto error_brelse; 1297 goto error_undo;
1338 1298
1339 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1299 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1340 if (ret) 1300 if (ret)
1341 goto error_brelse; 1301 goto error_undo;
1342 1302
1343 device->in_fs_metadata = 0; 1303 device->in_fs_metadata = 0;
1344 1304
@@ -1412,6 +1372,13 @@ out:
1412 mutex_unlock(&root->fs_info->volume_mutex); 1372 mutex_unlock(&root->fs_info->volume_mutex);
1413 mutex_unlock(&uuid_mutex); 1373 mutex_unlock(&uuid_mutex);
1414 return ret; 1374 return ret;
1375error_undo:
1376 if (device->writeable) {
1377 list_add(&device->dev_alloc_list,
1378 &root->fs_info->fs_devices->alloc_list);
1379 root->fs_info->fs_devices->rw_devices++;
1380 }
1381 goto error_brelse;
1415} 1382}
1416 1383
1417/* 1384/*
@@ -1601,11 +1568,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1601 1568
1602 ret = find_next_devid(root, &device->devid); 1569 ret = find_next_devid(root, &device->devid);
1603 if (ret) { 1570 if (ret) {
1571 kfree(device->name);
1604 kfree(device); 1572 kfree(device);
1605 goto error; 1573 goto error;
1606 } 1574 }
1607 1575
1608 trans = btrfs_start_transaction(root, 0); 1576 trans = btrfs_start_transaction(root, 0);
1577 if (IS_ERR(trans)) {
1578 kfree(device->name);
1579 kfree(device);
1580 ret = PTR_ERR(trans);
1581 goto error;
1582 }
1583
1609 lock_chunks(root); 1584 lock_chunks(root);
1610 1585
1611 device->writeable = 1; 1586 device->writeable = 1;
@@ -1621,7 +1596,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1621 device->dev_root = root->fs_info->dev_root; 1596 device->dev_root = root->fs_info->dev_root;
1622 device->bdev = bdev; 1597 device->bdev = bdev;
1623 device->in_fs_metadata = 1; 1598 device->in_fs_metadata = 1;
1624 device->mode = 0; 1599 device->mode = FMODE_EXCL;
1625 set_blocksize(device->bdev, 4096); 1600 set_blocksize(device->bdev, 4096);
1626 1601
1627 if (seeding_dev) { 1602 if (seeding_dev) {
@@ -1873,7 +1848,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1873 return ret; 1848 return ret;
1874 1849
1875 trans = btrfs_start_transaction(root, 0); 1850 trans = btrfs_start_transaction(root, 0);
1876 BUG_ON(!trans); 1851 BUG_ON(IS_ERR(trans));
1877 1852
1878 lock_chunks(root); 1853 lock_chunks(root);
1879 1854
@@ -2047,7 +2022,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
2047 BUG_ON(ret); 2022 BUG_ON(ret);
2048 2023
2049 trans = btrfs_start_transaction(dev_root, 0); 2024 trans = btrfs_start_transaction(dev_root, 0);
2050 BUG_ON(!trans); 2025 BUG_ON(IS_ERR(trans));
2051 2026
2052 ret = btrfs_grow_device(trans, device, old_size); 2027 ret = btrfs_grow_device(trans, device, old_size);
2053 BUG_ON(ret); 2028 BUG_ON(ret);
@@ -2213,6 +2188,11 @@ again:
2213 2188
2214 /* Shrinking succeeded, else we would be at "done". */ 2189 /* Shrinking succeeded, else we would be at "done". */
2215 trans = btrfs_start_transaction(root, 0); 2190 trans = btrfs_start_transaction(root, 0);
2191 if (IS_ERR(trans)) {
2192 ret = PTR_ERR(trans);
2193 goto done;
2194 }
2195
2216 lock_chunks(root); 2196 lock_chunks(root);
2217 2197
2218 device->disk_total_bytes = new_size; 2198 device->disk_total_bytes = new_size;
@@ -2931,7 +2911,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2931static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2911static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2932 u64 logical, u64 *length, 2912 u64 logical, u64 *length,
2933 struct btrfs_multi_bio **multi_ret, 2913 struct btrfs_multi_bio **multi_ret,
2934 int mirror_num, struct page *unplug_page) 2914 int mirror_num)
2935{ 2915{
2936 struct extent_map *em; 2916 struct extent_map *em;
2937 struct map_lookup *map; 2917 struct map_lookup *map;
@@ -2963,11 +2943,6 @@ again:
2963 em = lookup_extent_mapping(em_tree, logical, *length); 2943 em = lookup_extent_mapping(em_tree, logical, *length);
2964 read_unlock(&em_tree->lock); 2944 read_unlock(&em_tree->lock);
2965 2945
2966 if (!em && unplug_page) {
2967 kfree(multi);
2968 return 0;
2969 }
2970
2971 if (!em) { 2946 if (!em) {
2972 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2947 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2973 (unsigned long long)logical, 2948 (unsigned long long)logical,
@@ -3023,13 +2998,13 @@ again:
3023 *length = em->len - offset; 2998 *length = em->len - offset;
3024 } 2999 }
3025 3000
3026 if (!multi_ret && !unplug_page) 3001 if (!multi_ret)
3027 goto out; 3002 goto out;
3028 3003
3029 num_stripes = 1; 3004 num_stripes = 1;
3030 stripe_index = 0; 3005 stripe_index = 0;
3031 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3006 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3032 if (unplug_page || (rw & REQ_WRITE)) 3007 if (rw & REQ_WRITE)
3033 num_stripes = map->num_stripes; 3008 num_stripes = map->num_stripes;
3034 else if (mirror_num) 3009 else if (mirror_num)
3035 stripe_index = mirror_num - 1; 3010 stripe_index = mirror_num - 1;
@@ -3051,7 +3026,7 @@ again:
3051 stripe_index = do_div(stripe_nr, factor); 3026 stripe_index = do_div(stripe_nr, factor);
3052 stripe_index *= map->sub_stripes; 3027 stripe_index *= map->sub_stripes;
3053 3028
3054 if (unplug_page || (rw & REQ_WRITE)) 3029 if (rw & REQ_WRITE)
3055 num_stripes = map->sub_stripes; 3030 num_stripes = map->sub_stripes;
3056 else if (mirror_num) 3031 else if (mirror_num)
3057 stripe_index += mirror_num - 1; 3032 stripe_index += mirror_num - 1;
@@ -3071,22 +3046,10 @@ again:
3071 BUG_ON(stripe_index >= map->num_stripes); 3046 BUG_ON(stripe_index >= map->num_stripes);
3072 3047
3073 for (i = 0; i < num_stripes; i++) { 3048 for (i = 0; i < num_stripes; i++) {
3074 if (unplug_page) { 3049 multi->stripes[i].physical =
3075 struct btrfs_device *device; 3050 map->stripes[stripe_index].physical +
3076 struct backing_dev_info *bdi; 3051 stripe_offset + stripe_nr * map->stripe_len;
3077 3052 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3078 device = map->stripes[stripe_index].dev;
3079 if (device->bdev) {
3080 bdi = blk_get_backing_dev_info(device->bdev);
3081 if (bdi->unplug_io_fn)
3082 bdi->unplug_io_fn(bdi, unplug_page);
3083 }
3084 } else {
3085 multi->stripes[i].physical =
3086 map->stripes[stripe_index].physical +
3087 stripe_offset + stripe_nr * map->stripe_len;
3088 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3089 }
3090 stripe_index++; 3053 stripe_index++;
3091 } 3054 }
3092 if (multi_ret) { 3055 if (multi_ret) {
@@ -3104,7 +3067,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3104 struct btrfs_multi_bio **multi_ret, int mirror_num) 3067 struct btrfs_multi_bio **multi_ret, int mirror_num)
3105{ 3068{
3106 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3069 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3107 mirror_num, NULL); 3070 mirror_num);
3108} 3071}
3109 3072
3110int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3073int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3172,14 +3135,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3172 return 0; 3135 return 0;
3173} 3136}
3174 3137
3175int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3176 u64 logical, struct page *page)
3177{
3178 u64 length = PAGE_CACHE_SIZE;
3179 return __btrfs_map_block(map_tree, READ, logical, &length,
3180 NULL, 0, page);
3181}
3182
3183static void end_bio_multi_stripe(struct bio *bio, int err) 3138static void end_bio_multi_stripe(struct bio *bio, int err)
3184{ 3139{
3185 struct btrfs_multi_bio *multi = bio->bi_private; 3140 struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..d779cefcfd7d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -370,7 +370,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
370} 370}
371 371
372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
373 struct inode *inode, struct inode *dir) 373 struct inode *inode, struct inode *dir,
374 const struct qstr *qstr)
374{ 375{
375 int err; 376 int err;
376 size_t len; 377 size_t len;
@@ -378,7 +379,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
378 char *suffix; 379 char *suffix;
379 char *name; 380 char *name;
380 381
381 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 382 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
383 &len);
382 if (err) { 384 if (err) {
383 if (err == -EOPNOTSUPP) 385 if (err == -EOPNOTSUPP)
384 return 0; 386 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
57 if (!workspace) 57 if (!workspace)
58 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
59 59
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
61 MAX_WBITS, MAX_MEM_LEVEL));
61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
63 if (!workspace->def_strm.workspace || 64 if (!workspace->def_strm.workspace ||
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54} 54}
55EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
56 56
57static int sync_buffer(void *word) 57static int sleep_on_buffer(void *word)
58{ 58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule(); 59 io_schedule();
68 return 0; 60 return 0;
69} 61}
70 62
71void __lock_buffer(struct buffer_head *bh) 63void __lock_buffer(struct buffer_head *bh)
72{ 64{
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 65 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
74 TASK_UNINTERRUPTIBLE); 66 TASK_UNINTERRUPTIBLE);
75} 67}
76EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
90 */ 82 */
91void __wait_on_buffer(struct buffer_head * bh) 83void __wait_on_buffer(struct buffer_head * bh)
92{ 84{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 85 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
94} 86}
95EXPORT_SYMBOL(__wait_on_buffer); 87EXPORT_SYMBOL(__wait_on_buffer);
96 88
@@ -749,10 +741,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749{ 741{
750 struct buffer_head *bh; 742 struct buffer_head *bh;
751 struct list_head tmp; 743 struct list_head tmp;
752 struct address_space *mapping, *prev_mapping = NULL; 744 struct address_space *mapping;
753 int err = 0, err2; 745 int err = 0, err2;
746 struct blk_plug plug;
754 747
755 INIT_LIST_HEAD(&tmp); 748 INIT_LIST_HEAD(&tmp);
749 blk_start_plug(&plug);
756 750
757 spin_lock(lock); 751 spin_lock(lock);
758 while (!list_empty(list)) { 752 while (!list_empty(list)) {
@@ -775,7 +769,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
775 * still in flight on potentially older 769 * still in flight on potentially older
776 * contents. 770 * contents.
777 */ 771 */
778 write_dirty_buffer(bh, WRITE_SYNC_PLUG); 772 write_dirty_buffer(bh, WRITE_SYNC);
779 773
780 /* 774 /*
781 * Kick off IO for the previous mapping. Note 775 * Kick off IO for the previous mapping. Note
@@ -783,16 +777,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
783 * wait_on_buffer() will do that for us 777 * wait_on_buffer() will do that for us
784 * through sync_buffer(). 778 * through sync_buffer().
785 */ 779 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
790 brelse(bh); 780 brelse(bh);
791 spin_lock(lock); 781 spin_lock(lock);
792 } 782 }
793 } 783 }
794 } 784 }
795 785
786 spin_unlock(lock);
787 blk_finish_plug(&plug);
788 spin_lock(lock);
789
796 while (!list_empty(&tmp)) { 790 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev); 791 bh = BH_ENTRY(tmp.prev);
798 get_bh(bh); 792 get_bh(bh);
@@ -1144,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1144 * inode list. 1138 * inode list.
1145 * 1139 *
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1140 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and the global inode_lock. 1141 * mapping->tree_lock and mapping->host->i_lock.
1148 */ 1142 */
1149void mark_buffer_dirty(struct buffer_head *bh) 1143void mark_buffer_dirty(struct buffer_head *bh)
1150{ 1144{
@@ -1614,14 +1608,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1614 * prevents this contention from occurring. 1608 * prevents this contention from occurring.
1615 * 1609 *
1616 * If block_write_full_page() is called with wbc->sync_mode == 1610 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this 1611 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes, but the 1612 * causes the writes to be flagged as synchronous writes.
1619 * block device queue will NOT be unplugged, since usually many pages
1620 * will be pushed to the out before the higher-level caller actually
1621 * waits for the writes to be completed. The various wait functions,
1622 * such as wait_on_writeback_range() will ultimately call sync_page()
1623 * which will ultimately call blk_run_backing_dev(), which will end up
1624 * unplugging the device queue.
1625 */ 1613 */
1626static int __block_write_full_page(struct inode *inode, struct page *page, 1614static int __block_write_full_page(struct inode *inode, struct page *page,
1627 get_block_t *get_block, struct writeback_control *wbc, 1615 get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1622,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1634 const unsigned blocksize = 1 << inode->i_blkbits; 1622 const unsigned blocksize = 1 << inode->i_blkbits;
1635 int nr_underway = 0; 1623 int nr_underway = 0;
1636 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1637 WRITE_SYNC_PLUG : WRITE); 1625 WRITE_SYNC : WRITE);
1638 1626
1639 BUG_ON(!PageLocked(page)); 1627 BUG_ON(!PageLocked(page));
1640 1628
@@ -3138,17 +3126,6 @@ out:
3138} 3126}
3139EXPORT_SYMBOL(try_to_free_buffers); 3127EXPORT_SYMBOL(try_to_free_buffers);
3140 3128
3141void block_sync_page(struct page *page)
3142{
3143 struct address_space *mapping;
3144
3145 smp_mb();
3146 mapping = page_mapping(page);
3147 if (mapping)
3148 blk_run_backing_dev(mapping->backing_dev_info, page);
3149}
3150EXPORT_SYMBOL(block_sync_page);
3151
3152/* 3129/*
3153 * There are no bdflush tunables left. But distributions are 3130 * There are no bdflush tunables left. But distributions are
3154 * still running obsolete flush daemons, so we terminate them here. 3131 * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bfe..a0358c2189cb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
275 bool preemptive) 275 bool preemptive)
276{ 276{
277 struct dentry *grave, *trap; 277 struct dentry *grave, *trap;
278 struct path path, path_to_graveyard;
278 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
279 int ret; 280 int ret;
280 281
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
287 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
288 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
289 _debug("unlink stale object"); 290 _debug("unlink stale object");
290 ret = vfs_unlink(dir->d_inode, rep);
291 291
292 if (preemptive) 292 path.mnt = cache->mnt;
293 cachefiles_mark_object_buried(cache, rep); 293 path.dentry = dir;
294 ret = security_path_unlink(&path, rep);
295 if (ret < 0) {
296 cachefiles_io_error(cache, "Unlink security error");
297 } else {
298 ret = vfs_unlink(dir->d_inode, rep);
299
300 if (preemptive)
301 cachefiles_mark_object_buried(cache, rep);
302 }
294 303
295 mutex_unlock(&dir->d_inode->i_mutex); 304 mutex_unlock(&dir->d_inode->i_mutex);
296 305
@@ -379,12 +388,23 @@ try_again:
379 } 388 }
380 389
381 /* attempt the rename */ 390 /* attempt the rename */
382 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); 391 path.mnt = cache->mnt;
383 if (ret != 0 && ret != -ENOMEM) 392 path.dentry = dir;
384 cachefiles_io_error(cache, "Rename failed with error %d", ret); 393 path_to_graveyard.mnt = cache->mnt;
394 path_to_graveyard.dentry = cache->graveyard;
395 ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
396 if (ret < 0) {
397 cachefiles_io_error(cache, "Rename security error %d", ret);
398 } else {
399 ret = vfs_rename(dir->d_inode, rep,
400 cache->graveyard->d_inode, grave);
401 if (ret != 0 && ret != -ENOMEM)
402 cachefiles_io_error(cache,
403 "Rename failed with error %d", ret);
385 404
386 if (preemptive) 405 if (preemptive)
387 cachefiles_mark_object_buried(cache, rep); 406 cachefiles_mark_object_buried(cache, rep);
407 }
388 408
389 unlock_rename(cache->graveyard, dir); 409 unlock_rename(cache->graveyard, dir);
390 dput(grave); 410 dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
448{ 468{
449 struct cachefiles_cache *cache; 469 struct cachefiles_cache *cache;
450 struct dentry *dir, *next = NULL; 470 struct dentry *dir, *next = NULL;
471 struct path path;
451 unsigned long start; 472 unsigned long start;
452 const char *name; 473 const char *name;
453 int ret, nlen; 474 int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
458 479
459 cache = container_of(parent->fscache.cache, 480 cache = container_of(parent->fscache.cache,
460 struct cachefiles_cache, cache); 481 struct cachefiles_cache, cache);
482 path.mnt = cache->mnt;
461 483
462 ASSERT(parent->dentry); 484 ASSERT(parent->dentry);
463 ASSERT(parent->dentry->d_inode); 485 ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
511 if (ret < 0) 533 if (ret < 0)
512 goto create_error; 534 goto create_error;
513 535
536 path.dentry = dir;
537 ret = security_path_mkdir(&path, next, 0);
538 if (ret < 0)
539 goto create_error;
514 start = jiffies; 540 start = jiffies;
515 ret = vfs_mkdir(dir->d_inode, next, 0); 541 ret = vfs_mkdir(dir->d_inode, next, 0);
516 cachefiles_hist(cachefiles_mkdir_histogram, start); 542 cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
536 if (ret < 0) 562 if (ret < 0)
537 goto create_error; 563 goto create_error;
538 564
565 path.dentry = dir;
566 ret = security_path_mknod(&path, next, S_IFREG, 0);
567 if (ret < 0)
568 goto create_error;
539 start = jiffies; 569 start = jiffies;
540 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); 570 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
541 cachefiles_hist(cachefiles_create_histogram, start); 571 cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
692{ 722{
693 struct dentry *subdir; 723 struct dentry *subdir;
694 unsigned long start; 724 unsigned long start;
725 struct path path;
695 int ret; 726 int ret;
696 727
697 _enter(",,%s", dirname); 728 _enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
719 750
720 _debug("attempt mkdir"); 751 _debug("attempt mkdir");
721 752
753 path.mnt = cache->mnt;
754 path.dentry = dir;
755 ret = security_path_mkdir(&path, subdir, 0700);
756 if (ret < 0)
757 goto mkdir_error;
722 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 758 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
723 if (ret < 0) 759 if (ret < 0)
724 goto mkdir_error; 760 goto mkdir_error;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 60d27bc9eb83..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1560,9 +1560,10 @@ retry_locked:
1560 /* NOTE: no side-effects allowed, until we take s_mutex */ 1560 /* NOTE: no side-effects allowed, until we take s_mutex */
1561 1561
1562 revoking = cap->implemented & ~cap->issued; 1562 revoking = cap->implemented & ~cap->issued;
1563 if (revoking) 1563 dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
1564 dout(" mds%d revoking %s\n", cap->mds, 1564 cap->mds, cap, ceph_cap_string(cap->issued),
1565 ceph_cap_string(revoking)); 1565 ceph_cap_string(cap->implemented),
1566 ceph_cap_string(revoking));
1566 1567
1567 if (cap == ci->i_auth_cap && 1568 if (cap == ci->i_auth_cap &&
1568 (cap->issued & CEPH_CAP_FILE_WR)) { 1569 (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
1658 1659
1659 if (cap == ci->i_auth_cap && ci->i_dirty_caps) 1660 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1660 flushing = __mark_caps_flushing(inode, session); 1661 flushing = __mark_caps_flushing(inode, session);
1662 else
1663 flushing = 0;
1661 1664
1662 mds = cap->mds; /* remember mds, so we don't repeat */ 1665 mds = cap->mds; /* remember mds, so we don't repeat */
1663 sent++; 1666 sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1940 } 1943 }
1941} 1944}
1942 1945
1946static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1947 struct ceph_mds_session *session,
1948 struct inode *inode)
1949{
1950 struct ceph_inode_info *ci = ceph_inode(inode);
1951 struct ceph_cap *cap;
1952 int delayed = 0;
1953
1954 spin_lock(&inode->i_lock);
1955 cap = ci->i_auth_cap;
1956 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1957 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1958 __ceph_flush_snaps(ci, &session, 1);
1959 if (ci->i_flushing_caps) {
1960 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1961 __ceph_caps_used(ci),
1962 __ceph_caps_wanted(ci),
1963 cap->issued | cap->implemented,
1964 ci->i_flushing_caps, NULL);
1965 if (delayed) {
1966 spin_lock(&inode->i_lock);
1967 __cap_delay_requeue(mdsc, ci);
1968 spin_unlock(&inode->i_lock);
1969 }
1970 } else {
1971 spin_unlock(&inode->i_lock);
1972 }
1973}
1974
1943 1975
1944/* 1976/*
1945 * Take references to capabilities we hold, so that we don't release 1977 * Take references to capabilities we hold, so that we don't release
@@ -2687,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2687 ceph_add_cap(inode, session, cap_id, -1, 2719 ceph_add_cap(inode, session, cap_id, -1,
2688 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2720 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2689 NULL /* no caps context */); 2721 NULL /* no caps context */);
2690 try_flush_caps(inode, session, NULL); 2722 kick_flushing_inode_caps(mdsc, session, inode);
2691 up_read(&mdsc->snap_rwsem); 2723 up_read(&mdsc->snap_rwsem);
2692 2724
2693 /* make sure we re-request max_size, if necessary */ 2725 /* make sure we re-request max_size, if necessary */
@@ -2785,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2785 case CEPH_CAP_OP_IMPORT: 2817 case CEPH_CAP_OP_IMPORT:
2786 handle_cap_import(mdsc, inode, h, session, 2818 handle_cap_import(mdsc, inode, h, session,
2787 snaptrace, snaptrace_len); 2819 snaptrace, snaptrace_len);
2788 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2820 ceph_check_caps(ceph_inode(inode), 0, session);
2789 session);
2790 goto done_unlocked; 2821 goto done_unlocked;
2791 } 2822 }
2792 2823
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac112..0dba6915712b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
210 if (!fsc->debugfs_congestion_kb) 210 if (!fsc->debugfs_congestion_kb)
211 goto out; 211 goto out;
212 212
213 dout("a\n");
214
215 snprintf(name, sizeof(name), "../../bdi/%s", 213 snprintf(name, sizeof(name), "../../bdi/%s",
216 dev_name(fsc->backing_dev_info.dev)); 214 dev_name(fsc->backing_dev_info.dev));
217 fsc->debugfs_bdi = 215 fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
221 if (!fsc->debugfs_bdi) 219 if (!fsc->debugfs_bdi)
222 goto out; 220 goto out;
223 221
224 dout("b\n");
225 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
226 0600, 223 0600,
227 fsc->client->debugfs_dir, 224 fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
230 if (!fsc->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
231 goto out; 228 goto out;
232 229
233 dout("ca\n");
234 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 230 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 0600, 231 0600,
236 fsc->client->debugfs_dir, 232 fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
239 if (!fsc->debugfs_mdsc) 235 if (!fsc->debugfs_mdsc)
240 goto out; 236 goto out;
241 237
242 dout("da\n");
243 fsc->debugfs_caps = debugfs_create_file("caps", 238 fsc->debugfs_caps = debugfs_create_file("caps",
244 0400, 239 0400,
245 fsc->client->debugfs_dir, 240 fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
248 if (!fsc->debugfs_caps) 243 if (!fsc->debugfs_caps)
249 goto out; 244 goto out;
250 245
251 dout("ea\n");
252 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 246 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
253 0600, 247 0600,
254 fsc->client->debugfs_dir, 248 fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..1a867a3601ae 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
161 filp->f_pos = di->offset; 161 filp->f_pos = di->offset;
162 err = filldir(dirent, dentry->d_name.name, 162 err = filldir(dirent, dentry->d_name.name,
163 dentry->d_name.len, di->offset, 163 dentry->d_name.len, di->offset,
164 dentry->d_inode->i_ino, 164 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
165 dentry->d_inode->i_mode >> 12); 165 dentry->d_inode->i_mode >> 12);
166 166
167 if (last) { 167 if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
245 245
246 dout("readdir off 0 -> '.'\n"); 246 dout("readdir off 0 -> '.'\n");
247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
248 inode->i_ino, inode->i_mode >> 12) < 0) 248 ceph_translate_ino(inode->i_sb, inode->i_ino),
249 inode->i_mode >> 12) < 0)
249 return 0; 250 return 0;
250 filp->f_pos = 1; 251 filp->f_pos = 1;
251 off = 1; 252 off = 1;
252 } 253 }
253 if (filp->f_pos == 1) { 254 if (filp->f_pos == 1) {
255 ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
254 dout("readdir off 1 -> '..'\n"); 256 dout("readdir off 1 -> '..'\n");
255 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
256 filp->f_dentry->d_parent->d_inode->i_ino, 258 ceph_translate_ino(inode->i_sb, ino),
257 inode->i_mode >> 12) < 0) 259 inode->i_mode >> 12) < 0)
258 return 0; 260 return 0;
259 filp->f_pos = 2; 261 filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
377 if (filldir(dirent, 379 if (filldir(dirent,
378 rinfo->dir_dname[off - fi->offset], 380 rinfo->dir_dname[off - fi->offset],
379 rinfo->dir_dname_len[off - fi->offset], 381 rinfo->dir_dname_len[off - fi->offset],
380 pos, ino, ftype) < 0) { 382 pos,
383 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
381 dout("filldir stopping us...\n"); 384 dout("filldir stopping us...\n");
382 return 0; 385 return 0;
383 } 386 }
@@ -409,7 +412,7 @@ more:
409 spin_lock(&inode->i_lock); 412 spin_lock(&inode->i_lock);
410 if (ci->i_release_count == fi->dir_release_count) { 413 if (ci->i_release_count == fi->dir_release_count) {
411 dout(" marking %p complete\n", inode); 414 dout(" marking %p complete\n", inode);
412 ci->i_ceph_flags |= CEPH_I_COMPLETE; 415 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
413 ci->i_max_offset = filp->f_pos; 416 ci->i_max_offset = filp->f_pos;
414 } 417 }
415 spin_unlock(&inode->i_lock); 418 spin_unlock(&inode->i_lock);
@@ -496,6 +499,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
496 499
497 /* .snap dir? */ 500 /* .snap dir? */
498 if (err == -ENOENT && 501 if (err == -ENOENT &&
502 ceph_snap(parent) == CEPH_NOSNAP &&
499 strcmp(dentry->d_name.name, 503 strcmp(dentry->d_name.name,
500 fsc->mount_options->snapdir_name) == 0) { 504 fsc->mount_options->snapdir_name) == 0) {
501 struct inode *inode = ceph_get_snapdir(parent); 505 struct inode *inode = ceph_get_snapdir(parent);
@@ -992,7 +996,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
992{ 996{
993 struct inode *dir; 997 struct inode *dir;
994 998
995 if (nd->flags & LOOKUP_RCU) 999 if (nd && nd->flags & LOOKUP_RCU)
996 return -ECHILD; 1000 return -ECHILD;
997 1001
998 dir = dentry->d_parent->d_inode; 1002 dir = dentry->d_parent->d_inode;
@@ -1023,34 +1027,13 @@ out_touch:
1023} 1027}
1024 1028
1025/* 1029/*
1026 * When a dentry is released, clear the dir I_COMPLETE if it was part 1030 * Release our ceph_dentry_info.
1027 * of the current dir gen or if this is in the snapshot namespace.
1028 */ 1031 */
1029static void ceph_dentry_release(struct dentry *dentry) 1032static void ceph_d_release(struct dentry *dentry)
1030{ 1033{
1031 struct ceph_dentry_info *di = ceph_dentry(dentry); 1034 struct ceph_dentry_info *di = ceph_dentry(dentry);
1032 struct inode *parent_inode = NULL;
1033 u64 snapid = CEPH_NOSNAP;
1034 1035
1035 if (!IS_ROOT(dentry)) { 1036 dout("d_release %p\n", dentry);
1036 parent_inode = dentry->d_parent->d_inode;
1037 if (parent_inode)
1038 snapid = ceph_snap(parent_inode);
1039 }
1040 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1041 if (parent_inode && snapid != CEPH_SNAPDIR) {
1042 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1043
1044 spin_lock(&parent_inode->i_lock);
1045 if (ci->i_shared_gen == di->lease_shared_gen ||
1046 snapid <= CEPH_MAXSNAP) {
1047 dout(" clearing %p complete (d_release)\n",
1048 parent_inode);
1049 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1050 ci->i_release_count++;
1051 }
1052 spin_unlock(&parent_inode->i_lock);
1053 }
1054 if (di) { 1037 if (di) {
1055 ceph_dentry_lru_del(dentry); 1038 ceph_dentry_lru_del(dentry);
1056 if (di->lease_session) 1039 if (di->lease_session)
@@ -1275,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
1275 1258
1276const struct dentry_operations ceph_dentry_ops = { 1259const struct dentry_operations ceph_dentry_ops = {
1277 .d_revalidate = ceph_d_revalidate, 1260 .d_revalidate = ceph_d_revalidate,
1278 .d_release = ceph_dentry_release, 1261 .d_release = ceph_d_release,
1279}; 1262};
1280 1263
1281const struct dentry_operations ceph_snapdir_dentry_ops = { 1264const struct dentry_operations ceph_snapdir_dentry_ops = {
1282 .d_revalidate = ceph_snapdir_d_revalidate, 1265 .d_revalidate = ceph_snapdir_d_revalidate,
1283 .d_release = ceph_dentry_release, 1266 .d_release = ceph_d_release,
1284}; 1267};
1285 1268
1286const struct dentry_operations ceph_snap_dentry_ops = { 1269const struct dentry_operations ceph_snap_dentry_ops = {
1287 .d_release = ceph_dentry_release, 1270 .d_release = ceph_d_release,
1288}; 1271};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d898..159b512d5a27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,11 +564,19 @@ more:
564 * start_request so that a tid has been assigned. 564 * start_request so that a tid has been assigned.
565 */ 565 */
566 spin_lock(&ci->i_unsafe_lock); 566 spin_lock(&ci->i_unsafe_lock);
567 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); 567 list_add_tail(&req->r_unsafe_item,
568 &ci->i_unsafe_writes);
568 spin_unlock(&ci->i_unsafe_lock); 569 spin_unlock(&ci->i_unsafe_lock);
569 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 570 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
570 } 571 }
572
571 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 573 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
574 if (ret < 0 && req->r_safe_callback) {
575 spin_lock(&ci->i_unsafe_lock);
576 list_del_init(&req->r_unsafe_item);
577 spin_unlock(&ci->i_unsafe_lock);
578 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
579 }
572 } 580 }
573 581
574 if (file->f_flags & O_DIRECT) 582 if (file->f_flags & O_DIRECT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e835eff551e3..b54c97da1c43 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
36/* 36/*
37 * find or create an inode, given the ceph ino number 37 * find or create an inode, given the ceph ino number
38 */ 38 */
39static int ceph_set_ino_cb(struct inode *inode, void *data)
40{
41 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
42 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
43 return 0;
44}
45
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) 46struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{ 47{
41 struct inode *inode; 48 struct inode *inode;
@@ -707,13 +714,9 @@ static int fill_inode(struct inode *inode,
707 (issued & CEPH_CAP_FILE_EXCL) == 0 && 714 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
708 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 715 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
709 dout(" marking %p complete (empty)\n", inode); 716 dout(" marking %p complete (empty)\n", inode);
710 ci->i_ceph_flags |= CEPH_I_COMPLETE; 717 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
711 ci->i_max_offset = 2; 718 ci->i_max_offset = 2;
712 } 719 }
713
714 /* it may be better to set st_size in getattr instead? */
715 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
716 inode->i_size = ci->i_rbytes;
717 break; 720 break;
718 default: 721 default:
719 pr_err("fill_inode %llx.%llx BAD mode 0%o\n", 722 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -1034,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1034 dout("fill_trace doing d_move %p -> %p\n", 1037 dout("fill_trace doing d_move %p -> %p\n",
1035 req->r_old_dentry, dn); 1038 req->r_old_dentry, dn);
1036 1039
1037 /* d_move screws up d_subdirs order */
1038 ceph_i_clear(dir, CEPH_I_COMPLETE);
1039
1040 d_move(req->r_old_dentry, dn); 1040 d_move(req->r_old_dentry, dn);
1041 dout(" src %p '%.*s' dst %p '%.*s'\n", 1041 dout(" src %p '%.*s' dst %p '%.*s'\n",
1042 req->r_old_dentry, 1042 req->r_old_dentry,
@@ -1048,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1048 rehashing bug in vfs_rename_dir */ 1048 rehashing bug in vfs_rename_dir */
1049 ceph_invalidate_dentry_lease(dn); 1049 ceph_invalidate_dentry_lease(dn);
1050 1050
1051 /* take overwritten dentry's readdir offset */ 1051 /*
1052 dout("dn %p gets %p offset %lld (old offset %lld)\n", 1052 * d_move() puts the renamed dentry at the end of
1053 req->r_old_dentry, dn, ceph_dentry(dn)->offset, 1053 * d_subdirs. We need to assign it an appropriate
1054 * directory offset so we can behave when holding
1055 * I_COMPLETE.
1056 */
1057 ceph_set_dentry_offset(req->r_old_dentry);
1058 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1054 ceph_dentry(req->r_old_dentry)->offset); 1059 ceph_dentry(req->r_old_dentry)->offset);
1055 ceph_dentry(req->r_old_dentry)->offset =
1056 ceph_dentry(dn)->offset;
1057 1060
1058 dn = req->r_old_dentry; /* use old_dentry */ 1061 dn = req->r_old_dentry; /* use old_dentry */
1059 in = dn->d_inode; 1062 in = dn->d_inode;
@@ -1813,13 +1816,17 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1813 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); 1816 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1814 if (!err) { 1817 if (!err) {
1815 generic_fillattr(inode, stat); 1818 generic_fillattr(inode, stat);
1816 stat->ino = inode->i_ino; 1819 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1817 if (ceph_snap(inode) != CEPH_NOSNAP) 1820 if (ceph_snap(inode) != CEPH_NOSNAP)
1818 stat->dev = ceph_snap(inode); 1821 stat->dev = ceph_snap(inode);
1819 else 1822 else
1820 stat->dev = 0; 1823 stat->dev = 0;
1821 if (S_ISDIR(inode->i_mode)) { 1824 if (S_ISDIR(inode->i_mode)) {
1822 stat->size = ci->i_rbytes; 1825 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
1826 RBYTES))
1827 stat->size = ci->i_rbytes;
1828 else
1829 stat->size = ci->i_files + ci->i_subdirs;
1823 stat->blocks = 0; 1830 stat->blocks = 0;
1824 stat->blksize = 65536; 1831 stat->blksize = 65536;
1825 } 1832 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1e30d194a8e3..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
693 dout("choose_mds %p %llx.%llx " 693 dout("choose_mds %p %llx.%llx "
694 "frag %u mds%d (%d/%d)\n", 694 "frag %u mds%d (%d/%d)\n",
695 inode, ceph_vinop(inode), 695 inode, ceph_vinop(inode),
696 frag.frag, frag.mds, 696 frag.frag, mds,
697 (int)r, frag.ndist); 697 (int)r, frag.ndist);
698 return mds; 698 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
699 CEPH_MDS_STATE_ACTIVE)
700 return mds;
699 } 701 }
700 702
701 /* since this file/dir wasn't known to be 703 /* since this file/dir wasn't known to be
@@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
708 dout("choose_mds %p %llx.%llx " 710 dout("choose_mds %p %llx.%llx "
709 "frag %u mds%d (auth)\n", 711 "frag %u mds%d (auth)\n",
710 inode, ceph_vinop(inode), frag.frag, mds); 712 inode, ceph_vinop(inode), frag.frag, mds);
711 return mds; 713 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
714 CEPH_MDS_STATE_ACTIVE)
715 return mds;
712 } 716 }
713 } 717 }
714 } 718 }
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..f40b9139e437 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
584 if (lastinode) 584 if (lastinode)
585 iput(lastinode); 585 iput(lastinode);
586 586
587 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); 587 list_for_each_entry(child, &realm->children, child_item) {
588 list_for_each_entry(child, &realm->children, child_item) 588 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
589 queue_realm_cap_snaps(child); 589 realm, realm->ino, child, child->ino);
590 list_del_init(&child->dirty_item);
591 list_add(&child->dirty_item, &realm->dirty_item);
592 }
590 593
594 list_del_init(&realm->dirty_item);
591 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 595 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
592} 596}
593 597
@@ -683,7 +687,9 @@ more:
683 * queue cap snaps _after_ we've built the new snap contexts, 687 * queue cap snaps _after_ we've built the new snap contexts,
684 * so that i_head_snapc can be set appropriately. 688 * so that i_head_snapc can be set appropriately.
685 */ 689 */
686 list_for_each_entry(realm, &dirty_realms, dirty_item) { 690 while (!list_empty(&dirty_realms)) {
691 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
692 dirty_item);
687 queue_realm_cap_snaps(realm); 693 queue_realm_cap_snaps(realm);
688 } 694 }
689 695
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf6f0f34082a..a9e78b4a258c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
131 Opt_rbytes, 131 Opt_rbytes,
132 Opt_norbytes, 132 Opt_norbytes,
133 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
134 Opt_ino32,
134}; 135};
135 136
136static match_table_t fsopt_tokens = { 137static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
150 {Opt_rbytes, "rbytes"}, 151 {Opt_rbytes, "rbytes"},
151 {Opt_norbytes, "norbytes"}, 152 {Opt_norbytes, "norbytes"},
152 {Opt_noasyncreaddir, "noasyncreaddir"}, 153 {Opt_noasyncreaddir, "noasyncreaddir"},
154 {Opt_ino32, "ino32"},
153 {-1, NULL} 155 {-1, NULL}
154}; 156};
155 157
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
225 case Opt_noasyncreaddir: 227 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; 228 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break; 229 break;
230 case Opt_ino32:
231 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
232 break;
228 default: 233 default:
229 BUG_ON(token); 234 BUG_ON(token);
230 } 235 }
@@ -288,8 +293,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
288 fsopt->sb_flags = flags; 293 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 294 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290 295
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 296 fsopt->rsize = CEPH_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 297 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 300 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 301 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 302 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
@@ -368,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
368 375
369 if (fsopt->wsize) 376 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize); 377 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) 378 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize); 379 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb()) 380 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 381 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae2..619fe719968f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
30 31
31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 32#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
32 33
@@ -35,6 +36,7 @@
35#define ceph_test_mount_opt(fsc, opt) \ 36#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
37 38
39#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
38#define CEPH_MAX_READDIR_DEFAULT 1024 40#define CEPH_MAX_READDIR_DEFAULT 1024
39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 41#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 42#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
319 return container_of(inode, struct ceph_inode_info, vfs_inode); 321 return container_of(inode, struct ceph_inode_info, vfs_inode);
320} 322}
321 323
324static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
325{
326 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
327}
328
329static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
330{
331 return (struct ceph_fs_client *)sb->s_fs_info;
332}
333
322static inline struct ceph_vino ceph_vino(struct inode *inode) 334static inline struct ceph_vino ceph_vino(struct inode *inode)
323{ 335{
324 return ceph_inode(inode)->i_vino; 336 return ceph_inode(inode)->i_vino;
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
327/* 339/*
328 * ino_t is <64 bits on many architectures, blech. 340 * ino_t is <64 bits on many architectures, blech.
329 * 341 *
330 * don't include snap in ino hash, at least for now. 342 * i_ino (kernel inode) st_ino (userspace)
343 * i386 32 32
344 * x86_64+ino32 64 32
345 * x86_64 64 64
346 */
347static inline u32 ceph_ino_to_ino32(ino_t ino)
348{
349 ino ^= ino >> (sizeof(ino) * 8 - 32);
350 if (!ino)
351 ino = 1;
352 return ino;
353}
354
355/*
356 * kernel i_ino value
331 */ 357 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 358static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{ 359{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ 360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32 361#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; 362 ino = ceph_ino_to_ino32(ino);
337 if (!ino)
338 ino = 1;
339#endif 363#endif
340 return ino; 364 return ino;
341} 365}
342 366
367/*
368 * user-visible ino (stat, filldir)
369 */
370#if BITS_PER_LONG == 32
371static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
372{
373 return ino;
374}
375#else
376static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
377{
378 if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
379 ino = ceph_ino_to_ino32(ino);
380 return ino;
381}
382#endif
383
384
343/* for printf-style formatting */ 385/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap 386#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345 387
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
428 return ((loff_t)frag << 32) | (loff_t)off; 470 return ((loff_t)frag << 32) | (loff_t)off;
429} 471}
430 472
431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
432{
433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
434 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
435 return 0;
436}
437
438/* 473/*
439 * caps helpers 474 * caps helpers
440 */ 475 */
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
503 int *total, int *avail, int *used, 538 int *total, int *avail, int *used,
504 int *reserved, int *min); 539 int *reserved, int *min);
505 540
506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
507{
508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
509}
510
511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
512{
513 return (struct ceph_fs_client *)sb->s_fs_info;
514}
515 541
516 542
517/* 543/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f79..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 struct rb_node **p; 219 struct rb_node **p;
220 struct rb_node *parent = NULL; 220 struct rb_node *parent = NULL;
221 struct ceph_inode_xattr *xattr = NULL; 221 struct ceph_inode_xattr *xattr = NULL;
222 int name_len = strlen(name);
222 int c; 223 int c;
223 224
224 p = &ci->i_xattrs.index.rb_node; 225 p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
226 parent = *p; 227 parent = *p;
227 xattr = rb_entry(parent, struct ceph_inode_xattr, node); 228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
228 c = strncmp(name, xattr->name, xattr->name_len); 229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c == 0 && name_len > xattr->name_len)
231 c = 1;
229 if (c < 0) 232 if (c < 0)
230 p = &(*p)->rb_left; 233 p = &(*p)->rb_left;
231 else if (c > 0) 234 else if (c > 0)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD4
6 select CRYPTO_MD5 7 select CRYPTO_MD5
7 select CRYPTO_HMAC 8 select CRYPTO_HMAC
8 select CRYPTO_ARC4 9 select CRYPTO_ARC4
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 43b19dd39191..d87558448e3d 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ 8 cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o 9 readdir.o ioctl.o sess.o export.o
10 10
11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o 11cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 46af99ab3614..fe1683590828 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -452,6 +452,11 @@ A partial list of the supported mount options follows:
452 if oplock (caching token) is granted and held. Note that 452 if oplock (caching token) is granted and held. Note that
453 direct allows write operations larger than page size 453 direct allows write operations larger than page size
454 to be sent to the server. 454 to be sent to the server.
455 strictcache Use for switching on strict cache mode. In this mode the
456 client read from the cache all the time it has Oplock Level II,
457 otherwise - read from the server. All written data are stored
458 in the cache, but if the client doesn't have Exclusive Oplock,
459 it writes the data to the server.
455 acl Allow setfacl and getfacl to manage posix ACLs if server 460 acl Allow setfacl and getfacl to manage posix ACLs if server
456 supports them. (default) 461 supports them. (default)
457 noacl Do not allow setfacl and getfacl calls on this mount 462 noacl Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7ed36536e754..0a265ad9e426 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
282 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
283 BUG_ON(IS_ROOT(mntpt)); 283 BUG_ON(IS_ROOT(mntpt));
284 284
285 xid = GetXid();
286
287 /* 285 /*
288 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
289 * responses must be prefixed by a single '\' character instead of 287 * responses must be prefixed by a single '\' character instead of
@@ -293,20 +291,21 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
293 mnt = ERR_PTR(-ENOMEM); 291 mnt = ERR_PTR(-ENOMEM);
294 full_path = build_path_from_dentry(mntpt); 292 full_path = build_path_from_dentry(mntpt);
295 if (full_path == NULL) 293 if (full_path == NULL)
296 goto free_xid; 294 goto cdda_exit;
297 295
298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); 296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
299 tlink = cifs_sb_tlink(cifs_sb); 297 tlink = cifs_sb_tlink(cifs_sb);
300 mnt = ERR_PTR(-EINVAL);
301 if (IS_ERR(tlink)) { 298 if (IS_ERR(tlink)) {
302 mnt = ERR_CAST(tlink); 299 mnt = ERR_CAST(tlink);
303 goto free_full_path; 300 goto free_full_path;
304 } 301 }
305 ses = tlink_tcon(tlink)->ses; 302 ses = tlink_tcon(tlink)->ses;
306 303
304 xid = GetXid();
307 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, 305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
308 &num_referrals, &referrals, 306 &num_referrals, &referrals,
309 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
310 309
311 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
312 311
@@ -339,8 +338,7 @@ success:
339 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
340free_full_path: 339free_full_path:
341 kfree(full_path); 340 kfree(full_path);
342free_xid: 341cdda_exit:
343 FreeXid(xid);
344 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
345 return mnt; 343 return mnt;
346} 344}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
372 372
373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
374 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
375 379
376 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
377 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66f3d50d0676..a51585f9852b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
24#include "cifspdu.h" 24#include "cifspdu.h"
25#include "cifsglob.h" 25#include "cifsglob.h"
26#include "cifs_debug.h" 26#include "cifs_debug.h"
27#include "md5.h"
28#include "cifs_unicode.h" 27#include "cifs_unicode.h"
29#include "cifsproto.h" 28#include "cifsproto.h"
30#include "ntlmssp.h" 29#include "ntlmssp.h"
@@ -37,11 +36,6 @@
37/* Note that the smb header signature field on input contains the 36/* Note that the smb header signature field on input contains the
38 sequence number before this function is called */ 37 sequence number before this function is called */
39 38
40extern void mdfour(unsigned char *out, unsigned char *in, int n);
41extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
42extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
43 unsigned char *p24);
44
45static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
46 struct TCP_Server_Info *server, char *signature) 40 struct TCP_Server_Info *server, char *signature)
47{ 41{
@@ -234,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
234/* first calculate 24 bytes ntlm response and then 16 byte session key */ 228/* first calculate 24 bytes ntlm response and then 16 byte session key */
235int setup_ntlm_response(struct cifsSesInfo *ses) 229int setup_ntlm_response(struct cifsSesInfo *ses)
236{ 230{
231 int rc = 0;
237 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 232 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
238 char temp_key[CIFS_SESS_KEY_SIZE]; 233 char temp_key[CIFS_SESS_KEY_SIZE];
239 234
@@ -247,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
247 } 242 }
248 ses->auth_key.len = temp_len; 243 ses->auth_key.len = temp_len;
249 244
250 SMBNTencrypt(ses->password, ses->server->cryptkey, 245 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
251 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 246 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
247 if (rc) {
248 cFYI(1, "%s Can't generate NTLM response, error: %d",
249 __func__, rc);
250 return rc;
251 }
252 252
253 E_md4hash(ses->password, temp_key); 253 rc = E_md4hash(ses->password, temp_key);
254 mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); 254 if (rc) {
255 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
256 return rc;
257 }
255 258
256 return 0; 259 rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
260 if (rc)
261 cFYI(1, "%s Can't generate NTLM session key, error: %d",
262 __func__, rc);
263
264 return rc;
257} 265}
258 266
259#ifdef CONFIG_CIFS_WEAK_PW_HASH 267#ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -649,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses)
649 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
650 658
651 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
652 if (!tfm_arc4 || IS_ERR(tfm_arc4)) { 660 if (IS_ERR(tfm_arc4)) {
661 rc = PTR_ERR(tfm_arc4);
653 cERROR(1, "could not allocate crypto API arc4\n"); 662 cERROR(1, "could not allocate crypto API arc4\n");
654 return PTR_ERR(tfm_arc4); 663 return rc;
655 } 664 }
656 665
657 desc.tfm = tfm_arc4; 666 desc.tfm = tfm_arc4;
@@ -700,14 +709,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
700 unsigned int size; 709 unsigned int size;
701 710
702 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); 711 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
703 if (!server->secmech.hmacmd5 || 712 if (IS_ERR(server->secmech.hmacmd5)) {
704 IS_ERR(server->secmech.hmacmd5)) {
705 cERROR(1, "could not allocate crypto hmacmd5\n"); 713 cERROR(1, "could not allocate crypto hmacmd5\n");
706 return PTR_ERR(server->secmech.hmacmd5); 714 return PTR_ERR(server->secmech.hmacmd5);
707 } 715 }
708 716
709 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); 717 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
710 if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { 718 if (IS_ERR(server->secmech.md5)) {
711 cERROR(1, "could not allocate crypto md5\n"); 719 cERROR(1, "could not allocate crypto md5\n");
712 rc = PTR_ERR(server->secmech.md5); 720 rc = PTR_ERR(server->secmech.md5);
713 goto crypto_allocate_md5_fail; 721 goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec006474..000000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
1/*
2 * fs/cifs/cifsencrypt.h
3 *
4 * Copyright (c) International Business Machines Corp., 2005
5 * Author(s): Steve French (sfrench@us.ibm.com)
6 *
7 * Externs for misc. small encryption routines
8 * so we do not have to put them in cifsproto.h
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published
12 * by the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
18 * the GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25/* md4.c */
26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
31
32
33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a8323f1dc1c4..f2970136d17d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -600,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
600{ 600{
601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 601 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
602 ssize_t written; 602 ssize_t written;
603 int rc;
603 604
604 written = generic_file_aio_write(iocb, iov, nr_segs, pos); 605 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
605 if (!CIFS_I(inode)->clientCanCacheAll) 606
606 filemap_fdatawrite(inode->i_mapping); 607 if (CIFS_I(inode)->clientCanCacheAll)
608 return written;
609
610 rc = filemap_fdatawrite(inode->i_mapping);
611 if (rc)
612 cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
613
607 return written; 614 return written;
608} 615}
609 616
@@ -737,7 +744,7 @@ const struct file_operations cifs_file_strict_ops = {
737 .read = do_sync_read, 744 .read = do_sync_read,
738 .write = do_sync_write, 745 .write = do_sync_write,
739 .aio_read = cifs_strict_readv, 746 .aio_read = cifs_strict_readv,
740 .aio_write = cifs_file_aio_write, 747 .aio_write = cifs_strict_writev,
741 .open = cifs_open, 748 .open = cifs_open,
742 .release = cifs_close, 749 .release = cifs_close,
743 .lock = cifs_lock, 750 .lock = cifs_lock,
@@ -793,7 +800,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
793 .read = do_sync_read, 800 .read = do_sync_read,
794 .write = do_sync_write, 801 .write = do_sync_write,
795 .aio_read = cifs_strict_readv, 802 .aio_read = cifs_strict_readv,
796 .aio_write = cifs_file_aio_write, 803 .aio_write = cifs_strict_writev,
797 .open = cifs_open, 804 .open = cifs_open,
798 .release = cifs_close, 805 .release = cifs_close,
799 .fsync = cifs_strict_fsync, 806 .fsync = cifs_strict_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f23206d46531..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -85,7 +85,9 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, 85extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
86 unsigned long nr_segs, loff_t pos); 86 unsigned long nr_segs, loff_t pos);
87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 87extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
88 size_t write_size, loff_t *poffset); 88 size_t write_size, loff_t *poffset);
89extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
90 unsigned long nr_segs, loff_t pos);
89extern int cifs_lock(struct file *, int, struct file_lock *); 91extern int cifs_lock(struct file *, int, struct file_lock *);
90extern int cifs_fsync(struct file *, int); 92extern int cifs_fsync(struct file *, int);
91extern int cifs_strict_fsync(struct file *, int); 93extern int cifs_strict_fsync(struct file *, int);
@@ -125,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
126#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
127 129
128#define CIFS_VERSION "1.69" 130#define CIFS_VERSION "1.71"
129#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5bfb75346cb0..17afb0fbcaed 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -166,6 +166,9 @@ struct TCP_Server_Info {
166 struct socket *ssocket; 166 struct socket *ssocket;
167 struct sockaddr_storage dstaddr; 167 struct sockaddr_storage dstaddr;
168 struct sockaddr_storage srcaddr; /* locally bind to this IP */ 168 struct sockaddr_storage srcaddr; /* locally bind to this IP */
169#ifdef CONFIG_NET_NS
170 struct net *net;
171#endif
169 wait_queue_head_t response_q; 172 wait_queue_head_t response_q;
170 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/ 173 wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
171 struct list_head pending_mid_q; 174 struct list_head pending_mid_q;
@@ -185,6 +188,8 @@ struct TCP_Server_Info {
185 /* multiplexed reads or writes */ 188 /* multiplexed reads or writes */
186 unsigned int maxBuf; /* maxBuf specifies the maximum */ 189 unsigned int maxBuf; /* maxBuf specifies the maximum */
187 /* message size the server can send or receive for non-raw SMBs */ 190 /* message size the server can send or receive for non-raw SMBs */
191 /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
192 /* when socket is setup (and during reconnect) before NegProt sent */
188 unsigned int max_rw; /* maxRw specifies the maximum */ 193 unsigned int max_rw; /* maxRw specifies the maximum */
189 /* message size the server can send or receive for */ 194 /* message size the server can send or receive for */
190 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 195 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
@@ -217,6 +222,36 @@ struct TCP_Server_Info {
217}; 222};
218 223
219/* 224/*
225 * Macros to allow the TCP_Server_Info->net field and related code to drop out
226 * when CONFIG_NET_NS isn't set.
227 */
228
229#ifdef CONFIG_NET_NS
230
231static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
232{
233 return srv->net;
234}
235
236static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
237{
238 srv->net = net;
239}
240
241#else
242
243static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
244{
245 return &init_net;
246}
247
248static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
249{
250}
251
252#endif
253
254/*
220 * Session structure. One of these for each uid session with a particular host 255 * Session structure. One of these for each uid session with a particular host
221 */ 256 */
222struct cifsSesInfo { 257struct cifsSesInfo {
@@ -619,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
619#define MID_REQUEST_SUBMITTED 2 654#define MID_REQUEST_SUBMITTED 2
620#define MID_RESPONSE_RECEIVED 4 655#define MID_RESPONSE_RECEIVED 4
621#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 656#define MID_RETRY_NEEDED 8 /* session closed while this request out */
622#define MID_NO_RESP_NEEDED 0x10 657#define MID_RESPONSE_MALFORMED 0x10
623 658
624/* Types of response buffer returned from SendReceive2 */ 659/* Types of response buffer returned from SendReceive2 */
625#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 982895fa7615..8096f27ad9a8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -85,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
85extern bool is_valid_oplock_break(struct smb_hdr *smb, 85extern bool is_valid_oplock_break(struct smb_hdr *smb,
86 struct TCP_Server_Info *); 86 struct TCP_Server_Info *);
87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); 87extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
88extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
89 unsigned int bytes_written);
88extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); 90extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
89extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); 91extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
90extern unsigned int smbCalcSize(struct smb_hdr *ptr); 92extern unsigned int smbCalcSize(struct smb_hdr *ptr);
@@ -373,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
373extern int cifs_verify_signature(struct smb_hdr *, 375extern int cifs_verify_signature(struct smb_hdr *,
374 struct TCP_Server_Info *server, 376 struct TCP_Server_Info *server,
375 __u32 expected_sequence_number); 377 __u32 expected_sequence_number);
376extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 378extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
377extern int setup_ntlm_response(struct cifsSesInfo *); 379extern int setup_ntlm_response(struct cifsSesInfo *);
378extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); 380extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
379extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 381extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -423,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
423extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, 425extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
424 const unsigned char *path, 426 const unsigned char *path,
425 struct cifs_sb_info *cifs_sb, int xid); 427 struct cifs_sb_info *cifs_sb, int xid);
428extern int mdfour(unsigned char *, unsigned char *, int);
429extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
430extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
431 unsigned char *p24);
432extern void E_P16(unsigned char *p14, unsigned char *p16);
433extern void E_P24(unsigned char *p21, const unsigned char *c8,
434 unsigned char *p24);
426#endif /* _CIFSPROTO_H */ 435#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c633..904aa47e3515 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -4914,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4915{ 4912{
4916 struct smb_com_transaction2_sfi_req *pSMB = NULL; 4913 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4917 char *data_offset;
4918 struct file_end_of_file_info *parm_data; 4914 struct file_end_of_file_info *parm_data;
4919 int rc = 0; 4915 int rc = 0;
4920 __u16 params, param_offset, offset, byte_count, count; 4916 __u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4938 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 4934 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4939 offset = param_offset + params; 4935 offset = param_offset + params;
4940 4936
4941 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4942
4943 count = sizeof(struct file_end_of_file_info); 4937 count = sizeof(struct file_end_of_file_info);
4944 pSMB->MaxParameterCount = cpu_to_le16(2); 4938 pSMB->MaxParameterCount = cpu_to_le16(2);
4945 /* BB find exact max SMB PDU from sess structure BB */ 4939 /* BB find exact max SMB PDU from sess structure BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 18d3c7724d6e..8d6c17ab593d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -55,9 +55,6 @@
55/* SMB echo "timeout" -- FIXME: tunable? */ 55/* SMB echo "timeout" -- FIXME: tunable? */
56#define SMB_ECHO_INTERVAL (60 * HZ) 56#define SMB_ECHO_INTERVAL (60 * HZ)
57 57
58extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
59 unsigned char *p24);
60
61extern mempool_t *cifs_req_poolp; 58extern mempool_t *cifs_req_poolp;
62 59
63struct smb_vol { 60struct smb_vol {
@@ -87,6 +84,7 @@ struct smb_vol {
87 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ 84 bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
88 bool server_ino:1; /* use inode numbers from server ie UniqueId */ 85 bool server_ino:1; /* use inode numbers from server ie UniqueId */
89 bool direct_io:1; 86 bool direct_io:1;
87 bool strict_io:1; /* strict cache behavior */
90 bool remap:1; /* set to remap seven reserved chars in filenames */ 88 bool remap:1; /* set to remap seven reserved chars in filenames */
91 bool posix_paths:1; /* unset to not ask for posix pathnames. */ 89 bool posix_paths:1; /* unset to not ask for posix pathnames. */
92 bool no_linux_ext:1; 90 bool no_linux_ext:1;
@@ -339,8 +337,13 @@ cifs_echo_request(struct work_struct *work)
339 struct TCP_Server_Info *server = container_of(work, 337 struct TCP_Server_Info *server = container_of(work,
340 struct TCP_Server_Info, echo.work); 338 struct TCP_Server_Info, echo.work);
341 339
342 /* no need to ping if we got a response recently */ 340 /*
343 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) 341 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
342 * done, which is indicated by maxBuf != 0. Also, no need to ping if
343 * we got a response recently
344 */
345 if (server->maxBuf == 0 ||
346 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
344 goto requeue_echo; 347 goto requeue_echo;
345 348
346 rc = CIFSSMBEcho(server); 349 rc = CIFSSMBEcho(server);
@@ -580,14 +583,23 @@ incomplete_rcv:
580 else if (reconnect == 1) 583 else if (reconnect == 1)
581 continue; 584 continue;
582 585
583 length += 4; /* account for rfc1002 hdr */ 586 total_read += 4; /* account for rfc1002 hdr */
584 587
588 dump_smb(smb_buffer, total_read);
585 589
586 dump_smb(smb_buffer, length); 590 /*
587 if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { 591 * We know that we received enough to get to the MID as we
588 cifs_dump_mem("Bad SMB: ", smb_buffer, 48); 592 * checked the pdu_length earlier. Now check to see
589 continue; 593 * if the rest of the header is OK. We borrow the length
590 } 594 * var for the rest of the loop to avoid a new stack var.
595 *
596 * 48 bytes is enough to display the header and a little bit
597 * into the payload for debugging purposes.
598 */
599 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
600 if (length != 0)
601 cifs_dump_mem("Bad SMB: ", smb_buffer,
602 min_t(unsigned int, total_read, 48));
591 603
592 mid_entry = NULL; 604 mid_entry = NULL;
593 server->lstrp = jiffies; 605 server->lstrp = jiffies;
@@ -599,7 +611,8 @@ incomplete_rcv:
599 if ((mid_entry->mid == smb_buffer->Mid) && 611 if ((mid_entry->mid == smb_buffer->Mid) &&
600 (mid_entry->midState == MID_REQUEST_SUBMITTED) && 612 (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
601 (mid_entry->command == smb_buffer->Command)) { 613 (mid_entry->command == smb_buffer->Command)) {
602 if (check2ndT2(smb_buffer,server->maxBuf) > 0) { 614 if (length == 0 &&
615 check2ndT2(smb_buffer, server->maxBuf) > 0) {
603 /* We have a multipart transact2 resp */ 616 /* We have a multipart transact2 resp */
604 isMultiRsp = true; 617 isMultiRsp = true;
605 if (mid_entry->resp_buf) { 618 if (mid_entry->resp_buf) {
@@ -634,12 +647,17 @@ incomplete_rcv:
634 mid_entry->resp_buf = smb_buffer; 647 mid_entry->resp_buf = smb_buffer;
635 mid_entry->largeBuf = isLargeBuf; 648 mid_entry->largeBuf = isLargeBuf;
636multi_t2_fnd: 649multi_t2_fnd:
637 mid_entry->midState = MID_RESPONSE_RECEIVED; 650 if (length == 0)
638 list_del_init(&mid_entry->qhead); 651 mid_entry->midState =
639 mid_entry->callback(mid_entry); 652 MID_RESPONSE_RECEIVED;
653 else
654 mid_entry->midState =
655 MID_RESPONSE_MALFORMED;
640#ifdef CONFIG_CIFS_STATS2 656#ifdef CONFIG_CIFS_STATS2
641 mid_entry->when_received = jiffies; 657 mid_entry->when_received = jiffies;
642#endif 658#endif
659 list_del_init(&mid_entry->qhead);
660 mid_entry->callback(mid_entry);
643 break; 661 break;
644 } 662 }
645 mid_entry = NULL; 663 mid_entry = NULL;
@@ -655,6 +673,9 @@ multi_t2_fnd:
655 else 673 else
656 smallbuf = NULL; 674 smallbuf = NULL;
657 } 675 }
676 } else if (length != 0) {
677 /* response sanity checks failed */
678 continue;
658 } else if (!is_valid_oplock_break(smb_buffer, server) && 679 } else if (!is_valid_oplock_break(smb_buffer, server) &&
659 !isMultiRsp) { 680 !isMultiRsp) {
660 cERROR(1, "No task to wake, unknown frame received! " 681 cERROR(1, "No task to wake, unknown frame received! "
@@ -1344,6 +1365,8 @@ cifs_parse_mount_options(char *options, const char *devname,
1344 vol->direct_io = 1; 1365 vol->direct_io = 1;
1345 } else if (strnicmp(data, "forcedirectio", 13) == 0) { 1366 } else if (strnicmp(data, "forcedirectio", 13) == 0) {
1346 vol->direct_io = 1; 1367 vol->direct_io = 1;
1368 } else if (strnicmp(data, "strictcache", 11) == 0) {
1369 vol->strict_io = 1;
1347 } else if (strnicmp(data, "noac", 4) == 0) { 1370 } else if (strnicmp(data, "noac", 4) == 0) {
1348 printk(KERN_WARNING "CIFS: Mount option noac not " 1371 printk(KERN_WARNING "CIFS: Mount option noac not "
1349 "supported. Instead set " 1372 "supported. Instead set "
@@ -1568,6 +1591,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
1568 1591
1569 spin_lock(&cifs_tcp_ses_lock); 1592 spin_lock(&cifs_tcp_ses_lock);
1570 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 1593 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
1594 if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
1595 continue;
1596
1571 if (!match_address(server, addr, 1597 if (!match_address(server, addr,
1572 (struct sockaddr *)&vol->srcaddr)) 1598 (struct sockaddr *)&vol->srcaddr))
1573 continue; 1599 continue;
@@ -1598,6 +1624,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1598 return; 1624 return;
1599 } 1625 }
1600 1626
1627 put_net(cifs_net_ns(server));
1628
1601 list_del_init(&server->tcp_ses_list); 1629 list_del_init(&server->tcp_ses_list);
1602 spin_unlock(&cifs_tcp_ses_lock); 1630 spin_unlock(&cifs_tcp_ses_lock);
1603 1631
@@ -1672,6 +1700,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1672 goto out_err; 1700 goto out_err;
1673 } 1701 }
1674 1702
1703 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
1675 tcp_ses->hostname = extract_hostname(volume_info->UNC); 1704 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1676 if (IS_ERR(tcp_ses->hostname)) { 1705 if (IS_ERR(tcp_ses->hostname)) {
1677 rc = PTR_ERR(tcp_ses->hostname); 1706 rc = PTR_ERR(tcp_ses->hostname);
@@ -1752,6 +1781,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1752out_err_crypto_release: 1781out_err_crypto_release:
1753 cifs_crypto_shash_release(tcp_ses); 1782 cifs_crypto_shash_release(tcp_ses);
1754 1783
1784 put_net(cifs_net_ns(tcp_ses));
1785
1755out_err: 1786out_err:
1756 if (tcp_ses) { 1787 if (tcp_ses) {
1757 if (!IS_ERR(tcp_ses->hostname)) 1788 if (!IS_ERR(tcp_ses->hostname))
@@ -2263,8 +2294,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
2263 } 2294 }
2264 2295
2265 if (socket == NULL) { 2296 if (socket == NULL) {
2266 rc = sock_create_kern(sfamily, SOCK_STREAM, 2297 rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
2267 IPPROTO_TCP, &socket); 2298 IPPROTO_TCP, &socket, 1);
2268 if (rc < 0) { 2299 if (rc < 0) {
2269 cERROR(1, "Error %d creating socket", rc); 2300 cERROR(1, "Error %d creating socket", rc);
2270 server->ssocket = NULL; 2301 server->ssocket = NULL;
@@ -2576,6 +2607,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2576 if (pvolume_info->multiuser) 2607 if (pvolume_info->multiuser)
2577 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | 2608 cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
2578 CIFS_MOUNT_NO_PERM); 2609 CIFS_MOUNT_NO_PERM);
2610 if (pvolume_info->strict_io)
2611 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
2579 if (pvolume_info->direct_io) { 2612 if (pvolume_info->direct_io) {
2580 cFYI(1, "mounting share using direct i/o"); 2613 cFYI(1, "mounting share using direct i/o");
2581 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; 2614 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
@@ -2977,7 +3010,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2977 bcc_ptr); 3010 bcc_ptr);
2978 else 3011 else
2979#endif /* CIFS_WEAK_PW_HASH */ 3012#endif /* CIFS_WEAK_PW_HASH */
2980 SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); 3013 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3014 bcc_ptr);
2981 3015
2982 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3016 bcc_ptr += CIFS_AUTH_RESP_SIZE;
2983 if (ses->capabilities & CAP_UNICODE) { 3017 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d7d65a70678e..c27d236738fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
346 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink; 347 struct tcon_link *tlink;
348 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
349 struct cifsInodeInfo *pCifsInode;
350 char *full_path = NULL; 349 char *full_path = NULL;
351 bool posix_open_ok = false; 350 bool posix_open_ok = false;
352 __u16 netfid; 351 __u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
361 } 360 }
362 tcon = tlink_tcon(tlink); 361 tcon = tlink_tcon(tlink);
363 362
364 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
365
366 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
367 if (full_path == NULL) { 364 if (full_path == NULL) {
368 rc = -ENOMEM; 365 rc = -ENOMEM;
@@ -848,7 +845,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
848} 845}
849 846
850/* update the file size (if needed) after a write */ 847/* update the file size (if needed) after a write */
851static void 848void
852cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, 849cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
853 unsigned int bytes_written) 850 unsigned int bytes_written)
854{ 851{
@@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1146 char *write_data; 1143 char *write_data;
1147 int rc = -EFAULT; 1144 int rc = -EFAULT;
1148 int bytes_written = 0; 1145 int bytes_written = 0;
1149 struct cifs_sb_info *cifs_sb;
1150 struct inode *inode; 1146 struct inode *inode;
1151 struct cifsFileInfo *open_file; 1147 struct cifsFileInfo *open_file;
1152 1148
@@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1154 return -EFAULT; 1150 return -EFAULT;
1155 1151
1156 inode = page->mapping->host; 1152 inode = page->mapping->host;
1157 cifs_sb = CIFS_SB(inode->i_sb);
1158 1153
1159 offset += (loff_t)from; 1154 offset += (loff_t)from;
1160 write_data = kmap(page); 1155 write_data = kmap(page);
@@ -1574,34 +1569,6 @@ int cifs_fsync(struct file *file, int datasync)
1574 return rc; 1569 return rc;
1575} 1570}
1576 1571
1577/* static void cifs_sync_page(struct page *page)
1578{
1579 struct address_space *mapping;
1580 struct inode *inode;
1581 unsigned long index = page->index;
1582 unsigned int rpages = 0;
1583 int rc = 0;
1584
1585 cFYI(1, "sync page %p", page);
1586 mapping = page->mapping;
1587 if (!mapping)
1588 return 0;
1589 inode = mapping->host;
1590 if (!inode)
1591 return; */
1592
1593/* fill in rpages then
1594 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1595
1596/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1597
1598#if 0
1599 if (rc < 0)
1600 return rc;
1601 return 0;
1602#endif
1603} */
1604
1605/* 1572/*
1606 * As file closes, flush all cached write data for this inode checking 1573 * As file closes, flush all cached write data for this inode checking
1607 * for write behind errors. 1574 * for write behind errors.
@@ -1619,13 +1586,215 @@ int cifs_flush(struct file *file, fl_owner_t id)
1619 return rc; 1586 return rc;
1620} 1587}
1621 1588
1589static int
1590cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
1591{
1592 int rc = 0;
1593 unsigned long i;
1594
1595 for (i = 0; i < num_pages; i++) {
1596 pages[i] = alloc_page(__GFP_HIGHMEM);
1597 if (!pages[i]) {
1598 /*
1599 * save number of pages we have already allocated and
1600 * return with ENOMEM error
1601 */
1602 num_pages = i;
1603 rc = -ENOMEM;
1604 goto error;
1605 }
1606 }
1607
1608 return rc;
1609
1610error:
1611 for (i = 0; i < num_pages; i++)
1612 put_page(pages[i]);
1613 return rc;
1614}
1615
1616static inline
1617size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
1618{
1619 size_t num_pages;
1620 size_t clen;
1621
1622 clen = min_t(const size_t, len, wsize);
1623 num_pages = clen / PAGE_CACHE_SIZE;
1624 if (clen % PAGE_CACHE_SIZE)
1625 num_pages++;
1626
1627 if (cur_len)
1628 *cur_len = clen;
1629
1630 return num_pages;
1631}
1632
1633static ssize_t
1634cifs_iovec_write(struct file *file, const struct iovec *iov,
1635 unsigned long nr_segs, loff_t *poffset)
1636{
1637 unsigned int written;
1638 unsigned long num_pages, npages, i;
1639 size_t copied, len, cur_len;
1640 ssize_t total_written = 0;
1641 struct kvec *to_send;
1642 struct page **pages;
1643 struct iov_iter it;
1644 struct inode *inode;
1645 struct cifsFileInfo *open_file;
1646 struct cifsTconInfo *pTcon;
1647 struct cifs_sb_info *cifs_sb;
1648 int xid, rc;
1649
1650 len = iov_length(iov, nr_segs);
1651 if (!len)
1652 return 0;
1653
1654 rc = generic_write_checks(file, poffset, &len, 0);
1655 if (rc)
1656 return rc;
1657
1658 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
1659 num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
1660
1661 pages = kmalloc(sizeof(struct pages *)*num_pages, GFP_KERNEL);
1662 if (!pages)
1663 return -ENOMEM;
1664
1665 to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
1666 if (!to_send) {
1667 kfree(pages);
1668 return -ENOMEM;
1669 }
1670
1671 rc = cifs_write_allocate_pages(pages, num_pages);
1672 if (rc) {
1673 kfree(pages);
1674 kfree(to_send);
1675 return rc;
1676 }
1677
1678 xid = GetXid();
1679 open_file = file->private_data;
1680 pTcon = tlink_tcon(open_file->tlink);
1681 inode = file->f_path.dentry->d_inode;
1682
1683 iov_iter_init(&it, iov, nr_segs, len, 0);
1684 npages = num_pages;
1685
1686 do {
1687 size_t save_len = cur_len;
1688 for (i = 0; i < npages; i++) {
1689 copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
1690 copied = iov_iter_copy_from_user(pages[i], &it, 0,
1691 copied);
1692 cur_len -= copied;
1693 iov_iter_advance(&it, copied);
1694 to_send[i+1].iov_base = kmap(pages[i]);
1695 to_send[i+1].iov_len = copied;
1696 }
1697
1698 cur_len = save_len - cur_len;
1699
1700 do {
1701 if (open_file->invalidHandle) {
1702 rc = cifs_reopen_file(open_file, false);
1703 if (rc != 0)
1704 break;
1705 }
1706 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
1707 cur_len, *poffset, &written,
1708 to_send, npages, 0);
1709 } while (rc == -EAGAIN);
1710
1711 for (i = 0; i < npages; i++)
1712 kunmap(pages[i]);
1713
1714 if (written) {
1715 len -= written;
1716 total_written += written;
1717 cifs_update_eof(CIFS_I(inode), *poffset, written);
1718 *poffset += written;
1719 } else if (rc < 0) {
1720 if (!total_written)
1721 total_written = rc;
1722 break;
1723 }
1724
1725 /* get length and number of kvecs of the next write */
1726 npages = get_numpages(cifs_sb->wsize, len, &cur_len);
1727 } while (len > 0);
1728
1729 if (total_written > 0) {
1730 spin_lock(&inode->i_lock);
1731 if (*poffset > inode->i_size)
1732 i_size_write(inode, *poffset);
1733 spin_unlock(&inode->i_lock);
1734 }
1735
1736 cifs_stats_bytes_written(pTcon, total_written);
1737 mark_inode_dirty_sync(inode);
1738
1739 for (i = 0; i < num_pages; i++)
1740 put_page(pages[i]);
1741 kfree(to_send);
1742 kfree(pages);
1743 FreeXid(xid);
1744 return total_written;
1745}
1746
1747static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
1748 unsigned long nr_segs, loff_t pos)
1749{
1750 ssize_t written;
1751 struct inode *inode;
1752
1753 inode = iocb->ki_filp->f_path.dentry->d_inode;
1754
1755 /*
1756 * BB - optimize the way when signing is disabled. We can drop this
1757 * extra memory-to-memory copying and use iovec buffers for constructing
1758 * write request.
1759 */
1760
1761 written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
1762 if (written > 0) {
1763 CIFS_I(inode)->invalid_mapping = true;
1764 iocb->ki_pos = pos;
1765 }
1766
1767 return written;
1768}
1769
1770ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
1771 unsigned long nr_segs, loff_t pos)
1772{
1773 struct inode *inode;
1774
1775 inode = iocb->ki_filp->f_path.dentry->d_inode;
1776
1777 if (CIFS_I(inode)->clientCanCacheAll)
1778 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1779
1780 /*
1781 * In strict cache mode we need to write the data to the server exactly
1782 * from the pos to pos+len-1 rather than flush all affected pages
1783 * because it may cause a error with mandatory locks on these pages but
1784 * not on the region from pos to ppos+len-1.
1785 */
1786
1787 return cifs_user_writev(iocb, iov, nr_segs, pos);
1788}
1789
1622static ssize_t 1790static ssize_t
1623cifs_iovec_read(struct file *file, const struct iovec *iov, 1791cifs_iovec_read(struct file *file, const struct iovec *iov,
1624 unsigned long nr_segs, loff_t *poffset) 1792 unsigned long nr_segs, loff_t *poffset)
1625{ 1793{
1626 int rc; 1794 int rc;
1627 int xid; 1795 int xid;
1628 unsigned int total_read, bytes_read = 0; 1796 ssize_t total_read;
1797 unsigned int bytes_read = 0;
1629 size_t len, cur_len; 1798 size_t len, cur_len;
1630 int iov_offset = 0; 1799 int iov_offset = 0;
1631 struct cifs_sb_info *cifs_sb; 1800 struct cifs_sb_info *cifs_sb;
@@ -2313,7 +2482,6 @@ const struct address_space_operations cifs_addr_ops = {
2313 .set_page_dirty = __set_page_dirty_nobuffers, 2482 .set_page_dirty = __set_page_dirty_nobuffers,
2314 .releasepage = cifs_release_page, 2483 .releasepage = cifs_release_page,
2315 .invalidatepage = cifs_invalidate_page, 2484 .invalidatepage = cifs_invalidate_page,
2316 /* .sync_page = cifs_sync_page, */
2317 /* .direct_IO = */ 2485 /* .direct_IO = */
2318}; 2486};
2319 2487
@@ -2331,6 +2499,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2331 .set_page_dirty = __set_page_dirty_nobuffers, 2499 .set_page_dirty = __set_page_dirty_nobuffers,
2332 .releasepage = cifs_release_page, 2500 .releasepage = cifs_release_page,
2333 .invalidatepage = cifs_invalidate_page, 2501 .invalidatepage = cifs_invalidate_page,
2334 /* .sync_page = cifs_sync_page, */
2335 /* .direct_IO = */ 2502 /* .direct_IO = */
2336}; 2503};
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 306769de2fb5..e8804d373404 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "md5.h"
32 31
33#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
34#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -47,6 +46,45 @@
47 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] 46 md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
48 47
49static int 48static int
49symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
50{
51 int rc;
52 unsigned int size;
53 struct crypto_shash *md5;
54 struct sdesc *sdescmd5;
55
56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
60 return rc;
61 }
62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
63 sdescmd5 = kmalloc(size, GFP_KERNEL);
64 if (!sdescmd5) {
65 rc = -ENOMEM;
66 cERROR(1, "%s: Memory allocation failure\n", __func__);
67 goto symlink_hash_err;
68 }
69 sdescmd5->shash.tfm = md5;
70 sdescmd5->shash.flags = 0x0;
71
72 rc = crypto_shash_init(&sdescmd5->shash);
73 if (rc) {
74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err;
76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
79
80symlink_hash_err:
81 crypto_free_shash(md5);
82 kfree(sdescmd5);
83
84 return rc;
85}
86
87static int
50CIFSParseMFSymlink(const u8 *buf, 88CIFSParseMFSymlink(const u8 *buf,
51 unsigned int buf_len, 89 unsigned int buf_len,
52 unsigned int *_link_len, 90 unsigned int *_link_len,
@@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf,
56 unsigned int link_len; 94 unsigned int link_len;
57 const char *md5_str1; 95 const char *md5_str1;
58 const char *link_str; 96 const char *link_str;
59 struct MD5Context md5_ctx;
60 u8 md5_hash[16]; 97 u8 md5_hash[16];
61 char md5_str2[34]; 98 char md5_str2[34];
62 99
@@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf,
70 if (rc != 1) 107 if (rc != 1)
71 return -EINVAL; 108 return -EINVAL;
72 109
73 cifs_MD5_init(&md5_ctx); 110 rc = symlink_hash(link_len, link_str, md5_hash);
74 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 111 if (rc) {
75 cifs_MD5_final(md5_hash, &md5_ctx); 112 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
113 return rc;
114 }
76 115
77 snprintf(md5_str2, sizeof(md5_str2), 116 snprintf(md5_str2, sizeof(md5_str2),
78 CIFS_MF_SYMLINK_MD5_FORMAT, 117 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf,
94static int 133static int
95CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 134CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
96{ 135{
136 int rc;
97 unsigned int link_len; 137 unsigned int link_len;
98 unsigned int ofs; 138 unsigned int ofs;
99 struct MD5Context md5_ctx;
100 u8 md5_hash[16]; 139 u8 md5_hash[16];
101 140
102 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE) 141 if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
107 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) 146 if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
108 return -ENAMETOOLONG; 147 return -ENAMETOOLONG;
109 148
110 cifs_MD5_init(&md5_ctx); 149 rc = symlink_hash(link_len, link_str, md5_hash);
111 cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len); 150 if (rc) {
112 cifs_MD5_final(md5_hash, &md5_ctx); 151 cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
152 return rc;
153 }
113 154
114 snprintf(buf, buf_len, 155 snprintf(buf, buf_len,
115 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, 156 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d67..000000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 a implementation of MD4 designed for use in the SMB authentication protocol
5 Copyright (C) Andrew Tridgell 1997-1998.
6 Modified by Steve French (sfrench@us.ibm.com) 2002-2003
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*/
22#include <linux/module.h>
23#include <linux/fs.h>
24#include "cifsencrypt.h"
25
26/* NOTE: This code makes no attempt to be fast! */
27
28static __u32
29F(__u32 X, __u32 Y, __u32 Z)
30{
31 return (X & Y) | ((~X) & Z);
32}
33
34static __u32
35G(__u32 X, __u32 Y, __u32 Z)
36{
37 return (X & Y) | (X & Z) | (Y & Z);
38}
39
40static __u32
41H(__u32 X, __u32 Y, __u32 Z)
42{
43 return X ^ Y ^ Z;
44}
45
46static __u32
47lshift(__u32 x, int s)
48{
49 x &= 0xFFFFFFFF;
50 return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
51}
52
53#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
54#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
55#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
56
57/* this applies md4 to 64 byte chunks */
58static void
59mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
60{
61 int j;
62 __u32 AA, BB, CC, DD;
63 __u32 X[16];
64
65
66 for (j = 0; j < 16; j++)
67 X[j] = M[j];
68
69 AA = *A;
70 BB = *B;
71 CC = *C;
72 DD = *D;
73
74 ROUND1(A, B, C, D, 0, 3);
75 ROUND1(D, A, B, C, 1, 7);
76 ROUND1(C, D, A, B, 2, 11);
77 ROUND1(B, C, D, A, 3, 19);
78 ROUND1(A, B, C, D, 4, 3);
79 ROUND1(D, A, B, C, 5, 7);
80 ROUND1(C, D, A, B, 6, 11);
81 ROUND1(B, C, D, A, 7, 19);
82 ROUND1(A, B, C, D, 8, 3);
83 ROUND1(D, A, B, C, 9, 7);
84 ROUND1(C, D, A, B, 10, 11);
85 ROUND1(B, C, D, A, 11, 19);
86 ROUND1(A, B, C, D, 12, 3);
87 ROUND1(D, A, B, C, 13, 7);
88 ROUND1(C, D, A, B, 14, 11);
89 ROUND1(B, C, D, A, 15, 19);
90
91 ROUND2(A, B, C, D, 0, 3);
92 ROUND2(D, A, B, C, 4, 5);
93 ROUND2(C, D, A, B, 8, 9);
94 ROUND2(B, C, D, A, 12, 13);
95 ROUND2(A, B, C, D, 1, 3);
96 ROUND2(D, A, B, C, 5, 5);
97 ROUND2(C, D, A, B, 9, 9);
98 ROUND2(B, C, D, A, 13, 13);
99 ROUND2(A, B, C, D, 2, 3);
100 ROUND2(D, A, B, C, 6, 5);
101 ROUND2(C, D, A, B, 10, 9);
102 ROUND2(B, C, D, A, 14, 13);
103 ROUND2(A, B, C, D, 3, 3);
104 ROUND2(D, A, B, C, 7, 5);
105 ROUND2(C, D, A, B, 11, 9);
106 ROUND2(B, C, D, A, 15, 13);
107
108 ROUND3(A, B, C, D, 0, 3);
109 ROUND3(D, A, B, C, 8, 9);
110 ROUND3(C, D, A, B, 4, 11);
111 ROUND3(B, C, D, A, 12, 15);
112 ROUND3(A, B, C, D, 2, 3);
113 ROUND3(D, A, B, C, 10, 9);
114 ROUND3(C, D, A, B, 6, 11);
115 ROUND3(B, C, D, A, 14, 15);
116 ROUND3(A, B, C, D, 1, 3);
117 ROUND3(D, A, B, C, 9, 9);
118 ROUND3(C, D, A, B, 5, 11);
119 ROUND3(B, C, D, A, 13, 15);
120 ROUND3(A, B, C, D, 3, 3);
121 ROUND3(D, A, B, C, 11, 9);
122 ROUND3(C, D, A, B, 7, 11);
123 ROUND3(B, C, D, A, 15, 15);
124
125 *A += AA;
126 *B += BB;
127 *C += CC;
128 *D += DD;
129
130 *A &= 0xFFFFFFFF;
131 *B &= 0xFFFFFFFF;
132 *C &= 0xFFFFFFFF;
133 *D &= 0xFFFFFFFF;
134
135 for (j = 0; j < 16; j++)
136 X[j] = 0;
137}
138
139static void
140copy64(__u32 *M, unsigned char *in)
141{
142 int i;
143
144 for (i = 0; i < 16; i++)
145 M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
146 (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
147}
148
149static void
150copy4(unsigned char *out, __u32 x)
151{
152 out[0] = x & 0xFF;
153 out[1] = (x >> 8) & 0xFF;
154 out[2] = (x >> 16) & 0xFF;
155 out[3] = (x >> 24) & 0xFF;
156}
157
158/* produce a md4 message digest from data of length n bytes */
159void
160mdfour(unsigned char *out, unsigned char *in, int n)
161{
162 unsigned char buf[128];
163 __u32 M[16];
164 __u32 b = n * 8;
165 int i;
166 __u32 A = 0x67452301;
167 __u32 B = 0xefcdab89;
168 __u32 C = 0x98badcfe;
169 __u32 D = 0x10325476;
170
171 while (n > 64) {
172 copy64(M, in);
173 mdfour64(M, &A, &B, &C, &D);
174 in += 64;
175 n -= 64;
176 }
177
178 for (i = 0; i < 128; i++)
179 buf[i] = 0;
180 memcpy(buf, in, n);
181 buf[n] = 0x80;
182
183 if (n <= 55) {
184 copy4(buf + 56, b);
185 copy64(M, buf);
186 mdfour64(M, &A, &B, &C, &D);
187 } else {
188 copy4(buf + 120, b);
189 copy64(M, buf);
190 mdfour64(M, &A, &B, &C, &D);
191 copy64(M, buf + 64);
192 mdfour64(M, &A, &B, &C, &D);
193 }
194
195 for (i = 0; i < 128; i++)
196 buf[i] = 0;
197 copy64(M, buf);
198
199 copy4(out, A);
200 copy4(out + 4, B);
201 copy4(out + 8, C);
202 copy4(out + 12, D);
203
204 A = B = C = D = 0;
205}
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c319..000000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
1/*
2 * This code implements the MD5 message-digest algorithm.
3 * The algorithm is due to Ron Rivest. This code was
4 * written by Colin Plumb in 1993, no copyright is claimed.
5 * This code is in the public domain; do with it what you wish.
6 *
7 * Equivalent code is available from RSA Data Security, Inc.
8 * This code has been tested against that, and is equivalent,
9 * except that you don't need to include two pages of legalese
10 * with every copy.
11 *
12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest.
16 */
17
18/* This code slightly modified to fit into Samba by
19 abartlet@samba.org Jun 2001
20 and to fit the cifs vfs by
21 Steve French sfrench@us.ibm.com */
22
23#include <linux/string.h>
24#include "md5.h"
25
26static void MD5Transform(__u32 buf[4], __u32 const in[16]);
27
28/*
29 * Note: this code is harmless on little-endian machines.
30 */
31static void
32byteReverse(unsigned char *buf, unsigned longs)
33{
34 __u32 t;
35 do {
36 t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
37 ((unsigned) buf[1] << 8 | buf[0]);
38 *(__u32 *) buf = t;
39 buf += 4;
40 } while (--longs);
41}
42
43/*
44 * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
45 * initialization constants.
46 */
47void
48cifs_MD5_init(struct MD5Context *ctx)
49{
50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89;
52 ctx->buf[2] = 0x98badcfe;
53 ctx->buf[3] = 0x10325476;
54
55 ctx->bits[0] = 0;
56 ctx->bits[1] = 0;
57}
58
59/*
60 * Update context to reflect the concatenation of another buffer full
61 * of bytes.
62 */
63void
64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{
66 register __u32 t;
67
68 /* Update bitcount */
69
70 t = ctx->bits[0];
71 if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
72 ctx->bits[1]++; /* Carry from low to high */
73 ctx->bits[1] += len >> 29;
74
75 t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */
76
77 /* Handle any leading odd-sized chunks */
78
79 if (t) {
80 unsigned char *p = (unsigned char *) ctx->in + t;
81
82 t = 64 - t;
83 if (len < t) {
84 memmove(p, buf, len);
85 return;
86 }
87 memmove(p, buf, t);
88 byteReverse(ctx->in, 16);
89 MD5Transform(ctx->buf, (__u32 *) ctx->in);
90 buf += t;
91 len -= t;
92 }
93 /* Process data in 64-byte chunks */
94
95 while (len >= 64) {
96 memmove(ctx->in, buf, 64);
97 byteReverse(ctx->in, 16);
98 MD5Transform(ctx->buf, (__u32 *) ctx->in);
99 buf += 64;
100 len -= 64;
101 }
102
103 /* Handle any remaining bytes of data. */
104
105 memmove(ctx->in, buf, len);
106}
107
108/*
109 * Final wrapup - pad to 64-byte boundary with the bit pattern
110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */
112void
113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{
115 unsigned int count;
116 unsigned char *p;
117
118 /* Compute number of bytes mod 64 */
119 count = (ctx->bits[0] >> 3) & 0x3F;
120
121 /* Set the first char of padding to 0x80. This is safe since there is
122 always at least one byte free */
123 p = ctx->in + count;
124 *p++ = 0x80;
125
126 /* Bytes of padding needed to make 64 bytes */
127 count = 64 - 1 - count;
128
129 /* Pad out to 56 mod 64 */
130 if (count < 8) {
131 /* Two lots of padding: Pad the first block to 64 bytes */
132 memset(p, 0, count);
133 byteReverse(ctx->in, 16);
134 MD5Transform(ctx->buf, (__u32 *) ctx->in);
135
136 /* Now fill the next block with 56 bytes */
137 memset(ctx->in, 0, 56);
138 } else {
139 /* Pad block to 56 bytes */
140 memset(p, 0, count - 8);
141 }
142 byteReverse(ctx->in, 14);
143
144 /* Append length in bits and transform */
145 ((__u32 *) ctx->in)[14] = ctx->bits[0];
146 ((__u32 *) ctx->in)[15] = ctx->bits[1];
147
148 MD5Transform(ctx->buf, (__u32 *) ctx->in);
149 byteReverse((unsigned char *) ctx->buf, 4);
150 memmove(digest, ctx->buf, 16);
151 memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
152}
153
154/* The four core functions - F1 is optimized somewhat */
155
156/* #define F1(x, y, z) (x & y | ~x & z) */
157#define F1(x, y, z) (z ^ (x & (y ^ z)))
158#define F2(x, y, z) F1(z, x, y)
159#define F3(x, y, z) (x ^ y ^ z)
160#define F4(x, y, z) (y ^ (x | ~z))
161
162/* This is the central step in the MD5 algorithm. */
163#define MD5STEP(f, w, x, y, z, data, s) \
164 (w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x)
165
166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine.
170 */
171static void
172MD5Transform(__u32 buf[4], __u32 const in[16])
173{
174 register __u32 a, b, c, d;
175
176 a = buf[0];
177 b = buf[1];
178 c = buf[2];
179 d = buf[3];
180
181 MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
182 MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
183 MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
184 MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
185 MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
186 MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
187 MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
188 MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
189 MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
190 MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
191 MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
192 MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
193 MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
194 MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
195 MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
196 MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
197
198 MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
199 MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
200 MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
201 MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
202 MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
203 MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
204 MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
205 MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
206 MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
207 MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
208 MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
209 MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
210 MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
211 MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
212 MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
213 MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
214
215 MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
216 MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
217 MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
218 MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
219 MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
220 MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
221 MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
222 MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
223 MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
224 MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
225 MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
226 MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
227 MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
228 MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
229 MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
230 MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
231
232 MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
233 MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
234 MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
235 MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
236 MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
237 MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
238 MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
239 MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
240 MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
241 MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
242 MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
243 MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
244 MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
245 MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
246 MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
247 MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
248
249 buf[0] += a;
250 buf[1] += b;
251 buf[2] += c;
252 buf[3] += d;
253}
254
255#if 0 /* currently unused */
256/***********************************************************************
257 the rfc 2104 version of hmac_md5 initialisation.
258***********************************************************************/
259static void
260hmac_md5_init_rfc2104(unsigned char *key, int key_len,
261 struct HMACMD5Context *ctx)
262{
263 int i;
264
265 /* if key is longer than 64 bytes reset it to key=MD5(key) */
266 if (key_len > 64) {
267 unsigned char tk[16];
268 struct MD5Context tctx;
269
270 cifs_MD5_init(&tctx);
271 cifs_MD5_update(&tctx, key, key_len);
272 cifs_MD5_final(tk, &tctx);
273
274 key = tk;
275 key_len = 16;
276 }
277
278 /* start out by storing key in pads */
279 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
280 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
281 memcpy(ctx->k_ipad, key, key_len);
282 memcpy(ctx->k_opad, key, key_len);
283
284 /* XOR key with ipad and opad values */
285 for (i = 0; i < 64; i++) {
286 ctx->k_ipad[i] ^= 0x36;
287 ctx->k_opad[i] ^= 0x5c;
288 }
289
290 cifs_MD5_init(&ctx->ctx);
291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292}
293#endif
294
295/***********************************************************************
296 the microsoft version of hmac_md5 initialisation.
297***********************************************************************/
298void
299hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
300 struct HMACMD5Context *ctx)
301{
302 int i;
303
304 /* if key is longer than 64 bytes truncate it */
305 if (key_len > 64)
306 key_len = 64;
307
308 /* start out by storing key in pads */
309 memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
310 memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
311 memcpy(ctx->k_ipad, key, key_len);
312 memcpy(ctx->k_opad, key, key_len);
313
314 /* XOR key with ipad and opad values */
315 for (i = 0; i < 64; i++) {
316 ctx->k_ipad[i] ^= 0x36;
317 ctx->k_opad[i] ^= 0x5c;
318 }
319
320 cifs_MD5_init(&ctx->ctx);
321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322}
323
324/***********************************************************************
325 update hmac_md5 "inner" buffer
326***********************************************************************/
327void
328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx)
330{
331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332}
333
334/***********************************************************************
335 finish off hmac_md5 "inner" buffer and generate outer one.
336***********************************************************************/
337void
338hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{
340 struct MD5Context ctx_o;
341
342 cifs_MD5_final(digest, &ctx->ctx);
343
344 cifs_MD5_init(&ctx_o);
345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 cifs_MD5_update(&ctx_o, digest, 16);
347 cifs_MD5_final(digest, &ctx_o);
348}
349
350/***********************************************************
351 single function to calculate an HMAC MD5 digest from data.
352 use the microsoft hmacmd5 init method because the key is 16 bytes.
353************************************************************/
354#if 0 /* currently unused */
355static void
356hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
357 unsigned char *digest)
358{
359 struct HMACMD5Context ctx;
360 hmac_md5_init_limK_to_64(key, 16, &ctx);
361 if (data_len != 0)
362 hmac_md5_update(data, data_len, &ctx);
363
364 hmac_md5_final(digest, &ctx);
365}
366#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402fd..000000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
1#ifndef MD5_H
2#define MD5_H
3#ifndef HEADER_MD5_H
4/* Try to avoid clashes with OpenSSL */
5#define HEADER_MD5_H
6#endif
7
8struct MD5Context {
9 __u32 buf[4];
10 __u32 bits[2];
11 unsigned char in[64];
12};
13#endif /* !MD5_H */
14
15#ifndef _HMAC_MD5_H
16struct HMACMD5Context {
17 struct MD5Context ctx;
18 unsigned char k_ipad[65];
19 unsigned char k_opad[65];
20};
21#endif /* _HMAC_MD5_H */
22
23void cifs_MD5_init(struct MD5Context *context);
24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len);
26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27
28/* The following definitions come from lib/hmacmd5.c */
29
30/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
31 struct HMACMD5Context *ctx);*/
32void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
33 struct HMACMD5Context *ctx);
34void hmac_md5_update(const unsigned char *text, int text_len,
35 struct HMACMD5Context *ctx);
36void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
37/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
38 unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba925..2a930a752a78 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 236{
237 __u16 mid = 0; 237 __u16 mid = 0;
238 __u16 last_mid; 238 __u16 last_mid;
239 int collision; 239 bool collision;
240
241 if (server == NULL)
242 return mid;
243 240
244 spin_lock(&GlobalMid_Lock); 241 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 242 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 249 (and it would also have to have been a request that
253 did not time out) */ 250 did not time out) */
254 while (server->CurrentMid != last_mid) { 251 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 252 struct mid_q_entry *mid_entry;
253 unsigned int num_mids;
257 254
258 collision = 0; 255 collision = false;
259 if (server->CurrentMid == 0) 256 if (server->CurrentMid == 0)
260 server->CurrentMid++; 257 server->CurrentMid++;
261 258
262 list_for_each(tmp, &server->pending_mid_q) { 259 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 260 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 261 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 262 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 263 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 264 /* This mid is in use, try a different one */
268 collision = 1; 265 collision = true;
269 break; 266 break;
270 } 267 }
271 } 268 }
272 if (collision == 0) { 269
270 /*
271 * if we have more than 32k mids in the list, then something
272 * is very wrong. Possibly a local user is trying to DoS the
273 * box by issuing long-running calls and SIGKILL'ing them. If
274 * we get to 2^16 mids then we're in big trouble as this
275 * function could loop forever.
276 *
277 * Go ahead and assign out the mid in this situation, but force
278 * an eventual reconnect to clean out the pending_mid_q.
279 */
280 if (num_mids > 32768)
281 server->tcpStatus = CifsNeedReconnect;
282
283 if (!collision) {
273 mid = server->CurrentMid; 284 mid = server->CurrentMid;
274 break; 285 break;
275 } 286 }
@@ -381,29 +392,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 392}
382 393
383static int 394static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 395check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 396{
386 /* Make sure that this really is an SMB, that it is a response, 397 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 398 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 399 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 400 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 401 return 1;
391 return 0; 402 }
392 else { 403
393 /* only one valid case where server sends us request */ 404 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 405 if (mid != smb->Mid) {
395 return 0; 406 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 407 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 408 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 409 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 410
411 /* if it's a response then accept */
412 if (smb->Flags & SMBFLG_RESPONSE)
413 return 0;
414
415 /* only one valid case where server sends us request */
416 if (smb->Command == SMB_COM_LOCKING_ANDX)
417 return 0;
418
419 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 420 return 1;
408} 421}
409 422
@@ -448,7 +461,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 461 return 1;
449 } 462 }
450 463
451 if (checkSMBhdr(smb, mid)) 464 if (check_smb_hdr(smb, mid))
452 return 1; 465 return 1;
453 clc_len = smbCalcSize_LE(smb); 466 clc_len = smbCalcSize_LE(smb);
454 467
@@ -465,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 478 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 479 return 0; /* bcc wrapped */
467 } 480 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 481 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 482 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 483
471 an illegal pad, at the end of byte range lock responses 484 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 485 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forget to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 486 len, smb->Mid);
486 return 1; 487 return 1;
488 } else if (len > clc_len + 512) {
489 /*
490 * Some servers (Windows XP in particular) send more
491 * data than the lengths in the SMB packet would
492 * indicate on certain calls (byte range locks and
493 * trans2 find first calls in particular). While the
494 * client can handle such a frame by ignoring the
495 * trailing data, we choose limit the amount of extra
496 * data to 512 bytes.
497 */
498 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
499 "than SMB for mid=%u", len, smb->Mid);
500 return 1;
487 } 501 }
488 } 502 }
489 return 0; 503 return 0;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
170{ 170{
171 int rc, alen, slen; 171 int rc, alen, slen;
172 const char *pct; 172 const char *pct;
173 char *endp, scope_id[13]; 173 char scope_id[13];
174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
176 176
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
197 memcpy(scope_id, pct + 1, slen); 197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0'; 198 scope_id[slen] = '\0';
199 199
200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 rc = strict_strtoul(scope_id, 0,
201 if (endp != scope_id + slen) 201 (unsigned long *)&s6->sin6_scope_id);
202 return 0; 202 rc = (rc == 0) ? 1 : 0;
203 } 203 }
204 204
205 return rc; 205 return rc;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifs_sb_info *cifs_sb;
768 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
769 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
770 char *current_entry; 769 char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
775 774
776 xid = GetXid(); 775 xid = GetXid();
777 776
778 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
779
780 /* 777 /*
781 * Ensure FindFirst doesn't fail before doing filldir() for '.' and 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
782 * '..'. Otherwise we won't be able to notify VFS in case of failure. 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..16765703131b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate:
656 656
657 if (type == LANMAN) { 657 if (type == LANMAN) {
658#ifdef CONFIG_CIFS_WEAK_PW_HASH 658#ifdef CONFIG_CIFS_WEAK_PW_HASH
659 char lnm_session_key[CIFS_SESS_KEY_SIZE]; 659 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
660 660
661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
662 662
663 /* no capabilities flags in old lanman negotiation */ 663 /* no capabilities flags in old lanman negotiation */
664 664
665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
666 666
667 /* Calculate hash with password and copy into bcc_ptr. 667 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored as in cryptkey) gets used if the 668 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate:
675 true : false, lnm_session_key); 675 true : false, lnm_session_key);
676 676
677 ses->flags |= CIFS_SES_LANMAN; 677 ses->flags |= CIFS_SES_LANMAN;
678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); 678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
679 bcc_ptr += CIFS_SESS_KEY_SIZE; 679 bcc_ptr += CIFS_AUTH_RESP_SIZE;
680 680
681 /* can not sign if LANMAN negotiated so no need 681 /* can not sign if LANMAN negotiated so no need
682 to calculate signing key? but what if server 682 to calculate signing key? but what if server
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500bf..04721485925d 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
45 up with a different answer to the one above) 45 up with a different answer to the one above)
46*/ 46*/
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include "cifsencrypt.h"
49#define uchar unsigned char 48#define uchar unsigned char
50 49
51static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, 50static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20f..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,9 +32,8 @@
32#include "cifs_unicode.h" 32#include "cifs_unicode.h"
33#include "cifspdu.h" 33#include "cifspdu.h"
34#include "cifsglob.h" 34#include "cifsglob.h"
35#include "md5.h"
36#include "cifs_debug.h" 35#include "cifs_debug.h"
37#include "cifsencrypt.h" 36#include "cifsproto.h"
38 37
39#ifndef false 38#ifndef false
40#define false 0 39#define false 0
@@ -48,14 +47,58 @@
48#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) 47#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
49#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) 48#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
50 49
51/*The following definitions come from libsmb/smbencrypt.c */ 50/* produce a md4 message digest from data of length n bytes */
51int
52mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
53{
54 int rc;
55 unsigned int size;
56 struct crypto_shash *md4;
57 struct sdesc *sdescmd4;
58
59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
63 return rc;
64 }
65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
66 sdescmd4 = kmalloc(size, GFP_KERNEL);
67 if (!sdescmd4) {
68 rc = -ENOMEM;
69 cERROR(1, "%s: Memory allocation failure\n", __func__);
70 goto mdfour_err;
71 }
72 sdescmd4->shash.tfm = md4;
73 sdescmd4->shash.flags = 0x0;
74
75 rc = crypto_shash_init(&sdescmd4->shash);
76 if (rc) {
77 cERROR(1, "%s: Could not init md4 shash\n", __func__);
78 goto mdfour_err;
79 }
80 crypto_shash_update(&sdescmd4->shash, link_str, link_len);
81 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
52 82
53void SMBencrypt(unsigned char *passwd, const unsigned char *c8, 83mdfour_err:
54 unsigned char *p24); 84 crypto_free_shash(md4);
55void E_md4hash(const unsigned char *passwd, unsigned char *p16); 85 kfree(sdescmd4);
56static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, 86
57 unsigned char p24[24]); 87 return rc;
58void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 88}
89
90/* Does the des encryption from the NT or LM MD4 hash. */
91static void
92SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
93 unsigned char p24[24])
94{
95 unsigned char p21[21];
96
97 memset(p21, '\0', 21);
98
99 memcpy(p21, passwd, 16);
100 E_P24(p21, c8, p24);
101}
59 102
60/* 103/*
61 This implements the X/Open SMB password encryption 104 This implements the X/Open SMB password encryption
@@ -118,9 +161,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
118 * Creates the MD4 Hash of the users password in NT UNICODE. 161 * Creates the MD4 Hash of the users password in NT UNICODE.
119 */ 162 */
120 163
121void 164int
122E_md4hash(const unsigned char *passwd, unsigned char *p16) 165E_md4hash(const unsigned char *passwd, unsigned char *p16)
123{ 166{
167 int rc;
124 int len; 168 int len;
125 __u16 wpwd[129]; 169 __u16 wpwd[129];
126 170
@@ -139,8 +183,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
139 /* Calculate length in bytes */ 183 /* Calculate length in bytes */
140 len = _my_wcslen(wpwd) * sizeof(__u16); 184 len = _my_wcslen(wpwd) * sizeof(__u16);
141 185
142 mdfour(p16, (unsigned char *) wpwd, len); 186 rc = mdfour(p16, (unsigned char *) wpwd, len);
143 memset(wpwd, 0, 129 * 2); 187 memset(wpwd, 0, 129 * 2);
188
189 return rc;
144} 190}
145 191
146#if 0 /* currently unused */ 192#if 0 /* currently unused */
@@ -212,19 +258,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212} 258}
213#endif 259#endif
214 260
215/* Does the des encryption from the NT or LM MD4 hash. */
216static void
217SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
218 unsigned char p24[24])
219{
220 unsigned char p21[21];
221
222 memset(p21, '\0', 21);
223
224 memcpy(p21, passwd, 16);
225 E_P24(p21, c8, p24);
226}
227
228/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ 261/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
229#if 0 /* currently unused */ 262#if 0 /* currently unused */
230static void 263static void
@@ -242,16 +275,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
242#endif 275#endif
243 276
244/* Does the NT MD4 hash then des encryption. */ 277/* Does the NT MD4 hash then des encryption. */
245 278int
246void
247SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 279SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
248{ 280{
281 int rc;
249 unsigned char p21[21]; 282 unsigned char p21[21];
250 283
251 memset(p21, '\0', 21); 284 memset(p21, '\0', 21);
252 285
253 E_md4hash(passwd, p21); 286 rc = E_md4hash(passwd, p21);
287 if (rc) {
288 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
289 return rc;
290 }
254 SMBOWFencrypt(p21, c8, p24); 291 SMBOWFencrypt(p21, c8, p24);
292 return rc;
255} 293}
256 294
257 295
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933f..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
236 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
237 } 237 }
238 238
239 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
240 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
241 } else 241 else
242 rc = 0; 242 rc = 0;
243 243
244 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
359 if (rc) 359 if (rc)
360 return rc; 360 return rc;
361 361
362 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365
362 mutex_lock(&server->srv_mutex); 366 mutex_lock(&server->srv_mutex);
363 mid = AllocMidQEntry(in_buf, server); 367 mid = AllocMidQEntry(in_buf, server);
364 if (mid == NULL) { 368 if (mid == NULL) {
@@ -453,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
453 case MID_RETRY_NEEDED: 457 case MID_RETRY_NEEDED:
454 rc = -EAGAIN; 458 rc = -EAGAIN;
455 break; 459 break;
460 case MID_RESPONSE_MALFORMED:
461 rc = -EIO;
462 break;
456 default: 463 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, 464 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState); 465 mid->mid, mid->midState);
@@ -570,17 +577,33 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
570#endif 577#endif
571 578
572 mutex_unlock(&ses->server->srv_mutex); 579 mutex_unlock(&ses->server->srv_mutex);
573 cifs_small_buf_release(in_buf);
574 580
575 if (rc < 0) 581 if (rc < 0) {
582 cifs_small_buf_release(in_buf);
576 goto out; 583 goto out;
584 }
577 585
578 if (long_op == CIFS_ASYNC_OP) 586 if (long_op == CIFS_ASYNC_OP) {
587 cifs_small_buf_release(in_buf);
579 goto out; 588 goto out;
589 }
580 590
581 rc = wait_for_response(ses->server, midQ); 591 rc = wait_for_response(ses->server, midQ);
582 if (rc != 0) 592 if (rc != 0) {
583 goto out; 593 send_nt_cancel(ses->server, in_buf, midQ);
594 spin_lock(&GlobalMid_Lock);
595 if (midQ->midState == MID_REQUEST_SUBMITTED) {
596 midQ->callback = DeleteMidQEntry;
597 spin_unlock(&GlobalMid_Lock);
598 cifs_small_buf_release(in_buf);
599 atomic_dec(&ses->server->inFlight);
600 wake_up(&ses->server->request_q);
601 return rc;
602 }
603 spin_unlock(&GlobalMid_Lock);
604 }
605
606 cifs_small_buf_release(in_buf);
584 607
585 rc = sync_mid_result(midQ, ses->server); 608 rc = sync_mid_result(midQ, ses->server);
586 if (rc != 0) { 609 if (rc != 0) {
@@ -724,8 +747,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
724 goto out; 747 goto out;
725 748
726 rc = wait_for_response(ses->server, midQ); 749 rc = wait_for_response(ses->server, midQ);
727 if (rc != 0) 750 if (rc != 0) {
728 goto out; 751 send_nt_cancel(ses->server, in_buf, midQ);
752 spin_lock(&GlobalMid_Lock);
753 if (midQ->midState == MID_REQUEST_SUBMITTED) {
754 /* no longer considered to be "in-flight" */
755 midQ->callback = DeleteMidQEntry;
756 spin_unlock(&GlobalMid_Lock);
757 atomic_dec(&ses->server->inFlight);
758 wake_up(&ses->server->request_q);
759 return rc;
760 }
761 spin_unlock(&GlobalMid_Lock);
762 }
729 763
730 rc = sync_mid_result(midQ, ses->server); 764 rc = sync_mid_result(midQ, ses->server);
731 if (rc != 0) { 765 if (rc != 0) {
@@ -922,10 +956,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
922 } 956 }
923 } 957 }
924 958
925 if (wait_for_response(ses->server, midQ) == 0) { 959 rc = wait_for_response(ses->server, midQ);
926 /* We got the response - restart system call. */ 960 if (rc) {
927 rstart = 1; 961 send_nt_cancel(ses->server, in_buf, midQ);
962 spin_lock(&GlobalMid_Lock);
963 if (midQ->midState == MID_REQUEST_SUBMITTED) {
964 /* no longer considered to be "in-flight" */
965 midQ->callback = DeleteMidQEntry;
966 spin_unlock(&GlobalMid_Lock);
967 return rc;
968 }
969 spin_unlock(&GlobalMid_Lock);
928 } 970 }
971
972 /* We got the response - restart system call. */
973 rstart = 1;
929 } 974 }
930 975
931 rc = sync_mid_result(midQ, ses->server); 976 rc = sync_mid_result(midQ, ses->server);
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 6c22e61da397..1bab69a0d347 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -9,4 +9,4 @@ coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
9 9
10# If you want debugging output, please uncomment the following line. 10# If you want debugging output, please uncomment the following line.
11 11
12# EXTRA_CFLAGS += -DDEBUG -DDEBUG_SMB_MALLOC=1 12# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c6405ce3c50e..06d27a41807f 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -13,7 +13,6 @@
13 13
14#ifdef CONFIG_SYSCTL 14#ifdef CONFIG_SYSCTL
15static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
17 16
18static ctl_table coda_table[] = { 17static ctl_table coda_table[] = {
19 { 18 {
@@ -40,7 +39,6 @@ static ctl_table coda_table[] = {
40 {} 39 {}
41}; 40};
42 41
43#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 42static ctl_table fs_table[] = {
45 { 43 {
46 .procname = "coda", 44 .procname = "coda",
@@ -49,22 +47,18 @@ static ctl_table fs_table[] = {
49 }, 47 },
50 {} 48 {}
51}; 49};
52#endif
53 50
54void coda_sysctl_init(void) 51void coda_sysctl_init(void)
55{ 52{
56#ifdef CONFIG_SYSCTL
57 if ( !fs_table_header ) 53 if ( !fs_table_header )
58 fs_table_header = register_sysctl_table(fs_table); 54 fs_table_header = register_sysctl_table(fs_table);
59#endif
60} 55}
61 56
62void coda_sysctl_clean(void) 57void coda_sysctl_clean(void)
63{ 58{
64#ifdef CONFIG_SYSCTL
65 if ( fs_table_header ) { 59 if ( fs_table_header ) {
66 unregister_sysctl_table(fs_table_header); 60 unregister_sysctl_table(fs_table_header);
67 fs_table_header = NULL; 61 fs_table_header = NULL;
68 } 62 }
69#endif
70} 63}
64#endif
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..72fe6cda9108 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
264{ 264{
265 struct path path; 265 struct kstatfs tmp;
266 int error; 266 int error = user_statfs(pathname, &tmp);
267 267 if (!error)
268 error = user_path(pathname, &path); 268 error = put_compat_statfs(buf, &tmp);
269 if (!error) {
270 struct kstatfs tmp;
271 error = vfs_statfs(&path, &tmp);
272 if (!error)
273 error = put_compat_statfs(buf, &tmp);
274 path_put(&path);
275 }
276 return error; 269 return error;
277} 270}
278 271
279asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 272asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
280{ 273{
281 struct file * file;
282 struct kstatfs tmp; 274 struct kstatfs tmp;
283 int error; 275 int error = fd_statfs(fd, &tmp);
284
285 error = -EBADF;
286 file = fget(fd);
287 if (!file)
288 goto out;
289 error = vfs_statfs(&file->f_path, &tmp);
290 if (!error) 276 if (!error)
291 error = put_compat_statfs(buf, &tmp); 277 error = put_compat_statfs(buf, &tmp);
292 fput(file);
293out:
294 return error; 278 return error;
295} 279}
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
329 313
330asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 314asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
331{ 315{
332 struct path path; 316 struct kstatfs tmp;
333 int error; 317 int error;
334 318
335 if (sz != sizeof(*buf)) 319 if (sz != sizeof(*buf))
336 return -EINVAL; 320 return -EINVAL;
337 321
338 error = user_path(pathname, &path); 322 error = user_statfs(pathname, &tmp);
339 if (!error) { 323 if (!error)
340 struct kstatfs tmp; 324 error = put_compat_statfs64(buf, &tmp);
341 error = vfs_statfs(&path, &tmp);
342 if (!error)
343 error = put_compat_statfs64(buf, &tmp);
344 path_put(&path);
345 }
346 return error; 325 return error;
347} 326}
348 327
349asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 328asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
350{ 329{
351 struct file * file;
352 struct kstatfs tmp; 330 struct kstatfs tmp;
353 int error; 331 int error;
354 332
355 if (sz != sizeof(*buf)) 333 if (sz != sizeof(*buf))
356 return -EINVAL; 334 return -EINVAL;
357 335
358 error = -EBADF; 336 error = fd_statfs(fd, &tmp);
359 file = fget(fd);
360 if (!file)
361 goto out;
362 error = vfs_statfs(&file->f_path, &tmp);
363 if (!error) 337 if (!error)
364 error = put_compat_statfs64(buf, &tmp); 338 error = put_compat_statfs64(buf, &tmp);
365 fput(file);
366out:
367 return error; 339 return error;
368} 340}
369 341
@@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1228 file = fget_light(fd, &fput_needed); 1200 file = fget_light(fd, &fput_needed);
1229 if (!file) 1201 if (!file)
1230 return -EBADF; 1202 return -EBADF;
1231 ret = compat_readv(file, vec, vlen, &pos); 1203 ret = -ESPIPE;
1204 if (file->f_mode & FMODE_PREAD)
1205 ret = compat_readv(file, vec, vlen, &pos);
1232 fput_light(file, fput_needed); 1206 fput_light(file, fput_needed);
1233 return ret; 1207 return ret;
1234} 1208}
@@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1285 file = fget_light(fd, &fput_needed); 1259 file = fget_light(fd, &fput_needed);
1286 if (!file) 1260 if (!file)
1287 return -EBADF; 1261 return -EBADF;
1288 ret = compat_writev(file, vec, vlen, &pos); 1262 ret = -ESPIPE;
1263 if (file->f_mode & FMODE_PWRITE)
1264 ret = compat_writev(file, vec, vlen, &pos);
1289 fput_light(file, fput_needed); 1265 fput_light(file, fput_needed);
1290 return ret; 1266 return ret;
1291} 1267}
@@ -1695,9 +1671,6 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1695 * Update: ERESTARTSYS breaks at least the xview clock binary, so 1671 * Update: ERESTARTSYS breaks at least the xview clock binary, so
1696 * I'm trying ERESTARTNOHAND which restart only when you want to. 1672 * I'm trying ERESTARTNOHAND which restart only when you want to.
1697 */ 1673 */
1698#define MAX_SELECT_SECONDS \
1699 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
1700
1701int compat_core_sys_select(int n, compat_ulong_t __user *inp, 1674int compat_core_sys_select(int n, compat_ulong_t __user *inp,
1702 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1675 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1703 struct timespec *end_time) 1676 struct timespec *end_time)
@@ -2308,3 +2281,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
2308} 2281}
2309 2282
2310#endif /* CONFIG_TIMERFD */ 2283#endif /* CONFIG_TIMERFD */
2284
2285#ifdef CONFIG_FHANDLE
2286/*
2287 * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
2288 * doesn't set the O_LARGEFILE flag.
2289 */
2290asmlinkage long
2291compat_sys_open_by_handle_at(int mountdirfd,
2292 struct file_handle __user *handle, int flags)
2293{
2294 return do_handle_open(mountdirfd, handle, flags);
2295}
2296#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f493ee4dcba..ad25c4cec7d5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
176 176
177/** 177/**
178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups 178 * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
179 * @dentry: the target dentry
179 * After this call, in-progress rcu-walk path lookup will fail. This 180 * After this call, in-progress rcu-walk path lookup will fail. This
180 * should be called after unhashing, and after changing d_inode (if 181 * should be called after unhashing, and after changing d_inode (if
181 * the dentry has not already been unhashed). 182 * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
281/** 282/**
282 * d_kill - kill dentry and return parent 283 * d_kill - kill dentry and return parent
283 * @dentry: dentry to kill 284 * @dentry: dentry to kill
285 * @parent: parent dentry
284 * 286 *
285 * The dentry must already be unhashed and removed from the LRU. 287 * The dentry must already be unhashed and removed from the LRU.
286 * 288 *
@@ -294,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
294 __releases(parent->d_lock) 296 __releases(parent->d_lock)
295 __releases(dentry->d_inode->i_lock) 297 __releases(dentry->d_inode->i_lock)
296{ 298{
297 dentry->d_parent = NULL;
298 list_del(&dentry->d_u.d_child); 299 list_del(&dentry->d_u.d_child);
300 /*
301 * Inform try_to_ascend() that we are no longer attached to the
302 * dentry tree
303 */
304 dentry->d_flags |= DCACHE_DISCONNECTED;
299 if (parent) 305 if (parent)
300 spin_unlock(&parent->d_lock); 306 spin_unlock(&parent->d_lock);
301 dentry_iput(dentry); 307 dentry_iput(dentry);
@@ -1010,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1010} 1016}
1011 1017
1012/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1013 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1014 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1015 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
@@ -1064,24 +1099,10 @@ resume:
1064 * All done at this level ... ascend and resume the search. 1099 * All done at this level ... ascend and resume the search.
1065 */ 1100 */
1066 if (this_parent != parent) { 1101 if (this_parent != parent) {
1067 struct dentry *tmp; 1102 struct dentry *child = this_parent;
1068 struct dentry *child; 1103 this_parent = try_to_ascend(this_parent, locked, seq);
1069 1104 if (!this_parent)
1070 tmp = this_parent->d_parent;
1071 rcu_read_lock();
1072 spin_unlock(&this_parent->d_lock);
1073 child = this_parent;
1074 this_parent = tmp;
1075 spin_lock(&this_parent->d_lock);
1076 /* might go back up the wrong parent if we have had a rename
1077 * or deletion */
1078 if (this_parent != child->d_parent ||
1079 (!locked && read_seqretry(&rename_lock, seq))) {
1080 spin_unlock(&this_parent->d_lock);
1081 rcu_read_unlock();
1082 goto rename_retry; 1105 goto rename_retry;
1083 }
1084 rcu_read_unlock();
1085 next = child->d_u.d_child.next; 1106 next = child->d_u.d_child.next;
1086 goto resume; 1107 goto resume;
1087 } 1108 }
@@ -1179,24 +1200,10 @@ resume:
1179 * All done at this level ... ascend and resume the search. 1200 * All done at this level ... ascend and resume the search.
1180 */ 1201 */
1181 if (this_parent != parent) { 1202 if (this_parent != parent) {
1182 struct dentry *tmp; 1203 struct dentry *child = this_parent;
1183 struct dentry *child; 1204 this_parent = try_to_ascend(this_parent, locked, seq);
1184 1205 if (!this_parent)
1185 tmp = this_parent->d_parent;
1186 rcu_read_lock();
1187 spin_unlock(&this_parent->d_lock);
1188 child = this_parent;
1189 this_parent = tmp;
1190 spin_lock(&this_parent->d_lock);
1191 /* might go back up the wrong parent if we have had a rename
1192 * or deletion */
1193 if (this_parent != child->d_parent ||
1194 (!locked && read_seqretry(&rename_lock, seq))) {
1195 spin_unlock(&this_parent->d_lock);
1196 rcu_read_unlock();
1197 goto rename_retry; 1206 goto rename_retry;
1198 }
1199 rcu_read_unlock();
1200 next = child->d_u.d_child.next; 1207 next = child->d_u.d_child.next;
1201 goto resume; 1208 goto resume;
1202 } 1209 }
@@ -1521,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1521} 1528}
1522EXPORT_SYMBOL(d_alloc_root); 1529EXPORT_SYMBOL(d_alloc_root);
1523 1530
1531static struct dentry * __d_find_any_alias(struct inode *inode)
1532{
1533 struct dentry *alias;
1534
1535 if (list_empty(&inode->i_dentry))
1536 return NULL;
1537 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1538 __dget(alias);
1539 return alias;
1540}
1541
1542static struct dentry * d_find_any_alias(struct inode *inode)
1543{
1544 struct dentry *de;
1545
1546 spin_lock(&inode->i_lock);
1547 de = __d_find_any_alias(inode);
1548 spin_unlock(&inode->i_lock);
1549 return de;
1550}
1551
1552
1524/** 1553/**
1525 * d_obtain_alias - find or allocate a dentry for a given inode 1554 * d_obtain_alias - find or allocate a dentry for a given inode
1526 * @inode: inode to allocate the dentry for 1555 * @inode: inode to allocate the dentry for
@@ -1550,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1550 if (IS_ERR(inode)) 1579 if (IS_ERR(inode))
1551 return ERR_CAST(inode); 1580 return ERR_CAST(inode);
1552 1581
1553 res = d_find_alias(inode); 1582 res = d_find_any_alias(inode);
1554 if (res) 1583 if (res)
1555 goto out_iput; 1584 goto out_iput;
1556 1585
@@ -1563,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1563 1592
1564 1593
1565 spin_lock(&inode->i_lock); 1594 spin_lock(&inode->i_lock);
1566 res = __d_find_alias(inode, 0); 1595 res = __d_find_any_alias(inode);
1567 if (res) { 1596 if (res) {
1568 spin_unlock(&inode->i_lock); 1597 spin_unlock(&inode->i_lock);
1569 dput(tmp); 1598 dput(tmp);
@@ -1583,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
1583 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); 1612 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1584 spin_unlock(&tmp->d_lock); 1613 spin_unlock(&tmp->d_lock);
1585 spin_unlock(&inode->i_lock); 1614 spin_unlock(&inode->i_lock);
1615 security_d_instantiate(tmp, inode);
1586 1616
1587 return tmp; 1617 return tmp;
1588 1618
1589 out_iput: 1619 out_iput:
1620 if (res && !IS_ERR(res))
1621 security_d_instantiate(res, inode);
1590 iput(inode); 1622 iput(inode);
1591 return res; 1623 return res;
1592} 1624}
@@ -1779,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1779 * false-negative result. d_lookup() protects against concurrent 1811 * false-negative result. d_lookup() protects against concurrent
1780 * renames using rename_lock seqlock. 1812 * renames using rename_lock seqlock.
1781 * 1813 *
1782 * See Documentation/vfs/dcache-locking.txt for more details. 1814 * See Documentation/filesystems/path-lookup.txt for more details.
1783 */ 1815 */
1784 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { 1816 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1785 struct inode *i; 1817 struct inode *i;
@@ -1899,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1899 * false-negative result. d_lookup() protects against concurrent 1931 * false-negative result. d_lookup() protects against concurrent
1900 * renames using rename_lock seqlock. 1932 * renames using rename_lock seqlock.
1901 * 1933 *
1902 * See Documentation/vfs/dcache-locking.txt for more details. 1934 * See Documentation/filesystems/path-lookup.txt for more details.
1903 */ 1935 */
1904 rcu_read_lock(); 1936 rcu_read_lock();
1905 1937
@@ -1973,7 +2005,7 @@ out:
1973/** 2005/**
1974 * d_validate - verify dentry provided from insecure source (deprecated) 2006 * d_validate - verify dentry provided from insecure source (deprecated)
1975 * @dentry: The dentry alleged to be valid child of @dparent 2007 * @dentry: The dentry alleged to be valid child of @dparent
1976 * @parent: The parent dentry (known to be valid) 2008 * @dparent: The parent dentry (known to be valid)
1977 * 2009 *
1978 * An insecure source has sent us a dentry, here we verify it and dget() it. 2010 * An insecure source has sent us a dentry, here we verify it and dget() it.
1979 * This is used by ncpfs in its readdir implementation. 2011 * This is used by ncpfs in its readdir implementation.
@@ -2918,28 +2950,14 @@ resume:
2918 spin_unlock(&dentry->d_lock); 2950 spin_unlock(&dentry->d_lock);
2919 } 2951 }
2920 if (this_parent != root) { 2952 if (this_parent != root) {
2921 struct dentry *tmp; 2953 struct dentry *child = this_parent;
2922 struct dentry *child;
2923
2924 tmp = this_parent->d_parent;
2925 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2954 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2926 this_parent->d_flags |= DCACHE_GENOCIDE; 2955 this_parent->d_flags |= DCACHE_GENOCIDE;
2927 this_parent->d_count--; 2956 this_parent->d_count--;
2928 } 2957 }
2929 rcu_read_lock(); 2958 this_parent = try_to_ascend(this_parent, locked, seq);
2930 spin_unlock(&this_parent->d_lock); 2959 if (!this_parent)
2931 child = this_parent;
2932 this_parent = tmp;
2933 spin_lock(&this_parent->d_lock);
2934 /* might go back up the wrong parent if we have had a rename
2935 * or deletion */
2936 if (this_parent != child->d_parent ||
2937 (!locked && read_seqretry(&rename_lock, seq))) {
2938 spin_unlock(&this_parent->d_lock);
2939 rcu_read_unlock();
2940 goto rename_retry; 2960 goto rename_retry;
2941 }
2942 rcu_read_unlock();
2943 next = child->d_u.d_child.next; 2961 next = child->d_u.d_child.next;
2944 goto resume; 2962 goto resume;
2945 } 2963 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c1222..e7a7a2f07324 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
13 * 13 *
14 */ 14 */
15 15
16/* uncomment to get debug messages from the debug filesystem, ah the irony. */
17/* #define DEBUG */
18
19#include <linux/module.h> 16#include <linux/module.h>
20#include <linux/fs.h> 17#include <linux/fs.h>
21#include <linux/mount.h> 18#include <linux/mount.h>
@@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
310} 307}
311EXPORT_SYMBOL_GPL(debugfs_create_symlink); 308EXPORT_SYMBOL_GPL(debugfs_create_symlink);
312 309
313static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) 310static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
314{ 311{
315 int ret = 0; 312 int ret = 0;
316 313
@@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
333 dput(dentry); 330 dput(dentry);
334 } 331 }
335 } 332 }
333 return ret;
336} 334}
337 335
338/** 336/**
@@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
351void debugfs_remove(struct dentry *dentry) 349void debugfs_remove(struct dentry *dentry)
352{ 350{
353 struct dentry *parent; 351 struct dentry *parent;
354 352 int ret;
353
355 if (!dentry) 354 if (!dentry)
356 return; 355 return;
357 356
@@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
360 return; 359 return;
361 360
362 mutex_lock(&parent->d_inode->i_mutex); 361 mutex_lock(&parent->d_inode->i_mutex);
363 __debugfs_remove(dentry, parent); 362 ret = __debugfs_remove(dentry, parent);
364 mutex_unlock(&parent->d_inode->i_mutex); 363 mutex_unlock(&parent->d_inode->i_mutex);
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 364 if (!ret)
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
366} 366}
367EXPORT_SYMBOL_GPL(debugfs_remove); 367EXPORT_SYMBOL_GPL(debugfs_remove);
368 368
@@ -540,17 +540,5 @@ static int __init debugfs_init(void)
540 540
541 return retval; 541 return retval;
542} 542}
543
544static void __exit debugfs_exit(void)
545{
546 debugfs_registered = false;
547
548 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
549 unregister_filesystem(&debug_fs_type);
550 kobject_put(debug_kobj);
551}
552
553core_initcall(debugfs_init); 543core_initcall(debugfs_init);
554module_exit(debugfs_exit);
555MODULE_LICENSE("GPL");
556 544
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1bb547c9cad6..2f27e578d466 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -479,6 +479,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
479 struct dentry *root = sb->s_root; 479 struct dentry *root = sb->s_root;
480 struct pts_fs_info *fsi = DEVPTS_SB(sb); 480 struct pts_fs_info *fsi = DEVPTS_SB(sb);
481 struct pts_mount_opts *opts = &fsi->mount_opts; 481 struct pts_mount_opts *opts = &fsi->mount_opts;
482 int ret = 0;
482 char s[12]; 483 char s[12];
483 484
484 /* We're supposed to be given the slave end of a pty */ 485 /* We're supposed to be given the slave end of a pty */
@@ -501,14 +502,17 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
501 mutex_lock(&root->d_inode->i_mutex); 502 mutex_lock(&root->d_inode->i_mutex);
502 503
503 dentry = d_alloc_name(root, s); 504 dentry = d_alloc_name(root, s);
504 if (!IS_ERR(dentry)) { 505 if (dentry) {
505 d_add(dentry, inode); 506 d_add(dentry, inode);
506 fsnotify_create(root->d_inode, dentry); 507 fsnotify_create(root->d_inode, dentry);
508 } else {
509 iput(inode);
510 ret = -ENOMEM;
507 } 511 }
508 512
509 mutex_unlock(&root->d_inode->i_mutex); 513 mutex_unlock(&root->d_inode->i_mutex);
510 514
511 return 0; 515 return ret;
512} 516}
513 517
514struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -544,17 +548,12 @@ void devpts_pty_kill(struct tty_struct *tty)
544 mutex_lock(&root->d_inode->i_mutex); 548 mutex_lock(&root->d_inode->i_mutex);
545 549
546 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
547 if (IS_ERR(dentry))
548 goto out;
549
550 if (dentry) {
551 inode->i_nlink--;
552 d_delete(dentry);
553 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
554 }
555 551
552 inode->i_nlink--;
553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
556 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
557out: 556
558 mutex_unlock(&root->d_inode->i_mutex); 557 mutex_unlock(&root->d_inode->i_mutex);
559} 558}
560 559
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..ac5f164170e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio)
645 /* 645 /*
646 * See whether this new request is contiguous with the old. 646 * See whether this new request is contiguous with the old.
647 * 647 *
648 * Btrfs cannot handl having logically non-contiguous requests 648 * Btrfs cannot handle having logically non-contiguous requests
649 * submitted. For exmple if you have 649 * submitted. For example if you have
650 * 650 *
651 * Logical: [0-4095][HOLE][8192-12287] 651 * Logical: [0-4095][HOLE][8192-12287]
652 * Phyiscal: [0-4095] [4096-8181] 652 * Physical: [0-4095] [4096-8191]
653 * 653 *
654 * We cannot submit those pages together as one BIO. So if our 654 * We cannot submit those pages together as one BIO. So if our
655 * current logical offset in the file does not equal what would 655 * current logical offset in the file does not equal what would
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1110 ((rw & READ) || (dio->result == dio->size))) 1110 ((rw & READ) || (dio->result == dio->size)))
1111 ret = -EIOCBQUEUED; 1111 ret = -EIOCBQUEUED;
1112 1112
1113 if (ret != -EIOCBQUEUED) { 1113 if (ret != -EIOCBQUEUED)
1114 /* All IO is now issued, send it on its way */
1115 blk_run_address_space(inode->i_mapping);
1116 dio_await_completion(dio); 1114 dio_await_completion(dio);
1117 }
1118 1115
1119 /* 1116 /*
1120 * Sync will always be dropping the final ref and completing the 1117 * Sync will always be dropping the final ref and completing the
@@ -1176,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1176 struct dio *dio; 1173 struct dio *dio;
1177 1174
1178 if (rw & WRITE) 1175 if (rw & WRITE)
1179 rw = WRITE_ODIRECT_PLUG; 1176 rw = WRITE_ODIRECT;
1180 1177
1181 if (bdev) 1178 if (bdev)
1182 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1179 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 4314f0d48d85..abc49f292454 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,6 +18,7 @@
18 18
19#define WAKE_ASTS 0 19#define WAKE_ASTS 0
20 20
21static uint64_t ast_seq_count;
21static struct list_head ast_queue; 22static struct list_head ast_queue;
22static spinlock_t ast_queue_lock; 23static spinlock_t ast_queue_lock;
23static struct task_struct * astd_task; 24static struct task_struct * astd_task;
@@ -25,40 +26,186 @@ static unsigned long astd_wakeflags;
25static struct mutex astd_running; 26static struct mutex astd_running;
26 27
27 28
29static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
30{
31 int i;
32
33 log_print("last_bast %x %llu flags %x mode %d sb %d %x",
34 lkb->lkb_id,
35 (unsigned long long)lkb->lkb_last_bast.seq,
36 lkb->lkb_last_bast.flags,
37 lkb->lkb_last_bast.mode,
38 lkb->lkb_last_bast.sb_status,
39 lkb->lkb_last_bast.sb_flags);
40
41 log_print("last_cast %x %llu flags %x mode %d sb %d %x",
42 lkb->lkb_id,
43 (unsigned long long)lkb->lkb_last_cast.seq,
44 lkb->lkb_last_cast.flags,
45 lkb->lkb_last_cast.mode,
46 lkb->lkb_last_cast.sb_status,
47 lkb->lkb_last_cast.sb_flags);
48
49 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
50 log_print("cb %x %llu flags %x mode %d sb %d %x",
51 lkb->lkb_id,
52 (unsigned long long)lkb->lkb_callbacks[i].seq,
53 lkb->lkb_callbacks[i].flags,
54 lkb->lkb_callbacks[i].mode,
55 lkb->lkb_callbacks[i].sb_status,
56 lkb->lkb_callbacks[i].sb_flags);
57 }
58}
59
28void dlm_del_ast(struct dlm_lkb *lkb) 60void dlm_del_ast(struct dlm_lkb *lkb)
29{ 61{
30 spin_lock(&ast_queue_lock); 62 spin_lock(&ast_queue_lock);
31 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) 63 if (!list_empty(&lkb->lkb_astqueue))
32 list_del(&lkb->lkb_astqueue); 64 list_del_init(&lkb->lkb_astqueue);
33 spin_unlock(&ast_queue_lock); 65 spin_unlock(&ast_queue_lock);
34} 66}
35 67
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) 68int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
69 int status, uint32_t sbflags, uint64_t seq)
37{ 70{
71 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
72 uint64_t prev_seq;
73 int prev_mode;
74 int i;
75
76 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
77 if (lkb->lkb_callbacks[i].seq)
78 continue;
79
80 /*
81 * Suppress some redundant basts here, do more on removal.
82 * Don't even add a bast if the callback just before it
83 * is a bast for the same mode or a more restrictive mode.
84 * (the addional > PR check is needed for PR/CW inversion)
85 */
86
87 if ((i > 0) && (flags & DLM_CB_BAST) &&
88 (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
89
90 prev_seq = lkb->lkb_callbacks[i-1].seq;
91 prev_mode = lkb->lkb_callbacks[i-1].mode;
92
93 if ((prev_mode == mode) ||
94 (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
95
96 log_debug(ls, "skip %x add bast %llu mode %d "
97 "for bast %llu mode %d",
98 lkb->lkb_id,
99 (unsigned long long)seq,
100 mode,
101 (unsigned long long)prev_seq,
102 prev_mode);
103 return 0;
104 }
105 }
106
107 lkb->lkb_callbacks[i].seq = seq;
108 lkb->lkb_callbacks[i].flags = flags;
109 lkb->lkb_callbacks[i].mode = mode;
110 lkb->lkb_callbacks[i].sb_status = status;
111 lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
112 break;
113 }
114
115 if (i == DLM_CALLBACKS_SIZE) {
116 log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
117 lkb->lkb_id, (unsigned long long)seq,
118 flags, mode, status, sbflags);
119 dlm_dump_lkb_callbacks(lkb);
120 return -1;
121 }
122
123 return 0;
124}
125
126int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
127 struct dlm_callback *cb, int *resid)
128{
129 int i;
130
131 *resid = 0;
132
133 if (!lkb->lkb_callbacks[0].seq)
134 return -ENOENT;
135
136 /* oldest undelivered cb is callbacks[0] */
137
138 memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
139 memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
140
141 /* shift others down */
142
143 for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
144 if (!lkb->lkb_callbacks[i].seq)
145 break;
146 memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
147 sizeof(struct dlm_callback));
148 memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
149 (*resid)++;
150 }
151
152 /* if cb is a bast, it should be skipped if the blocking mode is
153 compatible with the last granted mode */
154
155 if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
156 if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
157 cb->flags |= DLM_CB_SKIP;
158
159 log_debug(ls, "skip %x bast %llu mode %d "
160 "for cast %llu mode %d",
161 lkb->lkb_id,
162 (unsigned long long)cb->seq,
163 cb->mode,
164 (unsigned long long)lkb->lkb_last_cast.seq,
165 lkb->lkb_last_cast.mode);
166 return 0;
167 }
168 }
169
170 if (cb->flags & DLM_CB_CAST) {
171 memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
172 lkb->lkb_last_cast_time = ktime_get();
173 }
174
175 if (cb->flags & DLM_CB_BAST) {
176 memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
177 lkb->lkb_last_bast_time = ktime_get();
178 }
179
180 return 0;
181}
182
183void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
184 uint32_t sbflags)
185{
186 uint64_t seq;
187 int rv;
188
189 spin_lock(&ast_queue_lock);
190
191 seq = ++ast_seq_count;
192
38 if (lkb->lkb_flags & DLM_IFL_USER) { 193 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, mode); 194 spin_unlock(&ast_queue_lock);
195 dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq);
40 return; 196 return;
41 } 197 }
42 198
43 spin_lock(&ast_queue_lock); 199 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 200 if (rv < 0) {
201 spin_unlock(&ast_queue_lock);
202 return;
203 }
204
205 if (list_empty(&lkb->lkb_astqueue)) {
45 kref_get(&lkb->lkb_ref); 206 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 207 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
48 } 208 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
57 lkb->lkb_ast_type |= type;
58 if (type == AST_BAST)
59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
62 spin_unlock(&ast_queue_lock); 209 spin_unlock(&ast_queue_lock);
63 210
64 set_bit(WAKE_ASTS, &astd_wakeflags); 211 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -72,7 +219,8 @@ static void process_asts(void)
72 struct dlm_lkb *lkb; 219 struct dlm_lkb *lkb;
73 void (*castfn) (void *astparam); 220 void (*castfn) (void *astparam);
74 void (*bastfn) (void *astparam, int mode); 221 void (*bastfn) (void *astparam, int mode);
75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; 222 struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
223 int i, rv, resid;
76 224
77repeat: 225repeat:
78 spin_lock(&ast_queue_lock); 226 spin_lock(&ast_queue_lock);
@@ -83,54 +231,45 @@ repeat:
83 if (dlm_locking_stopped(ls)) 231 if (dlm_locking_stopped(ls))
84 continue; 232 continue;
85 233
86 list_del(&lkb->lkb_astqueue); 234 /* we remove from astqueue list and remove everything in
87 type = lkb->lkb_ast_type; 235 lkb_callbacks before releasing the spinlock so empty
88 lkb->lkb_ast_type = 0; 236 lkb_astqueue is always consistent with empty lkb_callbacks */
89 first = lkb->lkb_ast_first; 237
90 lkb->lkb_ast_first = 0; 238 list_del_init(&lkb->lkb_astqueue);
91 bastmode = lkb->lkb_bastmode; 239
92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn; 240 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn; 241 bastfn = lkb->lkb_bastfn;
95 spin_unlock(&ast_queue_lock);
96 242
97 do_cast = (type & AST_COMP) && castfn; 243 memset(&callbacks, 0, sizeof(callbacks));
98 do_bast = (type & AST_BAST) && bastfn;
99 244
100 /* Skip a bast if its blocking mode is compatible with the 245 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
101 granted mode of the preceding cast. */ 246 rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
247 if (rv < 0)
248 break;
249 }
250 spin_unlock(&ast_queue_lock);
102 251
103 if (do_bast) { 252 if (resid) {
104 if (first == AST_COMP) 253 /* shouldn't happen, for loop should have removed all */
105 last_castmode = castmode; 254 log_error(ls, "callback resid %d lkb %x",
106 else 255 resid, lkb->lkb_id);
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 } 256 }
111 257
112 if (first == AST_COMP) { 258 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
113 if (do_cast) 259 if (!callbacks[i].seq)
114 castfn(lkb->lkb_astparam); 260 break;
115 if (do_bast) 261 if (callbacks[i].flags & DLM_CB_SKIP) {
116 bastfn(lkb->lkb_astparam, bastmode); 262 continue;
117 } else if (first == AST_BAST) { 263 } else if (callbacks[i].flags & DLM_CB_BAST) {
118 if (do_bast) 264 bastfn(lkb->lkb_astparam, callbacks[i].mode);
119 bastfn(lkb->lkb_astparam, bastmode); 265 } else if (callbacks[i].flags & DLM_CB_CAST) {
120 if (do_cast) 266 lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
267 lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
121 castfn(lkb->lkb_astparam); 268 castfn(lkb->lkb_astparam);
122 } else { 269 }
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 } 270 }
126 271
127 if (do_cast) 272 /* removes ref for ast_queue, may cause lkb to be freed */
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
131
132 /* this removes the reference added by dlm_add_ast
133 and may result in the lkb being freed */
134 dlm_put_lkb(lkb); 273 dlm_put_lkb(lkb);
135 274
136 cond_resched(); 275 cond_resched();
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index bcb1aaba519d..8aa89c9b5611 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -13,8 +13,13 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 16void dlm_del_ast(struct dlm_lkb *lkb);
17int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
18 int status, uint32_t sbflags, uint64_t seq);
19int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
20 struct dlm_callback *cb, int *resid);
21void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
22 uint32_t sbflags);
18 23
19void dlm_astd_wake(void); 24void dlm_astd_wake(void);
20int dlm_astd_start(void); 25int dlm_astd_start(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index b54bca03d92f..0d329ff8ed4c 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
977/* Config file defaults */ 977/* Config file defaults */
978#define DEFAULT_TCP_PORT 21064 978#define DEFAULT_TCP_PORT 21064
979#define DEFAULT_BUFFER_SIZE 4096 979#define DEFAULT_BUFFER_SIZE 4096
980#define DEFAULT_RSBTBL_SIZE 256 980#define DEFAULT_RSBTBL_SIZE 1024
981#define DEFAULT_LKBTBL_SIZE 1024 981#define DEFAULT_LKBTBL_SIZE 1024
982#define DEFAULT_DIRTBL_SIZE 512 982#define DEFAULT_DIRTBL_SIZE 1024
983#define DEFAULT_RECOVER_TIMER 5 983#define DEFAULT_RECOVER_TIMER 5
984#define DEFAULT_TOSS_SECS 10 984#define DEFAULT_TOSS_SECS 10
985#define DEFAULT_SCAN_SECS 5 985#define DEFAULT_SCAN_SECS 5
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6b42ba807dfd..59779237e2b4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
257 lkb->lkb_status, 257 lkb->lkb_status,
258 lkb->lkb_grmode, 258 lkb->lkb_grmode,
259 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
260 lkb->lkb_bastmode, 260 lkb->lkb_last_bast.mode,
261 rsb_lookup, 261 rsb_lookup,
262 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
263 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), 264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
265 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); 265 (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
266 return rv; 266 return rv;
267} 267}
268 268
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index f632b58cd222..b94204913011 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -192,11 +192,6 @@ struct dlm_args {
192 * lkb is a process copy, the nodeid specifies the lock master. 192 * lkb is a process copy, the nodeid specifies the lock master.
193 */ 193 */
194 194
195/* lkb_ast_type */
196
197#define AST_COMP 1
198#define AST_BAST 2
199
200/* lkb_status */ 195/* lkb_status */
201 196
202#define DLM_LKSTS_WAITING 1 197#define DLM_LKSTS_WAITING 1
@@ -217,6 +212,20 @@ struct dlm_args {
217#define DLM_IFL_USER 0x00000001 212#define DLM_IFL_USER 0x00000001
218#define DLM_IFL_ORPHAN 0x00000002 213#define DLM_IFL_ORPHAN 0x00000002
219 214
215#define DLM_CALLBACKS_SIZE 6
216
217#define DLM_CB_CAST 0x00000001
218#define DLM_CB_BAST 0x00000002
219#define DLM_CB_SKIP 0x00000004
220
221struct dlm_callback {
222 uint64_t seq;
223 uint32_t flags; /* DLM_CBF_ */
224 int sb_status; /* copy to lksb status */
225 uint8_t sb_flags; /* copy to lksb flags */
226 int8_t mode; /* rq mode of bast, gr mode of cast */
227};
228
220struct dlm_lkb { 229struct dlm_lkb {
221 struct dlm_rsb *lkb_resource; /* the rsb */ 230 struct dlm_rsb *lkb_resource; /* the rsb */
222 struct kref lkb_ref; 231 struct kref lkb_ref;
@@ -236,13 +245,6 @@ struct dlm_lkb {
236 245
237 int8_t lkb_wait_type; /* type of reply waiting for */ 246 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 247 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
246 248
247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
248 struct list_head lkb_statequeue; /* rsb g/c/w list */ 250 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -251,10 +253,15 @@ struct dlm_lkb {
251 struct list_head lkb_astqueue; /* need ast to be sent */ 253 struct list_head lkb_astqueue; /* need ast to be sent */
252 struct list_head lkb_ownqueue; /* list of locks for a process */ 254 struct list_head lkb_ownqueue; /* list of locks for a process */
253 struct list_head lkb_time_list; 255 struct list_head lkb_time_list;
254 ktime_t lkb_time_bast; /* for debugging */
255 ktime_t lkb_timestamp; 256 ktime_t lkb_timestamp;
256 unsigned long lkb_timeout_cs; 257 unsigned long lkb_timeout_cs;
257 258
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
260 struct dlm_callback lkb_last_cast;
261 struct dlm_callback lkb_last_bast;
262 ktime_t lkb_last_cast_time; /* for debugging */
263 ktime_t lkb_last_bast_time; /* for debugging */
264
258 char *lkb_lvbptr; 265 char *lkb_lvbptr;
259 struct dlm_lksb *lkb_lksb; /* caller's status block */ 266 struct dlm_lksb *lkb_lksb; /* caller's status block */
260 void (*lkb_astfn) (void *astparam); 267 void (*lkb_astfn) (void *astparam);
@@ -544,8 +551,6 @@ struct dlm_user_args {
544 (dlm_user_proc) on the struct file, 551 (dlm_user_proc) on the struct file,
545 the process's locks point back to it*/ 552 the process's locks point back to it*/
546 struct dlm_lksb lksb; 553 struct dlm_lksb lksb;
547 int old_mode;
548 int update_user_lvb;
549 struct dlm_lksb __user *user_lksb; 554 struct dlm_lksb __user *user_lksb;
550 void __user *castparam; 555 void __user *castparam;
551 void __user *castaddr; 556 void __user *castaddr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 64e5f3efdd81..04b8c449303f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = {
160void dlm_print_lkb(struct dlm_lkb *lkb) 160void dlm_print_lkb(struct dlm_lkb *lkb)
161{ 161{
162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" 162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
163 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", 163 " status %d rqmode %d grmode %d wait_type %d\n",
164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
166 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 166 lkb->lkb_grmode, lkb->lkb_wait_type);
167} 167}
168 168
169static void dlm_print_rsb(struct dlm_rsb *r) 169static void dlm_print_rsb(struct dlm_rsb *r)
@@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
305 rv = -EDEADLK; 305 rv = -EDEADLK;
306 } 306 }
307 307
308 lkb->lkb_lksb->sb_status = rv; 308 dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
310
311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
312} 309}
313 310
314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 311static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
319 316
320static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 317static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
321{ 318{
322 lkb->lkb_time_bast = ktime_get();
323
324 if (is_master_copy(lkb)) { 319 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
326 send_bast(r, lkb, rqmode); 320 send_bast(r, lkb, rqmode);
327 } else { 321 } else {
328 dlm_add_ast(lkb, AST_BAST, rqmode); 322 dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0);
329 } 323 }
330} 324}
331 325
@@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
600 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 594 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
601 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); 595 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
602 INIT_LIST_HEAD(&lkb->lkb_time_list); 596 INIT_LIST_HEAD(&lkb->lkb_time_list);
597 INIT_LIST_HEAD(&lkb->lkb_astqueue);
603 598
604 get_random_bytes(&bucket, sizeof(bucket)); 599 get_random_bytes(&bucket, sizeof(bucket));
605 bucket &= (ls->ls_lkbtbl_size - 1); 600 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2819 not from lkb fields */ 2814 not from lkb fields */
2820 2815
2821 if (lkb->lkb_bastfn) 2816 if (lkb->lkb_bastfn)
2822 ms->m_asts |= AST_BAST; 2817 ms->m_asts |= DLM_CB_BAST;
2823 if (lkb->lkb_astfn) 2818 if (lkb->lkb_astfn)
2824 ms->m_asts |= AST_COMP; 2819 ms->m_asts |= DLM_CB_CAST;
2825 2820
2826 /* compare with switch in create_message; send_remove() doesn't 2821 /* compare with switch in create_message; send_remove() doesn't
2827 use send_args() */ 2822 use send_args() */
@@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3122 lkb->lkb_grmode = DLM_LOCK_IV; 3117 lkb->lkb_grmode = DLM_LOCK_IV;
3123 lkb->lkb_rqmode = ms->m_rqmode; 3118 lkb->lkb_rqmode = ms->m_rqmode;
3124 3119
3125 lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL; 3120 lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
3126 lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL; 3121 lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
3127 3122
3128 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3123 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3129 /* lkb was just created so there won't be an lvb yet */ 3124 /* lkb was just created so there won't be an lvb yet */
@@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4412 lkb->lkb_grmode = rl->rl_grmode; 4407 lkb->lkb_grmode = rl->rl_grmode;
4413 /* don't set lkb_status because add_lkb wants to itself */ 4408 /* don't set lkb_status because add_lkb wants to itself */
4414 4409
4415 lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL; 4410 lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
4416 lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL; 4411 lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
4417 4412
4418 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4413 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4419 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4414 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4589 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, 4584 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4590 fake_astfn, ua, fake_bastfn, &args); 4585 fake_astfn, ua, fake_bastfn, &args);
4591 lkb->lkb_flags |= DLM_IFL_USER; 4586 lkb->lkb_flags |= DLM_IFL_USER;
4592 ua->old_mode = DLM_LOCK_IV;
4593 4587
4594 if (error) { 4588 if (error) {
4595 __put_lkb(ls, lkb); 4589 __put_lkb(ls, lkb);
@@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4658 ua->bastparam = ua_tmp->bastparam; 4652 ua->bastparam = ua_tmp->bastparam;
4659 ua->bastaddr = ua_tmp->bastaddr; 4653 ua->bastaddr = ua_tmp->bastaddr;
4660 ua->user_lksb = ua_tmp->user_lksb; 4654 ua->user_lksb = ua_tmp->user_lksb;
4661 ua->old_mode = lkb->lkb_grmode;
4662 4655
4663 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, 4656 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4664 fake_astfn, ua, fake_bastfn, &args); 4657 fake_astfn, ua, fake_bastfn, &args);
@@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4917 } 4910 }
4918 4911
4919 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4912 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4920 lkb->lkb_ast_type = 0; 4913 memset(&lkb->lkb_callbacks, 0,
4921 list_del(&lkb->lkb_astqueue); 4914 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4915 list_del_init(&lkb->lkb_astqueue);
4922 dlm_put_lkb(lkb); 4916 dlm_put_lkb(lkb);
4923 } 4917 }
4924 4918
@@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4958 4952
4959 spin_lock(&proc->asts_spin); 4953 spin_lock(&proc->asts_spin);
4960 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4954 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4961 list_del(&lkb->lkb_astqueue); 4955 memset(&lkb->lkb_callbacks, 0,
4956 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4957 list_del_init(&lkb->lkb_astqueue);
4962 dlm_put_lkb(lkb); 4958 dlm_put_lkb(lkb);
4963 } 4959 }
4964 spin_unlock(&proc->asts_spin); 4960 spin_unlock(&proc->asts_spin);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..bffa1e73b9a9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,15 @@ static void work_stop(void)
1468 1468
1469static int work_start(void) 1469static int work_start(void)
1470{ 1470{
1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | 1471 recv_workqueue = alloc_workqueue("dlm_recv",
1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1472 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1473 if (!recv_workqueue) { 1473 if (!recv_workqueue) {
1474 log_print("can't start dlm_recv"); 1474 log_print("can't start dlm_recv");
1475 return -ENOMEM; 1475 return -ENOMEM;
1476 } 1476 }
1477 1477
1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | 1478 send_workqueue = alloc_workqueue("dlm_send",
1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1479 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1480 if (!send_workqueue) { 1480 if (!send_workqueue) {
1481 log_print("can't start dlm_send"); 1481 log_print("can't start dlm_send");
1482 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 3c83a49a48a3..f10a50f24e8f 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); 321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
322 322
323 if (lkb->lkb_bastfn) 323 if (lkb->lkb_bastfn)
324 rl->rl_asts |= AST_BAST; 324 rl->rl_asts |= DLM_CB_BAST;
325 if (lkb->lkb_astfn) 325 if (lkb->lkb_astfn)
326 rl->rl_asts |= AST_COMP; 326 rl->rl_asts |= DLM_CB_CAST;
327 327
328 rl->rl_namelen = cpu_to_le16(r->res_length); 328 rl->rl_namelen = cpu_to_le16(r->res_length);
329 memcpy(rl->rl_name, r->res_name, r->res_length); 329 memcpy(rl->rl_name, r->res_name, r->res_length);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 66d6c16bf440..d5ab3fe7c198 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,6 +24,7 @@
24#include "lock.h" 24#include "lock.h"
25#include "lvb_table.h" 25#include "lvb_table.h"
26#include "user.h" 26#include "user.h"
27#include "ast.h"
27 28
28static const char name_prefix[] = "dlm"; 29static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 30static const struct file_operations device_fops;
@@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res,
152 not related to the lifetime of the lkb struct which is managed 153 not related to the lifetime of the lkb struct which is managed
153 entirely by refcount. */ 154 entirely by refcount. */
154 155
155static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) 156static int lkb_is_endoflife(int mode, int status)
156{ 157{
157 switch (sb_status) { 158 switch (status) {
158 case -DLM_EUNLOCK: 159 case -DLM_EUNLOCK:
159 return 1; 160 return 1;
160 case -DLM_ECANCEL: 161 case -DLM_ECANCEL:
161 case -ETIMEDOUT: 162 case -ETIMEDOUT:
162 case -EDEADLK: 163 case -EDEADLK:
163 if (lkb->lkb_grmode == DLM_LOCK_IV)
164 return 1;
165 break;
166 case -EAGAIN: 164 case -EAGAIN:
167 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) 165 if (mode == DLM_LOCK_IV)
168 return 1; 166 return 1;
169 break; 167 break;
170 } 168 }
@@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
174/* we could possibly check if the cancel of an orphan has resulted in the lkb 172/* we could possibly check if the cancel of an orphan has resulted in the lkb
175 being removed and then remove that lkb from the orphans list and free it */ 173 being removed and then remove that lkb from the orphans list and free it */
176 174
177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) 175void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
176 int status, uint32_t sbflags, uint64_t seq)
178{ 177{
179 struct dlm_ls *ls; 178 struct dlm_ls *ls;
180 struct dlm_user_args *ua; 179 struct dlm_user_args *ua;
181 struct dlm_user_proc *proc; 180 struct dlm_user_proc *proc;
182 int eol = 0, ast_type; 181 int rv;
183 182
184 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) 183 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
185 return; 184 return;
@@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
200 ua = lkb->lkb_ua; 199 ua = lkb->lkb_ua;
201 proc = ua->proc; 200 proc = ua->proc;
202 201
203 if (type == AST_BAST && ua->bastaddr == NULL) 202 if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
204 goto out; 203 goto out;
205 204
205 if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
206 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
207
206 spin_lock(&proc->asts_spin); 208 spin_lock(&proc->asts_spin);
207 209
208 ast_type = lkb->lkb_ast_type; 210 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
209 lkb->lkb_ast_type |= type; 211 if (rv < 0) {
210 if (type == AST_BAST) 212 spin_unlock(&proc->asts_spin);
211 lkb->lkb_bastmode = mode; 213 goto out;
212 else 214 }
213 lkb->lkb_castmode = mode;
214 215
215 if (!ast_type) { 216 if (list_empty(&lkb->lkb_astqueue)) {
216 kref_get(&lkb->lkb_ref); 217 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 218 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
219 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
220 } 220 }
221 if (type == AST_COMP && (ast_type & AST_COMP))
222 log_debug(ls, "ast overlap %x status %x %x",
223 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
224
225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
226 if (eol) {
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 }
229
230 /* We want to copy the lvb to userspace when the completion
231 ast is read if the status is 0, the lock has an lvb and
232 lvb_ops says we should. We could probably have set_lvb_lock()
233 set update_user_lvb instead and not need old_mode */
234
235 if ((lkb->lkb_ast_type & AST_COMP) &&
236 (lkb->lkb_lksb->sb_status == 0) &&
237 lkb->lkb_lksb->sb_lvbptr &&
238 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
239 ua->update_user_lvb = 1;
240 else
241 ua->update_user_lvb = 0;
242
243 spin_unlock(&proc->asts_spin); 221 spin_unlock(&proc->asts_spin);
244 222
245 if (eol) { 223 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
224 /* N.B. spin_lock locks_spin, not asts_spin */
246 spin_lock(&proc->locks_spin); 225 spin_lock(&proc->locks_spin);
247 if (!list_empty(&lkb->lkb_ownqueue)) { 226 if (!list_empty(&lkb->lkb_ownqueue)) {
248 list_del_init(&lkb->lkb_ownqueue); 227 list_del_init(&lkb->lkb_ownqueue);
@@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file)
705 return 0; 684 return 0;
706} 685}
707 686
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 687static int copy_result_to_user(struct dlm_user_args *ua, int compat,
709 int mode, char __user *buf, size_t count) 688 uint32_t flags, int mode, int copy_lvb,
689 char __user *buf, size_t count)
710{ 690{
711#ifdef CONFIG_COMPAT 691#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 692 struct dlm_lock_result32 result32;
@@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
730 notes that a new blocking AST address and parameter are set even if 710 notes that a new blocking AST address and parameter are set even if
731 the conversion fails, so maybe we should just do that. */ 711 the conversion fails, so maybe we should just do that. */
732 712
733 if (type == AST_BAST) { 713 if (flags & DLM_CB_BAST) {
734 result.user_astaddr = ua->bastaddr; 714 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 715 result.user_astparam = ua->bastparam;
736 result.bast_mode = mode; 716 result.bast_mode = mode;
@@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
750 /* copy lvb to userspace if there is one, it's been updated, and 730 /* copy lvb to userspace if there is one, it's been updated, and
751 the user buffer has space for it */ 731 the user buffer has space for it */
752 732
753 if (ua->update_user_lvb && ua->lksb.sb_lvbptr && 733 if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
754 count >= len + DLM_USER_LVB_LEN) {
755 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, 734 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
756 DLM_USER_LVB_LEN)) { 735 DLM_USER_LVB_LEN)) {
757 error = -EFAULT; 736 error = -EFAULT;
@@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 780 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 781 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 782 DECLARE_WAITQUEUE(wait, current);
804 int error = 0, removed; 783 struct dlm_callback cb;
805 int ret_type, ret_mode; 784 int rv, resid, copy_lvb = 0;
806 int bastmode, castmode, do_bast, do_cast;
807 785
808 if (count == sizeof(struct dlm_device_version)) { 786 if (count == sizeof(struct dlm_device_version)) {
809 error = copy_version_to_user(buf, count); 787 rv = copy_version_to_user(buf, count);
810 return error; 788 return rv;
811 } 789 }
812 790
813 if (!proc) { 791 if (!proc) {
@@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 } 832 }
855 } 833 }
856 834
857 /* there may be both completion and blocking asts to return for 835 /* if we empty lkb_callbacks, we don't want to unlock the spinlock
858 the lkb, don't remove lkb from asts list unless no asts remain */ 836 without removing lkb_astqueue; so empty lkb_astqueue is always
837 consistent with empty lkb_callbacks */
859 838
860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 839 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
861 840
862 removed = 0; 841 rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
863 ret_type = 0; 842 if (rv < 0) {
864 ret_mode = 0; 843 /* this shouldn't happen; lkb should have been removed from
865 do_bast = lkb->lkb_ast_type & AST_BAST; 844 list when resid was zero */
866 do_cast = lkb->lkb_ast_type & AST_COMP; 845 log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
867 bastmode = lkb->lkb_bastmode; 846 list_del_init(&lkb->lkb_astqueue);
868 castmode = lkb->lkb_castmode; 847 spin_unlock(&proc->asts_spin);
869 848 /* removes ref for proc->asts, may cause lkb to be freed */
870 /* when both are queued figure out which to do first and 849 dlm_put_lkb(lkb);
871 switch first so the other goes in the next read */ 850 goto try_another;
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 } 851 }
852 if (!resid)
853 list_del_init(&lkb->lkb_astqueue);
854 spin_unlock(&proc->asts_spin);
891 855
892 /* if we're doing a bast but the bast is unnecessary, then 856 if (cb.flags & DLM_CB_SKIP) {
893 switch to do nothing or do a cast if that was needed next */ 857 /* removes ref for proc->asts, may cause lkb to be freed */
894 858 if (!resid)
895 if ((ret_type == AST_BAST) && 859 dlm_put_lkb(lkb);
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { 860 goto try_another;
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 } 861 }
907 862
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) { 863 if (cb.flags & DLM_CB_CAST) {
909 log_print("device_read %x ast_first %x ast_type %x", 864 int old_mode, new_mode;
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
911 }
912 865
913 if (!lkb->lkb_ast_type) { 866 old_mode = lkb->lkb_last_cast.mode;
914 list_del(&lkb->lkb_astqueue); 867 new_mode = cb.mode;
915 removed = 1;
916 }
917 spin_unlock(&proc->asts_spin);
918 868
919 if (ret_type) { 869 if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
920 error = copy_result_to_user(lkb->lkb_ua, 870 dlm_lvb_operations[old_mode + 1][new_mode + 1])
921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 871 copy_lvb = 1;
922 ret_type, ret_mode, buf, count);
923 872
924 if (ret_type == AST_COMP) 873 lkb->lkb_lksb->sb_status = cb.sb_status;
925 lkb->lkb_castmode_done = castmode; 874 lkb->lkb_lksb->sb_flags = cb.sb_flags;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 } 875 }
929 876
930 /* removes reference for the proc->asts lists added by 877 rv = copy_result_to_user(lkb->lkb_ua,
931 dlm_user_add_ast() and may result in the lkb being freed */ 878 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
879 cb.flags, cb.mode, copy_lvb, buf, count);
932 880
933 if (removed) 881 /* removes ref for proc->asts, may cause lkb to be freed */
882 if (!resid)
934 dlm_put_lkb(lkb); 883 dlm_put_lkb(lkb);
935 884
936 /* the bast that was queued was eliminated (see unnecessary above), 885 return rv;
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
942 return error;
943} 886}
944 887
945static unsigned int device_poll(struct file *file, poll_table *wait) 888static unsigned int device_poll(struct file *file, poll_table *wait)
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index f196091dd7ff..00499ab8835f 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,8 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
13 int status, uint32_t sbflags, uint64_t seq);
13int dlm_user_init(void); 14int dlm_user_init(void);
14void dlm_user_exit(void); 15void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 16int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c213ab2f..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
8#include <linux/writeback.h> 8#include <linux/writeback.h>
9#include <linux/sysctl.h> 9#include <linux/sysctl.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include "internal.h"
11 12
12/* A global variable is a bit ugly, but it keeps the code simple */ 13/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 14int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 17{
17 struct inode *inode, *toput_inode = NULL; 18 struct inode *inode, *toput_inode = NULL;
18 19
19 spin_lock(&inode_lock); 20 spin_lock(&inode_sb_list_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 22 spin_lock(&inode->i_lock);
22 continue; 23 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
23 if (inode->i_mapping->nrpages == 0) 24 (inode->i_mapping->nrpages == 0)) {
25 spin_unlock(&inode->i_lock);
24 continue; 26 continue;
27 }
25 __iget(inode); 28 __iget(inode);
26 spin_unlock(&inode_lock); 29 spin_unlock(&inode->i_lock);
30 spin_unlock(&inode_sb_list_lock);
27 invalidate_mapping_pages(inode->i_mapping, 0, -1); 31 invalidate_mapping_pages(inode->i_mapping, 0, -1);
28 iput(toput_inode); 32 iput(toput_inode);
29 toput_inode = inode; 33 toput_inode = inode;
30 spin_lock(&inode_lock); 34 spin_lock(&inode_sb_list_lock);
31 } 35 }
32 spin_unlock(&inode_lock); 36 spin_unlock(&inode_sb_list_lock);
33 iput(toput_inode); 37 iput(toput_inode);
34} 38}
35 39
@@ -45,7 +49,11 @@ static void drop_slab(void)
45int drop_caches_sysctl_handler(ctl_table *table, int write, 49int drop_caches_sysctl_handler(ctl_table *table, int write,
46 void __user *buffer, size_t *length, loff_t *ppos) 50 void __user *buffer, size_t *length, loff_t *ppos)
47{ 51{
48 proc_dointvec_minmax(table, write, buffer, length, ppos); 52 int ret;
53
54 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
55 if (ret)
56 return ret;
49 if (write) { 57 if (write) {
50 if (sysctl_drop_caches & 1) 58 if (sysctl_drop_caches & 1)
51 iterate_supers(drop_pagecache_sb, NULL); 59 iterate_supers(drop_pagecache_sb, NULL);
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry; 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt; 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save = NULL;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save = NULL;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU) 53 if (nd && nd->flags & LOOKUP_RCU)
54 return -ECHILD; 54 return -ECHILD;
55 55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry); 56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
59 goto out; 59 goto out;
60 dentry_save = nd->path.dentry; 60 if (nd) {
61 vfsmount_save = nd->path.mnt; 61 dentry_save = nd->path.dentry;
62 nd->path.dentry = lower_dentry; 62 vfsmount_save = nd->path.mnt;
63 nd->path.mnt = lower_mnt; 63 nd->path.dentry = lower_dentry;
64 nd->path.mnt = lower_mnt;
65 }
64 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); 66 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
65 nd->path.dentry = dentry_save; 67 if (nd) {
66 nd->path.mnt = vfsmount_save; 68 nd->path.dentry = dentry_save;
69 nd->path.mnt = vfsmount_save;
70 }
67 if (dentry->d_inode) { 71 if (dentry->d_inode) {
68 struct inode *lower_inode = 72 struct inode *lower_inode =
69 ecryptfs_inode_to_lower(dentry->d_inode); 73 ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..e00753496e3e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
632 u32 flags); 632 u32 flags);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry, 634 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode, 635 struct inode *ecryptfs_dir_inode);
636 struct nameidata *ecryptfs_nd);
637int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 636int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
638 size_t *decrypted_name_size, 637 size_t *decrypted_name_size,
639 struct dentry *ecryptfs_dentry, 638 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..7d1050e254f9 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
317 317
318const struct file_operations ecryptfs_dir_fops = { 318const struct file_operations ecryptfs_dir_fops = {
319 .readdir = ecryptfs_readdir, 319 .readdir = ecryptfs_readdir,
320 .read = generic_read_dir,
320 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 321 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
321#ifdef CONFIG_COMPAT 322#ifdef CONFIG_COMPAT
322 .compat_ioctl = ecryptfs_compat_ioctl, 323 .compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..b592938a84bc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
74 unsigned int flags_save; 74 unsigned int flags_save;
75 int rc; 75 int rc;
76 76
77 dentry_save = nd->path.dentry; 77 if (nd) {
78 vfsmount_save = nd->path.mnt; 78 dentry_save = nd->path.dentry;
79 flags_save = nd->flags; 79 vfsmount_save = nd->path.mnt;
80 nd->path.dentry = lower_dentry; 80 flags_save = nd->flags;
81 nd->path.mnt = lower_mnt; 81 nd->path.dentry = lower_dentry;
82 nd->flags &= ~LOOKUP_OPEN; 82 nd->path.mnt = lower_mnt;
83 nd->flags &= ~LOOKUP_OPEN;
84 }
83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 85 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
84 nd->path.dentry = dentry_save; 86 if (nd) {
85 nd->path.mnt = vfsmount_save; 87 nd->path.dentry = dentry_save;
86 nd->flags = flags_save; 88 nd->path.mnt = vfsmount_save;
89 nd->flags = flags_save;
90 }
87 return rc; 91 return rc;
88} 92}
89 93
@@ -241,8 +245,7 @@ out:
241 */ 245 */
242int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 246int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
243 struct dentry *lower_dentry, 247 struct dentry *lower_dentry,
244 struct inode *ecryptfs_dir_inode, 248 struct inode *ecryptfs_dir_inode)
245 struct nameidata *ecryptfs_nd)
246{ 249{
247 struct dentry *lower_dir_dentry; 250 struct dentry *lower_dir_dentry;
248 struct vfsmount *lower_mnt; 251 struct vfsmount *lower_mnt;
@@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
290 goto out; 293 goto out;
291 if (special_file(lower_inode->i_mode)) 294 if (special_file(lower_inode->i_mode))
292 goto out; 295 goto out;
293 if (!ecryptfs_nd)
294 goto out;
295 /* Released in this function */ 296 /* Released in this function */
296 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); 297 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
297 if (!page_virt) { 298 if (!page_virt) {
@@ -349,75 +350,6 @@ out:
349} 350}
350 351
351/** 352/**
352 * ecryptfs_new_lower_dentry
353 * @name: The name of the new dentry.
354 * @lower_dir_dentry: Parent directory of the new dentry.
355 * @nd: nameidata from last lookup.
356 *
357 * Create a new dentry or get it from lower parent dir.
358 */
359static struct dentry *
360ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
361 struct nameidata *nd)
362{
363 struct dentry *new_dentry;
364 struct dentry *tmp;
365 struct inode *lower_dir_inode;
366
367 lower_dir_inode = lower_dir_dentry->d_inode;
368
369 tmp = d_alloc(lower_dir_dentry, name);
370 if (!tmp)
371 return ERR_PTR(-ENOMEM);
372
373 mutex_lock(&lower_dir_inode->i_mutex);
374 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
375 mutex_unlock(&lower_dir_inode->i_mutex);
376
377 if (!new_dentry)
378 new_dentry = tmp;
379 else
380 dput(tmp);
381
382 return new_dentry;
383}
384
385
386/**
387 * ecryptfs_lookup_one_lower
388 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
389 * @lower_dir_dentry: lower parent directory
390 * @name: lower file name
391 *
392 * Get the lower dentry from vfs. If lower dentry does not exist yet,
393 * create it.
394 */
395static struct dentry *
396ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
397 struct dentry *lower_dir_dentry, struct qstr *name)
398{
399 struct nameidata nd;
400 struct vfsmount *lower_mnt;
401 int err;
402
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
406 mntput(lower_mnt);
407
408 if (!err) {
409 /* we dont need the mount */
410 mntput(nd.path.mnt);
411 return nd.path.dentry;
412 }
413 if (err != -ENOENT)
414 return ERR_PTR(err);
415
416 /* create a new lower dentry */
417 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
418}
419
420/**
421 * ecryptfs_lookup 353 * ecryptfs_lookup
422 * @ecryptfs_dir_inode: The eCryptfs directory inode 354 * @ecryptfs_dir_inode: The eCryptfs directory inode
423 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 355 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -434,7 +366,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 366 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 367 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 368 struct dentry *lower_dir_dentry, *lower_dentry;
437 struct qstr lower_name;
438 int rc = 0; 369 int rc = 0;
439 370
440 if ((ecryptfs_dentry->d_name.len == 1 371 if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +375,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 375 goto out_d_drop;
445 } 376 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 377 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 lower_name.name = ecryptfs_dentry->d_name.name; 378 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
448 lower_name.len = ecryptfs_dentry->d_name.len; 379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 380 lower_dir_dentry,
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 381 ecryptfs_dentry->d_name.len);
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 382 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
458 if (IS_ERR(lower_dentry)) { 383 if (IS_ERR(lower_dentry)) {
459 rc = PTR_ERR(lower_dentry); 384 rc = PTR_ERR(lower_dentry);
460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 385 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
461 "[%d] on lower_dentry = [%s]\n", __func__, rc, 386 "[%d] on lower_dentry = [%s]\n", __func__, rc,
462 encrypted_and_encoded_name); 387 encrypted_and_encoded_name);
463 goto out_d_drop; 388 goto out_d_drop;
@@ -479,28 +404,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
479 "filename; rc = [%d]\n", __func__, rc); 404 "filename; rc = [%d]\n", __func__, rc);
480 goto out_d_drop; 405 goto out_d_drop;
481 } 406 }
482 lower_name.name = encrypted_and_encoded_name; 407 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
483 lower_name.len = encrypted_and_encoded_name_size; 408 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 409 lower_dir_dentry,
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 410 encrypted_and_encoded_name_size);
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 411 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
493 if (IS_ERR(lower_dentry)) { 412 if (IS_ERR(lower_dentry)) {
494 rc = PTR_ERR(lower_dentry); 413 rc = PTR_ERR(lower_dentry);
495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 414 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
496 "[%d] on lower_dentry = [%s]\n", __func__, rc, 415 "[%d] on lower_dentry = [%s]\n", __func__, rc,
497 encrypted_and_encoded_name); 416 encrypted_and_encoded_name);
498 goto out_d_drop; 417 goto out_d_drop;
499 } 418 }
500lookup_and_interpose: 419lookup_and_interpose:
501 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 420 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
502 ecryptfs_dir_inode, 421 ecryptfs_dir_inode);
503 ecryptfs_nd);
504 goto out; 422 goto out;
505out_d_drop: 423out_d_drop:
506 d_drop(ecryptfs_dentry); 424 d_drop(ecryptfs_dentry);
@@ -1092,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1092 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), 1010 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1093 ecryptfs_dentry_to_lower(dentry), &lower_stat); 1011 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1094 if (!rc) { 1012 if (!rc) {
1013 fsstack_copy_attr_all(dentry->d_inode,
1014 ecryptfs_inode_to_lower(dentry->d_inode));
1095 generic_fillattr(dentry->d_inode, stat); 1015 generic_fillattr(dentry->d_inode, stat);
1096 stat->blocks = lower_stat.blocks; 1016 stat->blocks = lower_stat.blocks;
1097 } 1017 }
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
23} 23}
24static const struct address_space_operations efs_aops = { 24static const struct address_space_operations efs_aops = {
25 .readpage = efs_readpage, 25 .readpage = efs_readpage,
26 .sync_page = block_sync_page,
27 .bmap = _efs_bmap 26 .bmap = _efs_bmap
28}; 27};
29 28
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
99 * @ctx: [in] Pointer to eventfd context. 99 * @ctx: [in] Pointer to eventfd context.
100 * 100 *
101 * The eventfd context reference must have been previously acquired either 101 * The eventfd context reference must have been previously acquired either
102 * with eventfd_ctx_get() or eventfd_ctx_fdget()). 102 * with eventfd_ctx_get() or eventfd_ctx_fdget().
103 */ 103 */
104void eventfd_ctx_put(struct eventfd_ctx *ctx) 104void eventfd_ctx_put(struct eventfd_ctx *ctx)
105{ 105{
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. 146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
147 * @ctx: [in] Pointer to eventfd context. 147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed. 148 * @wait: [in] Wait queue to be removed.
149 * @cnt: [out] Pointer to the 64bit conter value. 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 * 150 *
151 * Returns zero if successful, or the following error codes: 151 * Returns %0 if successful, or the following error codes:
152 * 152 *
153 * -EAGAIN : The operation would have blocked. 153 * -EAGAIN : The operation would have blocked.
154 * 154 *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. 175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
176 * @ctx: [in] Pointer to eventfd context. 176 * @ctx: [in] Pointer to eventfd context.
177 * @no_wait: [in] Different from zero if the operation should not block. 177 * @no_wait: [in] Different from zero if the operation should not block.
178 * @cnt: [out] Pointer to the 64bit conter value. 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 * 179 *
180 * Returns zero if successful, or the following error codes: 180 * Returns %0 if successful, or the following error codes:
181 * 181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero. 182 * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
183 * -ERESTARTSYS : A signal interrupted the wait operation. 183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 * 184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal 185 * If @no_wait is zero, the function might sleep until the eventfd internal
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..ed38801b57a7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
62 * This mutex is acquired by ep_free() during the epoll file 62 * This mutex is acquired by ep_free() during the epoll file
63 * cleanup path and it is also acquired by eventpoll_release_file() 63 * cleanup path and it is also acquired by eventpoll_release_file()
64 * if a file has been pushed inside an epoll set and it is then 64 * if a file has been pushed inside an epoll set and it is then
65 * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). 65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
66 * It is also acquired when inserting an epoll fd onto another epoll
67 * fd. We do this so that we walk the epoll tree and ensure that this
68 * insertion does not create a cycle of epoll file descriptors, which
69 * could lead to deadlock. We need a global mutex to prevent two
70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is
72 * going to.
66 * It is possible to drop the "ep->mtx" and to use the global 73 * It is possible to drop the "ep->mtx" and to use the global
67 * mutex "epmutex" (together with "ep->lock") to have it working, 74 * mutex "epmutex" (together with "ep->lock") to have it working,
68 * but having "ep->mtx" will make the interface more scalable. 75 * but having "ep->mtx" will make the interface more scalable.
@@ -145,11 +152,11 @@ struct epitem {
145 152
146/* 153/*
147 * This structure is stored inside the "private_data" member of the file 154 * This structure is stored inside the "private_data" member of the file
148 * structure and rapresent the main data sructure for the eventpoll 155 * structure and represents the main data structure for the eventpoll
149 * interface. 156 * interface.
150 */ 157 */
151struct eventpoll { 158struct eventpoll {
152 /* Protect the this structure access */ 159 /* Protect the access to this structure */
153 spinlock_t lock; 160 spinlock_t lock;
154 161
155 /* 162 /*
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
224 */ 231 */
225static DEFINE_MUTEX(epmutex); 232static DEFINE_MUTEX(epmutex);
226 233
234/* Used to check for epoll file descriptor inclusion loops */
235static struct nested_calls poll_loop_ncalls;
236
227/* Used for safe wake up implementation */ 237/* Used for safe wake up implementation */
228static struct nested_calls poll_safewake_ncalls; 238static struct nested_calls poll_safewake_ncalls;
229 239
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
306} 316}
307 317
308/** 318/**
319 * ep_events_available - Checks if ready events might be available.
320 *
321 * @ep: Pointer to the eventpoll context.
322 *
323 * Returns: Returns a value different than zero if ready events are available,
324 * or zero otherwise.
325 */
326static inline int ep_events_available(struct eventpoll *ep)
327{
328 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
329}
330
331/**
309 * ep_call_nested - Perform a bound (possibly) nested call, by checking 332 * ep_call_nested - Perform a bound (possibly) nested call, by checking
310 * that the recursion limit is not exceeded, and that 333 * that the recursion limit is not exceeded, and that
311 * the same nested call (by the meaning of same cookie) is 334 * the same nested call (by the meaning of same cookie) is
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
783 806
784/* 807/*
785 * This is the callback that is passed to the wait queue wakeup 808 * This is the callback that is passed to the wait queue wakeup
786 * machanism. It is called by the stored file descriptors when they 809 * mechanism. It is called by the stored file descriptors when they
787 * have events to report. 810 * have events to report.
788 */ 811 */
789static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 812static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
814 goto out_unlock; 837 goto out_unlock;
815 838
816 /* 839 /*
817 * If we are trasfering events to userspace, we can hold no locks 840 * If we are transferring events to userspace, we can hold no locks
818 * (because we're accessing user memory, and because of linux f_op->poll() 841 * (because we're accessing user memory, and because of linux f_op->poll()
819 * semantics). All the events that happens during that period of time are 842 * semantics). All the events that happen during that period of time are
820 * chained in ep->ovflist and requeued later on. 843 * chained in ep->ovflist and requeued later on.
821 */ 844 */
822 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { 845 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1115} 1138}
1116 1139
1140static inline struct timespec ep_set_mstimeout(long ms)
1141{
1142 struct timespec now, ts = {
1143 .tv_sec = ms / MSEC_PER_SEC,
1144 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1145 };
1146
1147 ktime_get_ts(&now);
1148 return timespec_add_safe(now, ts);
1149}
1150
1151/**
1152 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1153 * event buffer.
1154 *
1155 * @ep: Pointer to the eventpoll context.
1156 * @events: Pointer to the userspace buffer where the ready events should be
1157 * stored.
1158 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1159 * @timeout: Maximum timeout for the ready events fetch operation, in
1160 * milliseconds. If the @timeout is zero, the function will not block,
1161 * while if the @timeout is less than zero, the function will block
1162 * until at least one event has been retrieved (or an error
1163 * occurred).
1164 *
1165 * Returns: Returns the number of ready events which have been fetched, or an
1166 * error code, in case of error.
1167 */
1117static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1168static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1118 int maxevents, long timeout) 1169 int maxevents, long timeout)
1119{ 1170{
1120 int res, eavail, timed_out = 0; 1171 int res = 0, eavail, timed_out = 0;
1121 unsigned long flags; 1172 unsigned long flags;
1122 long slack; 1173 long slack = 0;
1123 wait_queue_t wait; 1174 wait_queue_t wait;
1124 struct timespec end_time;
1125 ktime_t expires, *to = NULL; 1175 ktime_t expires, *to = NULL;
1126 1176
1127 if (timeout > 0) { 1177 if (timeout > 0) {
1128 ktime_get_ts(&end_time); 1178 struct timespec end_time = ep_set_mstimeout(timeout);
1129 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); 1179
1130 slack = select_estimate_accuracy(&end_time); 1180 slack = select_estimate_accuracy(&end_time);
1131 to = &expires; 1181 to = &expires;
1132 *to = timespec_to_ktime(end_time); 1182 *to = timespec_to_ktime(end_time);
1133 } else if (timeout == 0) { 1183 } else if (timeout == 0) {
1184 /*
1185 * Avoid the unnecessary trip to the wait queue loop, if the
1186 * caller specified a non blocking operation.
1187 */
1134 timed_out = 1; 1188 timed_out = 1;
1189 spin_lock_irqsave(&ep->lock, flags);
1190 goto check_events;
1135 } 1191 }
1136 1192
1137retry: 1193fetch_events:
1138 spin_lock_irqsave(&ep->lock, flags); 1194 spin_lock_irqsave(&ep->lock, flags);
1139 1195
1140 res = 0; 1196 if (!ep_events_available(ep)) {
1141 if (list_empty(&ep->rdllist)) {
1142 /* 1197 /*
1143 * We don't have any available event to return to the caller. 1198 * We don't have any available event to return to the caller.
1144 * We need to sleep here, and we will be wake up by 1199 * We need to sleep here, and we will be wake up by
@@ -1154,7 +1209,7 @@ retry:
1154 * to TASK_INTERRUPTIBLE before doing the checks. 1209 * to TASK_INTERRUPTIBLE before doing the checks.
1155 */ 1210 */
1156 set_current_state(TASK_INTERRUPTIBLE); 1211 set_current_state(TASK_INTERRUPTIBLE);
1157 if (!list_empty(&ep->rdllist) || timed_out) 1212 if (ep_events_available(ep) || timed_out)
1158 break; 1213 break;
1159 if (signal_pending(current)) { 1214 if (signal_pending(current)) {
1160 res = -EINTR; 1215 res = -EINTR;
@@ -1171,8 +1226,9 @@ retry:
1171 1226
1172 set_current_state(TASK_RUNNING); 1227 set_current_state(TASK_RUNNING);
1173 } 1228 }
1229check_events:
1174 /* Is it worth to try to dig for events ? */ 1230 /* Is it worth to try to dig for events ? */
1175 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; 1231 eavail = ep_events_available(ep);
1176 1232
1177 spin_unlock_irqrestore(&ep->lock, flags); 1233 spin_unlock_irqrestore(&ep->lock, flags);
1178 1234
@@ -1183,11 +1239,67 @@ retry:
1183 */ 1239 */
1184 if (!res && eavail && 1240 if (!res && eavail &&
1185 !(res = ep_send_events(ep, events, maxevents)) && !timed_out) 1241 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1186 goto retry; 1242 goto fetch_events;
1187 1243
1188 return res; 1244 return res;
1189} 1245}
1190 1246
1247/**
1248 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1249 * API, to verify that adding an epoll file inside another
1250 * epoll structure, does not violate the constraints, in
1251 * terms of closed loops, or too deep chains (which can
1252 * result in excessive stack usage).
1253 *
1254 * @priv: Pointer to the epoll file to be currently checked.
1255 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1256 * data structure pointer.
1257 * @call_nests: Current dept of the @ep_call_nested() call stack.
1258 *
1259 * Returns: Returns zero if adding the epoll @file inside current epoll
1260 * structure @ep does not violate the constraints, or -1 otherwise.
1261 */
1262static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1263{
1264 int error = 0;
1265 struct file *file = priv;
1266 struct eventpoll *ep = file->private_data;
1267 struct rb_node *rbp;
1268 struct epitem *epi;
1269
1270 mutex_lock(&ep->mtx);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) {
1274 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1275 ep_loop_check_proc, epi->ffd.file,
1276 epi->ffd.file->private_data, current);
1277 if (error != 0)
1278 break;
1279 }
1280 }
1281 mutex_unlock(&ep->mtx);
1282
1283 return error;
1284}
1285
1286/**
1287 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1288 * another epoll file (represented by @ep) does not create
1289 * closed loops or too deep chains.
1290 *
1291 * @ep: Pointer to the epoll private data structure.
1292 * @file: Pointer to the epoll file to be checked.
1293 *
1294 * Returns: Returns zero if adding the epoll @file inside current epoll
1295 * structure @ep does not violate the constraints, or -1 otherwise.
1296 */
1297static int ep_loop_check(struct eventpoll *ep, struct file *file)
1298{
1299 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1300 ep_loop_check_proc, file, ep, current);
1301}
1302
1191/* 1303/*
1192 * Open an eventpoll file descriptor. 1304 * Open an eventpoll file descriptor.
1193 */ 1305 */
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1236 struct epoll_event __user *, event) 1348 struct epoll_event __user *, event)
1237{ 1349{
1238 int error; 1350 int error;
1351 int did_lock_epmutex = 0;
1239 struct file *file, *tfile; 1352 struct file *file, *tfile;
1240 struct eventpoll *ep; 1353 struct eventpoll *ep;
1241 struct epitem *epi; 1354 struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1277 */ 1390 */
1278 ep = file->private_data; 1391 ep = file->private_data;
1279 1392
1393 /*
1394 * When we insert an epoll file descriptor, inside another epoll file
1395 * descriptor, there is the change of creating closed loops, which are
1396 * better be handled here, than in more critical paths.
1397 *
1398 * We hold epmutex across the loop check and the insert in this case, in
1399 * order to prevent two separate inserts from racing and each doing the
1400 * insert "at the same time" such that ep_loop_check passes on both
1401 * before either one does the insert, thereby creating a cycle.
1402 */
1403 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1404 mutex_lock(&epmutex);
1405 did_lock_epmutex = 1;
1406 error = -ELOOP;
1407 if (ep_loop_check(ep, tfile) != 0)
1408 goto error_tgt_fput;
1409 }
1410
1411
1280 mutex_lock(&ep->mtx); 1412 mutex_lock(&ep->mtx);
1281 1413
1282 /* 1414 /*
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1312 mutex_unlock(&ep->mtx); 1444 mutex_unlock(&ep->mtx);
1313 1445
1314error_tgt_fput: 1446error_tgt_fput:
1447 if (unlikely(did_lock_epmutex))
1448 mutex_unlock(&epmutex);
1449
1315 fput(tfile); 1450 fput(tfile);
1316error_fput: 1451error_fput:
1317 fput(file); 1452 fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
1431 EP_ITEM_COST; 1566 EP_ITEM_COST;
1432 BUG_ON(max_user_watches < 0); 1567 BUG_ON(max_user_watches < 0);
1433 1568
1569 /*
1570 * Initialize the structure used to perform epoll file descriptor
1571 * inclusion loops checks.
1572 */
1573 ep_nested_calls_init(&poll_loop_ncalls);
1574
1434 /* Initialize the structure used to perform safe poll wait head wake ups */ 1575 /* Initialize the structure used to perform safe poll wait head wake ups */
1435 ep_nested_calls_init(&poll_safewake_ncalls); 1576 ep_nested_calls_init(&poll_safewake_ncalls);
1436 1577
diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..5e62d26a4fec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
721{ 724{
722 struct file *file; 725 struct file *file;
723 int err; 726 int err;
727 static const struct open_flags open_exec_flags = {
728 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
729 .acc_mode = MAY_EXEC | MAY_OPEN,
730 .intent = LOOKUP_OPEN
731 };
724 732
725 file = do_filp_open(AT_FDCWD, name, 733 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
726 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
727 MAY_EXEC | MAY_OPEN);
728 if (IS_ERR(file)) 734 if (IS_ERR(file))
729 goto out; 735 goto out;
730 736
@@ -1869,7 +1875,7 @@ static void wait_for_dump_helpers(struct file *file)
1869 1875
1870 1876
1871/* 1877/*
1872 * uhm_pipe_setup 1878 * umh_pipe_setup
1873 * helper function to customize the process used 1879 * helper function to customize the process used
1874 * to collect the core in userspace. Specifically 1880 * to collect the core in userspace. Specifically
1875 * it sets up a pipe and installs it as fd 0 (stdin) 1881 * it sets up a pipe and installs it as fd 0 (stdin)
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8b..5e74ad3d4009 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
54 54
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56/* Inode attrs */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 58# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 59# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 60# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
61/* Partition attrs */
62# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
63# define EXOFS_ATTR_SB_STATS 1
60 64
61/* 65/*
62 * The maximum number of files we can have is limited by the size of the 66 * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
86 */ 90 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; 91enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
88struct exofs_fscb { 92struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */ 93 __le64 s_nextid; /* Only used after mkfs */
90 __le64 s_numfiles; /* Number of files on fs */ 94 __le64 s_numfiles; /* Only used after mkfs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */ 95 __le32 s_version; /* == EXOFS_FSCB_VER */
92 __le16 s_magic; /* Magic signature */ 96 __le16 s_magic; /* Magic signature */
93 __le16 s_newfs; /* Non-zero if this is a new fs */ 97 __le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,6 +102,16 @@ struct exofs_fscb {
98} __packed; 102} __packed;
99 103
100/* 104/*
105 * This struct is set on the FS partition's attributes.
106 * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
107 * with the create command, to atomically persist the sb writeable information.
108 */
109struct exofs_sb_stats {
110 __le64 s_nextid; /* Highest object ID used */
111 __le64 s_numfiles; /* Number of files on fs */
112} __packed;
113
114/*
101 * Describes the raid used in the FS. It is part of the device table. 115 * Describes the raid used in the FS. It is part of the device table.
102 * This here is taken from the pNFS-objects definition. In exofs we 116 * This here is taken from the pNFS-objects definition. In exofs we
103 * use one raid policy through-out the filesystem. (NOTE: the funny 117 * use one raid policy through-out the filesystem. (NOTE: the funny
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d67..d0941c6a1f72 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
124 124
125Ebadsize: 125Ebadsize:
126 EXOFS_ERR("ERROR [exofs_check_page]: " 126 EXOFS_ERR("ERROR [exofs_check_page]: "
127 "size of directory #%lu is not a multiple of chunk size", 127 "size of directory(0x%lx) is not a multiple of chunk size\n",
128 dir->i_ino 128 dir->i_ino
129 ); 129 );
130 goto fail; 130 goto fail;
@@ -142,8 +142,8 @@ Espan:
142 goto bad_entry; 142 goto bad_entry;
143bad_entry: 143bad_entry:
144 EXOFS_ERR( 144 EXOFS_ERR(
145 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " 145 "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
146 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", 146 "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, 147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
148 _LLU(le64_to_cpu(p->inode_no)), 148 _LLU(le64_to_cpu(p->inode_no)),
149 rec_len, p->name_len); 149 rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
151Eend: 151Eend:
152 p = (struct exofs_dir_entry *)(kaddr + offs); 152 p = (struct exofs_dir_entry *)(kaddr + offs);
153 EXOFS_ERR("ERROR [exofs_check_page]: " 153 EXOFS_ERR("ERROR [exofs_check_page]: "
154 "entry in directory #%lu spans the page boundary" 154 "entry in directory(0x%lx) spans the page boundary"
155 "offset=%lu, inode=%llu", 155 "offset=%lu, inode=0x%llx\n",
156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, 156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
157 _LLU(le64_to_cpu(p->inode_no))); 157 _LLU(le64_to_cpu(p->inode_no)));
158fail: 158fail:
@@ -261,9 +261,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 struct page *page = exofs_get_page(inode, n); 261 struct page *page = exofs_get_page(inode, n);
262 262
263 if (IS_ERR(page)) { 263 if (IS_ERR(page)) {
264 EXOFS_ERR("ERROR: " 264 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
265 "bad page in #%lu", 265 inode->i_ino);
266 inode->i_ino);
267 filp->f_pos += PAGE_CACHE_SIZE - offset; 266 filp->f_pos += PAGE_CACHE_SIZE - offset;
268 return PTR_ERR(page); 267 return PTR_ERR(page);
269 } 268 }
@@ -283,7 +282,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283 for (; (char *)de <= limit; de = exofs_next_entry(de)) { 282 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
284 if (de->rec_len == 0) { 283 if (de->rec_len == 0) {
285 EXOFS_ERR("ERROR: " 284 EXOFS_ERR("ERROR: "
286 "zero-length directory entry"); 285 "zero-length entry in directory(0x%lx)\n",
286 inode->i_ino);
287 exofs_put_page(page); 287 exofs_put_page(page);
288 return -EIO; 288 return -EIO;
289 } 289 }
@@ -342,9 +342,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
342 kaddr += exofs_last_byte(dir, n) - reclen; 342 kaddr += exofs_last_byte(dir, n) - reclen;
343 while ((char *) de <= kaddr) { 343 while ((char *) de <= kaddr) {
344 if (de->rec_len == 0) { 344 if (de->rec_len == 0) {
345 EXOFS_ERR( 345 EXOFS_ERR("ERROR: zero-length entry in "
346 "ERROR: exofs_find_entry: " 346 "directory(0x%lx)\n",
347 "zero-length directory entry"); 347 dir->i_ino);
348 exofs_put_page(page); 348 exofs_put_page(page);
349 goto out; 349 goto out;
350 } 350 }
@@ -472,7 +472,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
472 } 472 }
473 if (de->rec_len == 0) { 473 if (de->rec_len == 0) {
474 EXOFS_ERR("ERROR: exofs_add_link: " 474 EXOFS_ERR("ERROR: exofs_add_link: "
475 "zero-length directory entry"); 475 "zero-length entry in directory(0x%lx)\n",
476 inode->i_ino);
476 err = -EIO; 477 err = -EIO;
477 goto out_unlock; 478 goto out_unlock;
478 } 479 }
@@ -491,7 +492,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
491 exofs_put_page(page); 492 exofs_put_page(page);
492 } 493 }
493 494
494 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); 495 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
496 dentry, inode->i_ino);
495 return -EINVAL; 497 return -EINVAL;
496 498
497got_it: 499got_it:
@@ -542,7 +544,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
542 while (de < dir) { 544 while (de < dir) {
543 if (de->rec_len == 0) { 545 if (de->rec_len == 0) {
544 EXOFS_ERR("ERROR: exofs_delete_entry:" 546 EXOFS_ERR("ERROR: exofs_delete_entry:"
545 "zero-length directory entry"); 547 "zero-length entry in directory(0x%lx)\n",
548 inode->i_ino);
546 err = -EIO; 549 err = -EIO;
547 goto out; 550 goto out;
548 } 551 }
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa1010..c965806c2821 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
77 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
78 */ 78 */
79struct exofs_sb_info { 79struct exofs_sb_info {
80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
83 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout,
260 unsigned expected_pages);
259int exofs_setattr(struct dentry *, struct iattr *); 261int exofs_setattr(struct dentry *, struct iattr *);
260int exofs_write_begin(struct file *file, struct address_space *mapping, 262int exofs_write_begin(struct file *file, struct address_space *mapping,
261 loff_t pos, unsigned len, unsigned flags, 263 loff_t pos, unsigned len, unsigned flags,
@@ -279,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
279 struct inode *); 281 struct inode *);
280 282
281/* super.c */ 283/* super.c */
282int exofs_sync_fs(struct super_block *sb, int wait); 284int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
283 285
284/********************* 286/*********************
285 * operation vectors * 287 * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0a..45ca323d8363 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,22 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
45static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host;
49 struct super_block *sb;
50
51 if (!(inode->i_state & I_DIRTY))
52 return 0;
53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
54 return 0;
55
56 ret = sync_inode_metadata(inode, 1);
57
58 /* This is a good place to write the sb */
59 /* TODO: Sechedule an sb-sync on create */
60 sb = inode->i_sb;
61 if (sb->s_dirt)
62 exofs_sync_fs(sb, 1);
63 48
49 ret = sync_inode_metadata(filp->f_mapping->host, 1);
64 return ret; 50 return ret;
65} 51}
66 52
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..8472c098445d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout,
47 unsigned expected_pages)
48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50
51 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages,
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages;
55}
56
46struct page_collect { 57struct page_collect {
47 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
48 struct inode *inode; 59 struct inode *inode;
@@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol)
97 108
98static int pcol_try_alloc(struct page_collect *pcol) 109static int pcol_try_alloc(struct page_collect *pcol)
99{ 110{
100 unsigned pages = min_t(unsigned, pcol->expected_pages, 111 unsigned pages;
101 MAX_PAGES_KMALLOC);
102 112
103 if (!pcol->ios) { /* First time allocate io_state */ 113 if (!pcol->ios) { /* First time allocate io_state */
104 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
108 } 118 }
109 119
110 /* TODO: easily support bio chaining */ 120 /* TODO: easily support bio chaining */
111 pages = min_t(unsigned, pages, 121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
112 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
113 122
114 for (; pages; pages >>= 1) { 123 for (; pages; pages >>= 1) {
115 pcol->pages = kmalloc(pages * sizeof(struct page *), 124 pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -350,8 +359,10 @@ static int readpage_strip(void *data, struct page *page)
350 359
351 if (!pcol->read_4_write) 360 if (!pcol->read_4_write)
352 unlock_page(page); 361 unlock_page(page);
353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 362 EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
354 " splitting\n", inode->i_ino, page->index); 363 "read_4_write=%d index=0x%lx end_index=0x%lx "
364 "splitting\n", inode->i_ino, len,
365 pcol->read_4_write, page->index, end_index);
355 366
356 return read_exec(pcol); 367 return read_exec(pcol);
357 } 368 }
@@ -722,11 +733,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
722 733
723 /* read modify write */ 734 /* read modify write */
724 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 735 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
736 loff_t i_size = i_size_read(mapping->host);
737 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
738 size_t rlen;
739
740 if (page->index < end_index)
741 rlen = PAGE_CACHE_SIZE;
742 else if (page->index == end_index)
743 rlen = i_size & ~PAGE_CACHE_MASK;
744 else
745 rlen = 0;
746
747 if (!rlen) {
748 clear_highpage(page);
749 SetPageUptodate(page);
750 goto out;
751 }
752
725 ret = _readpage(page, true); 753 ret = _readpage(page, true);
726 if (ret) { 754 if (ret) {
727 /*SetPageError was done by _readpage. Is it ok?*/ 755 /*SetPageError was done by _readpage. Is it ok?*/
728 unlock_page(page); 756 unlock_page(page);
729 EXOFS_DBGMSG("__readpage_filler failed\n"); 757 EXOFS_DBGMSG("__readpage failed\n");
730 } 758 }
731 } 759 }
732out: 760out:
@@ -795,7 +823,6 @@ const struct address_space_operations exofs_aops = {
795 .direct_IO = NULL, /* TODO: Should be trivial to do */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */
796 824
797 /* With these NULL has special meaning or default is not exported */ 825 /* With these NULL has special meaning or default is not exported */
798 .sync_page = NULL,
799 .get_xip_mem = NULL, 826 .get_xip_mem = NULL,
800 .migratepage = NULL, 827 .migratepage = NULL,
801 .launder_page = NULL, 828 .launder_page = NULL,
@@ -1074,6 +1101,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1074 } 1101 }
1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1102 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1076} 1103}
1104
1077/* 1105/*
1078 * Callback function from exofs_new_inode(). The important thing is that we 1106 * Callback function from exofs_new_inode(). The important thing is that we
1079 * set the obj_created flag so that other methods know that the object exists on 1107 * set the obj_created flag so that other methods know that the object exists on
@@ -1132,7 +1160,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1132 sbi = sb->s_fs_info; 1160 sbi = sb->s_fs_info;
1133 1161
1134 inode->i_mapping->backing_dev_info = sb->s_bdi; 1162 inode->i_mapping->backing_dev_info = sb->s_bdi;
1135 sb->s_dirt = 1;
1136 inode_init_owner(inode, dir, mode); 1163 inode_init_owner(inode, dir, mode);
1137 inode->i_ino = sbi->s_nextid++; 1164 inode->i_ino = sbi->s_nextid++;
1138 inode->i_blkbits = EXOFS_BLKSHIFT; 1165 inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1143,6 +1170,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1143 spin_unlock(&sbi->s_next_gen_lock); 1170 spin_unlock(&sbi->s_next_gen_lock);
1144 insert_inode_hash(inode); 1171 insert_inode_hash(inode);
1145 1172
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174
1146 mark_inode_dirty(inode); 1175 mark_inode_dirty(inode);
1147 1176
1148 ret = exofs_get_io_state(&sbi->layout, &ios); 1177 ret = exofs_get_io_state(&sbi->layout, &ios);
@@ -1273,7 +1302,8 @@ out:
1273 1302
1274int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1303int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1275{ 1304{
1276 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1305 /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
1306 return exofs_update_inode(inode, 1);
1277} 1307}
1278 1308
1279/* 1309/*
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page); 272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
273 if (!new_de) 273 if (!new_de)
274 goto out_dir; 274 goto out_dir;
275 inode_inc_link_count(old_inode);
276 err = exofs_set_link(new_dir, new_de, new_page, old_inode); 275 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
277 new_inode->i_ctime = CURRENT_TIME; 276 new_inode->i_ctime = CURRENT_TIME;
278 if (dir_de) 277 if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 if (new_dir->i_nlink >= EXOFS_LINK_MAX) 285 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
287 goto out_dir; 286 goto out_dir;
288 } 287 }
289 inode_inc_link_count(old_inode);
290 err = exofs_add_link(new_dentry, old_inode); 288 err = exofs_add_link(new_dentry, old_inode);
291 if (err) { 289 if (err)
292 inode_dec_link_count(old_inode);
293 goto out_dir; 290 goto out_dir;
294 }
295 if (dir_de) 291 if (dir_de)
296 inode_inc_link_count(new_dir); 292 inode_inc_link_count(new_dir);
297 } 293 }
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
299 old_inode->i_ctime = CURRENT_TIME; 295 old_inode->i_ctime = CURRENT_TIME;
300 296
301 exofs_delete_entry(old_de, old_page); 297 exofs_delete_entry(old_de, old_page);
302 inode_dec_link_count(old_inode); 298 mark_inode_dirty(old_inode);
303 299
304 if (dir_de) { 300 if (dir_de) {
305 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); 301 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b381..06065bd37fc3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -48,6 +48,7 @@
48 * struct to hold what we get from mount options 48 * struct to hold what we get from mount options
49 */ 49 */
50struct exofs_mountopt { 50struct exofs_mountopt {
51 bool is_osdname;
51 const char *dev_name; 52 const char *dev_name;
52 uint64_t pid; 53 uint64_t pid;
53 int timeout; 54 int timeout;
@@ -56,7 +57,7 @@ struct exofs_mountopt {
56/* 57/*
57 * exofs-specific mount-time options. 58 * exofs-specific mount-time options.
58 */ 59 */
59enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; 60enum { Opt_name, Opt_pid, Opt_to, Opt_err };
60 61
61/* 62/*
62 * Our mount-time options. These should ideally be 64-bit unsigned, but the 63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
@@ -64,6 +65,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
64 * sufficient for most applications now. 65 * sufficient for most applications now.
65 */ 66 */
66static match_table_t tokens = { 67static match_table_t tokens = {
68 {Opt_name, "osdname=%s"},
67 {Opt_pid, "pid=%u"}, 69 {Opt_pid, "pid=%u"},
68 {Opt_to, "to=%u"}, 70 {Opt_to, "to=%u"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
@@ -94,6 +96,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
94 96
95 token = match_token(p, tokens, args); 97 token = match_token(p, tokens, args);
96 switch (token) { 98 switch (token) {
99 case Opt_name:
100 opts->dev_name = match_strdup(&args[0]);
101 if (unlikely(!opts->dev_name)) {
102 EXOFS_ERR("Error allocating dev_name");
103 return -ENOMEM;
104 }
105 opts->is_osdname = true;
106 break;
97 case Opt_pid: 107 case Opt_pid:
98 if (0 == match_strlcpy(str, &args[0], sizeof(str))) 108 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
99 return -EINVAL; 109 return -EINVAL;
@@ -203,6 +213,101 @@ static void destroy_inodecache(void)
203static const struct super_operations exofs_sops; 213static const struct super_operations exofs_sops;
204static const struct export_operations exofs_export_ops; 214static const struct export_operations exofs_export_ops;
205 215
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA,
218 EXOFS_ATTR_SB_STATS,
219 sizeof(struct exofs_sb_stats));
220
221static int __sbi_read_stats(struct exofs_sb_info *sbi)
222{
223 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats,
225 };
226 struct exofs_io_state *ios;
227 int ret;
228
229 ret = exofs_get_io_state(&sbi->layout, &ios);
230 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
232 return ret;
233 }
234
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs);
239
240 ret = exofs_sbi_read(ios);
241 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out;
244 }
245
246 ret = extract_attr_from_ios(ios, &attrs[0]);
247 if (ret) {
248 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
249 goto out;
250 }
251 if (attrs[0].len) {
252 struct exofs_sb_stats *ess;
253
254 if (unlikely(attrs[0].len != sizeof(*ess))) {
255 EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
256 "size(%d) != expected(%zd)\n",
257 __func__, attrs[0].len, sizeof(*ess));
258 goto out;
259 }
260
261 ess = attrs[0].val_ptr;
262 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
263 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
264 }
265
266out:
267 exofs_put_io_state(ios);
268 return ret;
269}
270
271static void stats_done(struct exofs_io_state *ios, void *p)
272{
273 exofs_put_io_state(ios);
274 /* Good thanks nothing to do anymore */
275}
276
277/* Asynchronously write the stats attribute */
278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
279{
280 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats,
282 };
283 struct exofs_io_state *ios;
284 int ret;
285
286 ret = exofs_get_io_state(&sbi->layout, &ios);
287 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
289 return ret;
290 }
291
292 sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess;
295
296 ios->cred = sbi->s_cred;
297 ios->done = stats_done;
298 ios->private = sbi;
299 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs);
301
302 ret = exofs_sbi_write(ios);
303 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
305 exofs_put_io_state(ios);
306 }
307
308 return ret;
309}
310
206/* 311/*
207 * Write the superblock to the OSD 312 * Write the superblock to the OSD
208 */ 313 */
@@ -213,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
213 struct exofs_io_state *ios; 318 struct exofs_io_state *ios;
214 int ret = -ENOMEM; 319 int ret = -ENOMEM;
215 320
216 lock_super(sb); 321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
322 if (unlikely(!fscb))
323 return -ENOMEM;
324
217 sbi = sb->s_fs_info; 325 sbi = sb->s_fs_info;
218 fscb = &sbi->s_fscb;
219 326
327 /* NOTE: We no longer dirty the super_block anywhere in exofs. The
328 * reason we write the fscb here on unmount is so we can stay backwards
329 * compatible with fscb->s_version == 1. (What we are not compatible
330 * with is if a new version FS crashed and then we try to mount an old
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above.
333 */
220 ret = exofs_get_io_state(&sbi->layout, &ios); 334 ret = exofs_get_io_state(&sbi->layout, &ios);
221 if (ret) 335 if (unlikely(ret))
222 goto out; 336 goto out;
223 337
224 /* Note: We only write the changing part of the fscb. .i.e upto the 338 lock_super(sb);
225 * the fscb->s_dev_table_oid member. There is no read-modify-write 339
226 * here.
227 */
228 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 340 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
229 memset(fscb, 0, ios->length); 341 memset(fscb, 0, ios->length);
230 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 342 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -239,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
239 ios->cred = sbi->s_cred; 351 ios->cred = sbi->s_cred;
240 352
241 ret = exofs_sbi_write(ios); 353 ret = exofs_sbi_write(ios);
242 if (unlikely(ret)) { 354 if (unlikely(ret))
243 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
244 goto out; 356 else
245 } 357 sb->s_dirt = 0;
246 sb->s_dirt = 0; 358
247 359
360 unlock_super(sb);
248out: 361out:
249 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
250 exofs_put_io_state(ios); 363 exofs_put_io_state(ios);
251 unlock_super(sb); 364 kfree(fscb);
252 return ret; 365 return ret;
253} 366}
254 367
@@ -292,13 +405,14 @@ static void exofs_put_super(struct super_block *sb)
292 int num_pend; 405 int num_pend;
293 struct exofs_sb_info *sbi = sb->s_fs_info; 406 struct exofs_sb_info *sbi = sb->s_fs_info;
294 407
295 if (sb->s_dirt)
296 exofs_write_super(sb);
297
298 /* make sure there are no pending commands */ 408 /* make sure there are no pending commands */
299 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 409 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
300 num_pend = atomic_read(&sbi->s_curr_pending)) { 410 num_pend = atomic_read(&sbi->s_curr_pending)) {
301 wait_queue_head_t wq; 411 wait_queue_head_t wq;
412
413 printk(KERN_NOTICE "%s: !!Pending operations in flight. "
414 "This is a BUG. please report to osd-dev@open-osd.org\n",
415 __func__);
302 init_waitqueue_head(&wq); 416 init_waitqueue_head(&wq);
303 wait_event_timeout(wq, 417 wait_event_timeout(wq,
304 (atomic_read(&sbi->s_curr_pending) == 0), 418 (atomic_read(&sbi->s_curr_pending) == 0),
@@ -390,6 +504,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
390 return 0; 504 return 0;
391} 505}
392 506
507static unsigned __ra_pages(struct exofs_layout *layout)
508{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit /
511 PAGE_SIZE;
512 unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
513
514 ra_pages *= 2; /* two stripes */
515 if (ra_pages < _MIN_RA)
516 ra_pages = roundup(_MIN_RA, ra_pages / 2);
517
518 if (ra_pages > max_io_pages)
519 ra_pages = max_io_pages;
520
521 return ra_pages;
522}
523
393/* @odi is valid only as long as @fscb_dev is valid */ 524/* @odi is valid only as long as @fscb_dev is valid */
394static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, 525static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
395 struct osd_dev_info *odi) 526 struct osd_dev_info *odi)
@@ -495,7 +626,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
495 } 626 }
496 627
497 od = osduld_info_lookup(&odi); 628 od = osduld_info_lookup(&odi);
498 if (unlikely(IS_ERR(od))) { 629 if (IS_ERR(od)) {
499 ret = PTR_ERR(od); 630 ret = PTR_ERR(od);
500 EXOFS_ERR("ERROR: device requested is not found " 631 EXOFS_ERR("ERROR: device requested is not found "
501 "osd_name-%s =>%d\n", odi.osdname, ret); 632 "osd_name-%s =>%d\n", odi.osdname, ret);
@@ -558,9 +689,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
558 goto free_bdi; 689 goto free_bdi;
559 690
560 /* use mount options to fill superblock */ 691 /* use mount options to fill superblock */
561 od = osduld_path_lookup(opts->dev_name); 692 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0};
694
695 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi);
698 } else {
699 od = osduld_path_lookup(opts->dev_name);
700 }
562 if (IS_ERR(od)) { 701 if (IS_ERR(od)) {
563 ret = PTR_ERR(od); 702 ret = -EINVAL;
564 goto free_sbi; 703 goto free_sbi;
565 } 704 }
566 705
@@ -594,6 +733,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
594 goto free_sbi; 733 goto free_sbi;
595 734
596 sb->s_magic = le16_to_cpu(fscb.s_magic); 735 sb->s_magic = le16_to_cpu(fscb.s_magic);
736 /* NOTE: we read below to be backward compatible with old versions */
597 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 737 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
598 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); 738 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
599 739
@@ -604,7 +744,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
604 ret = -EINVAL; 744 ret = -EINVAL;
605 goto free_sbi; 745 goto free_sbi;
606 } 746 }
607 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { 747 if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
608 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", 748 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
609 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); 749 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
610 ret = -EINVAL; 750 ret = -EINVAL;
@@ -622,7 +762,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
622 goto free_sbi; 762 goto free_sbi;
623 } 763 }
624 764
765 __sbi_read_stats(sbi);
766
625 /* set up operation vectors */ 767 /* set up operation vectors */
768 sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
626 sb->s_bdi = &sbi->bdi; 769 sb->s_bdi = &sbi->bdi;
627 sb->s_fs_info = sbi; 770 sb->s_fs_info = sbi;
628 sb->s_op = &exofs_sops; 771 sb->s_op = &exofs_sops;
@@ -652,6 +795,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
652 795
653 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
654 sbi->layout.s_pid); 797 sbi->layout.s_pid);
798 if (opts->is_osdname)
799 kfree(opts->dev_name);
655 return 0; 800 return 0;
656 801
657free_sbi: 802free_sbi:
@@ -660,6 +805,8 @@ free_bdi:
660 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
661 opts->dev_name, sbi->layout.s_pid, ret); 806 opts->dev_name, sbi->layout.s_pid, ret);
662 exofs_free_sbi(sbi); 807 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
663 return ret; 810 return ret;
664} 811}
665 812
@@ -677,7 +824,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
677 if (ret) 824 if (ret)
678 return ERR_PTR(ret); 825 return ERR_PTR(ret);
679 826
680 opts.dev_name = dev_name; 827 if (!opts.dev_name)
828 opts.dev_name = dev_name;
681 return mount_nodev(type, flags, &opts, exofs_fill_super); 829 return mount_nodev(type, flags, &opts, exofs_fill_super);
682} 830}
683 831
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
320 struct inode * inode = dentry->d_inode; 320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 321 int len = *max_len;
322 int type = FILEID_INO32_GEN; 322 int type = FILEID_INO32_GEN;
323 323
324 if (len < 2 || (connectable && len < 4)) 324 if (connectable && (len < 4)) {
325 *max_len = 4;
326 return 255;
327 } else if (len < 2) {
328 *max_len = 2;
325 return 255; 329 return 255;
330 }
326 331
327 len = 2; 332 len = 2;
328 fid->i32.ino = inode->i_ino; 333 fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
369 /* 374 /*
370 * Try to get any dentry for the given file handle from the filesystem. 375 * Try to get any dentry for the given file handle from the filesystem.
371 */ 376 */
377 if (!nop || !nop->fh_to_dentry)
378 return ERR_PTR(-ESTALE);
372 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 379 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
373 if (!result) 380 if (!result)
374 result = ERR_PTR(-ESTALE); 381 result = ERR_PTR(-ESTALE);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7b4180554a62..abea5a17c764 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
406 return -EINVAL; 406 return -EINVAL;
407 if (!test_opt(dentry->d_sb, POSIX_ACL)) 407 if (!test_opt(dentry->d_sb, POSIX_ACL))
408 return -EOPNOTSUPP; 408 return -EOPNOTSUPP;
409 if (!is_owner_or_cap(dentry->d_inode)) 409 if (!inode_owner_or_capable(dentry->d_inode))
410 return -EPERM; 410 return -EPERM;
411 411
412 if (value) { 412 if (value) {
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf326..645be9e7ee47 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); 110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
111 111
112/* ialloc.c */ 112/* ialloc.c */
113extern struct inode * ext2_new_inode (struct inode *, int); 113extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
114extern void ext2_free_inode (struct inode *); 114extern void ext2_free_inode (struct inode *);
115extern unsigned long ext2_count_free_inodes (struct super_block *); 115extern unsigned long ext2_count_free_inodes (struct super_block *);
116extern void ext2_check_inodes_bitmap (struct super_block *); 116extern void ext2_check_inodes_bitmap (struct super_block *);
@@ -174,3 +174,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + 174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); 175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
176} 176}
177
178#define ext2_set_bit __test_and_set_bit_le
179#define ext2_clear_bit __test_and_clear_bit_le
180#define ext2_test_bit test_bit_le
181#define ext2_find_first_zero_bit find_first_zero_bit_le
182#define ext2_find_next_zero_bit find_next_zero_bit_le
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabff..ee9ed31948e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
429 return group; 429 return group;
430} 430}
431 431
432struct inode *ext2_new_inode(struct inode *dir, int mode) 432struct inode *ext2_new_inode(struct inode *dir, int mode,
433 const struct qstr *qstr)
433{ 434{
434 struct super_block *sb; 435 struct super_block *sb;
435 struct buffer_head *bitmap_bh = NULL; 436 struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
585 if (err) 586 if (err)
586 goto fail_free_drop; 587 goto fail_free_drop;
587 588
588 err = ext2_init_security(inode,dir); 589 err = ext2_init_security(inode, dir, qstr);
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..c47f706878b5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .sync_page = block_sync_page,
864 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
865 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
866 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
880 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
881 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
882 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
883 .sync_page = block_sync_page,
884 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
885 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
886 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e7431309bdca..f81e250ac5c4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
39 if (ret) 39 if (ret)
40 return ret; 40 return ret;
41 41
42 if (!is_owner_or_cap(inode)) { 42 if (!inode_owner_or_capable(inode)) {
43 ret = -EACCES; 43 ret = -EACCES;
44 goto setflags_out; 44 goto setflags_out;
45 } 45 }
@@ -89,7 +89,7 @@ setflags_out:
89 case EXT2_IOC_GETVERSION: 89 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 90 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 91 case EXT2_IOC_SETVERSION:
92 if (!is_owner_or_cap(inode)) 92 if (!inode_owner_or_capable(inode))
93 return -EPERM; 93 return -EPERM;
94 ret = mnt_want_write(filp->f_path.mnt); 94 ret = mnt_want_write(filp->f_path.mnt);
95 if (ret) 95 if (ret)
@@ -115,7 +115,7 @@ setflags_out:
115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
116 return -ENOTTY; 116 return -ENOTTY;
117 117
118 if (!is_owner_or_cap(inode)) 118 if (!inode_owner_or_capable(inode))
119 return -EACCES; 119 return -EACCES;
120 120
121 if (get_user(rsv_window_size, (int __user *)arg)) 121 if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..ed5c5d496ee9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
104 104
105 dquot_initialize(dir); 105 dquot_initialize(dir);
106 106
107 inode = ext2_new_inode(dir, mode); 107 inode = ext2_new_inode(dir, mode, &dentry->d_name);
108 if (IS_ERR(inode)) 108 if (IS_ERR(inode))
109 return PTR_ERR(inode); 109 return PTR_ERR(inode);
110 110
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
133 133
134 dquot_initialize(dir); 134 dquot_initialize(dir);
135 135
136 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode, &dentry->d_name);
137 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
138 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
139 init_special_inode(inode, inode->i_mode, rdev); 139 init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
159 159
160 dquot_initialize(dir); 160 dquot_initialize(dir);
161 161
162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
163 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
164 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
165 goto out; 165 goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
230 230
231 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
232 232
233 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
234 err = PTR_ERR(inode); 234 err = PTR_ERR(inode);
235 if (IS_ERR(inode)) 235 if (IS_ERR(inode))
236 goto out_dir; 236 goto out_dir;
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); 344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
345 if (!new_de) 345 if (!new_de)
346 goto out_dir; 346 goto out_dir;
347 inode_inc_link_count(old_inode);
348 ext2_set_link(new_dir, new_de, new_page, old_inode, 1); 347 ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
349 new_inode->i_ctime = CURRENT_TIME_SEC; 348 new_inode->i_ctime = CURRENT_TIME_SEC;
350 if (dir_de) 349 if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
356 if (new_dir->i_nlink >= EXT2_LINK_MAX) 355 if (new_dir->i_nlink >= EXT2_LINK_MAX)
357 goto out_dir; 356 goto out_dir;
358 } 357 }
359 inode_inc_link_count(old_inode);
360 err = ext2_add_link(new_dentry, old_inode); 358 err = ext2_add_link(new_dentry, old_inode);
361 if (err) { 359 if (err)
362 inode_dec_link_count(old_inode);
363 goto out_dir; 360 goto out_dir;
364 }
365 if (dir_de) 361 if (dir_de)
366 inode_inc_link_count(new_dir); 362 inode_inc_link_count(new_dir);
367 } 363 }
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
369 /* 365 /*
370 * Like most other Unix systems, set the ctime for inodes on a 366 * Like most other Unix systems, set the ctime for inodes on a
371 * rename. 367 * rename.
372 * inode_dec_link_count() will mark the inode dirty.
373 */ 368 */
374 old_inode->i_ctime = CURRENT_TIME_SEC; 369 old_inode->i_ctime = CURRENT_TIME_SEC;
370 mark_inode_dirty(old_inode);
375 371
376 ext2_delete_entry (old_de, old_page); 372 ext2_delete_entry (old_de, old_page);
377 inode_dec_link_count(old_inode);
378 373
379 if (dir_de) { 374 if (dir_de) {
380 if (old_dir != new_dir) 375 if (old_dir != new_dir)
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c2184616..5e41cccff762 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
116# endif /* CONFIG_EXT2_FS_XATTR */ 116# endif /* CONFIG_EXT2_FS_XATTR */
117 117
118#ifdef CONFIG_EXT2_FS_SECURITY 118#ifdef CONFIG_EXT2_FS_SECURITY
119extern int ext2_init_security(struct inode *inode, struct inode *dir); 119extern int ext2_init_security(struct inode *inode, struct inode *dir,
120 const struct qstr *qstr);
120#else 121#else
121static inline int ext2_init_security(struct inode *inode, struct inode *dir) 122static inline int ext2_init_security(struct inode *inode, struct inode *dir,
123 const struct qstr *qstr)
122{ 124{
123 return 0; 125 return 0;
124} 126}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da5..5d979b4347b0 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
47} 47}
48 48
49int 49int
50ext2_init_security(struct inode *inode, struct inode *dir) 50ext2_init_security(struct inode *inode, struct inode *dir,
51 const struct qstr *qstr)
51{ 52{
52 int err; 53 int err;
53 size_t len; 54 size_t len;
54 void *value; 55 void *value;
55 char *name; 56 char *name;
56 57
57 err = security_inode_init_security(inode, dir, &name, &value, &len); 58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
58 if (err) { 59 if (err) {
59 if (err == -EOPNOTSUPP) 60 if (err == -EOPNOTSUPP)
60 return 0; 61 return 0;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e4fa49e6c539..9d021c0d472a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
435 return -EINVAL; 435 return -EINVAL;
436 if (!test_opt(inode->i_sb, POSIX_ACL)) 436 if (!test_opt(inode->i_sb, POSIX_ACL))
437 return -EOPNOTSUPP; 437 return -EOPNOTSUPP;
438 if (!is_owner_or_cap(inode)) 438 if (!inode_owner_or_capable(inode))
439 return -EPERM; 439 return -EPERM;
440 440
441 if (value) { 441 if (value) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..153242187fce 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1991 spin_unlock(sb_bgl_lock(sbi, group)); 1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); 1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993 1993
1994 free_blocks -= next - start;
1994 /* Do not issue a TRIM on extents smaller than minblocks */ 1995 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks) 1996 if ((next - start) < minblocks)
1996 goto free_extent; 1997 goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
2040 cond_resched(); 2041 cond_resched();
2041 2042
2042 /* No more suitable extents */ 2043 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks) 2044 if (free_blocks < minblocks)
2044 break; 2045 break;
2045 } 2046 }
2046 2047
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); 2091 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0; 2092 int ret = 0;
2092 2093
2093 start = range->start >> sb->s_blocksize_bits; 2094 start = (range->start >> sb->s_blocksize_bits) +
2095 le32_to_cpu(es->s_first_data_block);
2094 len = range->len >> sb->s_blocksize_bits; 2096 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits; 2097 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0; 2098 trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2099 return -EINVAL; 2101 return -EINVAL;
2100 if (start >= max_blks) 2102 if (start >= max_blks)
2101 goto out; 2103 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks) 2104 if (start + len > max_blks)
2107 len = max_blks - start; 2105 len = max_blks - start;
2108 2106
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2129 if (free_blocks < minlen) 2127 if (free_blocks < minlen)
2130 continue; 2128 continue;
2131 2129
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb)) 2130 /*
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); 2131 * For all the groups except the last one, last block will
2134 else 2132 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133 * change it for the last group in which case first_block +
2134 * len < EXT3_BLOCKS_PER_GROUP(sb).
2135 */
2136 if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2135 last_block = first_block + len; 2137 last_block = first_block + len;
2138 len -= last_block - first_block;
2136 2139
2137 ret = ext3_trim_all_free(sb, group, first_block, 2140 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen); 2141 last_block, minlen);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef22460..bfc2dc43681d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
404 * For other inodes, search forward from the parent directory's block 404 * For other inodes, search forward from the parent directory's block
405 * group to find a free inode. 405 * group to find a free inode.
406 */ 406 */
407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) 407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
408 const struct qstr *qstr, int mode)
408{ 409{
409 struct super_block *sb; 410 struct super_block *sb;
410 struct buffer_head *bitmap_bh = NULL; 411 struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
592 err = ext3_init_security(handle,inode, dir); 593 err = ext3_init_security(handle, inode, dir, qstr);
593 if (err) 594 if (err)
594 goto fail_free_drop; 595 goto fail_free_drop;
595 596
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..fe2541d250e4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .sync_page = block_sync_page,
1898 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1899 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1900 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
1910 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1911 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1912 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1913 .sync_page = block_sync_page,
1914 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1915 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1916 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
1926 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1927 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1928 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1929 .sync_page = block_sync_page,
1930 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1931 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1932 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index fc080dd561f7..f4090bd2f345 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -123,7 +123,7 @@ flags_out:
123 __u32 generation; 123 __u32 generation;
124 int err; 124 int err;
125 125
126 if (!is_owner_or_cap(inode)) 126 if (!inode_owner_or_capable(inode))
127 return -EPERM; 127 return -EPERM;
128 128
129 err = mnt_want_write(filp->f_path.mnt); 129 err = mnt_want_write(filp->f_path.mnt);
@@ -192,7 +192,7 @@ setversion_out:
192 if (err) 192 if (err)
193 return err; 193 return err;
194 194
195 if (!is_owner_or_cap(inode)) { 195 if (!inode_owner_or_capable(inode)) {
196 err = -EACCES; 196 err = -EACCES;
197 goto setrsvsz_out; 197 goto setrsvsz_out;
198 } 198 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1540 goto cleanup; 1540 goto cleanup;
1541 node2 = (struct dx_node *)(bh2->b_data); 1541 node2 = (struct dx_node *)(bh2->b_data);
1542 entries2 = node2->entries; 1542 entries2 = node2->entries;
1543 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1543 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); 1544 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1544 node2->fake.inode = 0;
1545 BUFFER_TRACE(frame->bh, "get_write_access"); 1545 BUFFER_TRACE(frame->bh, "get_write_access");
1546 err = ext3_journal_get_write_access(handle, frame->bh); 1546 err = ext3_journal_get_write_access(handle, frame->bh);
1547 if (err) 1547 if (err)
@@ -1710,7 +1710,7 @@ retry:
1710 if (IS_DIRSYNC(dir)) 1710 if (IS_DIRSYNC(dir))
1711 handle->h_sync = 1; 1711 handle->h_sync = 1;
1712 1712
1713 inode = ext3_new_inode (handle, dir, mode); 1713 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1714 err = PTR_ERR(inode); 1714 err = PTR_ERR(inode);
1715 if (!IS_ERR(inode)) { 1715 if (!IS_ERR(inode)) {
1716 inode->i_op = &ext3_file_inode_operations; 1716 inode->i_op = &ext3_file_inode_operations;
@@ -1746,7 +1746,7 @@ retry:
1746 if (IS_DIRSYNC(dir)) 1746 if (IS_DIRSYNC(dir))
1747 handle->h_sync = 1; 1747 handle->h_sync = 1;
1748 1748
1749 inode = ext3_new_inode (handle, dir, mode); 1749 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1750 err = PTR_ERR(inode); 1750 err = PTR_ERR(inode);
1751 if (!IS_ERR(inode)) { 1751 if (!IS_ERR(inode)) {
1752 init_special_inode(inode, inode->i_mode, rdev); 1752 init_special_inode(inode, inode->i_mode, rdev);
@@ -1784,7 +1784,7 @@ retry:
1784 if (IS_DIRSYNC(dir)) 1784 if (IS_DIRSYNC(dir))
1785 handle->h_sync = 1; 1785 handle->h_sync = 1;
1786 1786
1787 inode = ext3_new_inode (handle, dir, S_IFDIR | mode); 1787 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1788 err = PTR_ERR(inode); 1788 err = PTR_ERR(inode);
1789 if (IS_ERR(inode)) 1789 if (IS_ERR(inode))
1790 goto out_stop; 1790 goto out_stop;
@@ -2206,7 +2206,7 @@ retry:
2206 if (IS_DIRSYNC(dir)) 2206 if (IS_DIRSYNC(dir))
2207 handle->h_sync = 1; 2207 handle->h_sync = 1;
2208 2208
2209 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2209 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2210 err = PTR_ERR(inode); 2210 err = PTR_ERR(inode);
2211 if (IS_ERR(inode)) 2211 if (IS_ERR(inode))
2212 goto out_stop; 2212 goto out_stop;
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..071689f86e18 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1464 return; 1464 return;
1465 } 1465 }
1466 1466
1467 /* Check if feature set allows readwrite operations */
1468 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1469 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1470 "unknown ROCOMPAT features");
1471 return;
1472 }
1473
1467 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1474 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1468 if (es->s_last_orphan) 1475 if (es->s_last_orphan)
1469 jbd_debug(1, "Errors on filesystem, " 1476 jbd_debug(1, "Errors on filesystem, "
@@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1943 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1944 sb->dq_op = &ext3_quota_operations;
1938#endif 1945#endif
1946 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1947 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1948 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1949 mutex_init(&sbi->s_resize_lock);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe7201169..2be4f69bfa64 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
128 128
129#ifdef CONFIG_EXT3_FS_SECURITY 129#ifdef CONFIG_EXT3_FS_SECURITY
130extern int ext3_init_security(handle_t *handle, struct inode *inode, 130extern int ext3_init_security(handle_t *handle, struct inode *inode,
131 struct inode *dir); 131 struct inode *dir, const struct qstr *qstr);
132#else 132#else
133static inline int ext3_init_security(handle_t *handle, struct inode *inode, 133static inline int ext3_init_security(handle_t *handle, struct inode *inode,
134 struct inode *dir) 134 struct inode *dir, const struct qstr *qstr)
135{ 135{
136 return 0; 136 return 0;
137} 137}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f9..b8d9f83aa5c5 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
433 return -EINVAL; 433 return -EINVAL;
434 if (!test_opt(inode->i_sb, POSIX_ACL)) 434 if (!test_opt(inode->i_sb, POSIX_ACL))
435 return -EOPNOTSUPP; 435 return -EOPNOTSUPP;
436 if (!is_owner_or_cap(inode)) 436 if (!inode_owner_or_capable(inode))
437 return -EPERM; 437 return -EPERM;
438 438
439 if (value) { 439 if (value) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -922,14 +923,14 @@ struct ext4_inode_info {
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 923#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt) 924 EXT4_MOUNT2_##opt)
924 925
925#define ext4_set_bit ext2_set_bit 926#define ext4_set_bit __test_and_set_bit_le
926#define ext4_set_bit_atomic ext2_set_bit_atomic 927#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 928#define ext4_clear_bit __test_and_clear_bit_le
928#define ext4_clear_bit_atomic ext2_clear_bit_atomic 929#define ext4_clear_bit_atomic ext2_clear_bit_atomic
929#define ext4_test_bit ext2_test_bit 930#define ext4_test_bit test_bit_le
930#define ext4_find_first_zero_bit ext2_find_first_zero_bit 931#define ext4_find_first_zero_bit find_first_zero_bit_le
931#define ext4_find_next_zero_bit ext2_find_next_zero_bit 932#define ext4_find_next_zero_bit find_next_zero_bit_le
932#define ext4_find_next_bit ext2_find_next_bit 933#define ext4_find_next_bit find_next_bit_le
933 934
934/* 935/*
935 * Maximal mount counts between two filesystem checks 936 * Maximal mount counts between two filesystem checks
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..7516fb9c0bd5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
131 * fragmenting the file system's free space. Maybe we 131 * fragmenting the file system's free space. Maybe we
132 * should have some hueristics or some way to allow 132 * should have some hueristics or some way to allow
133 * userspace to pass a hint to file system, 133 * userspace to pass a hint to file system,
134 * especiially if the latter case turns out to be 134 * especially if the latter case turns out to be
135 * common. 135 * common.
136 */ 136 */
137 ex = path[depth].p_ext; 137 ex = path[depth].p_ext;
@@ -2844,7 +2844,7 @@ fix_extent_len:
2844 * ext4_get_blocks_dio_write() when DIO to write 2844 * ext4_get_blocks_dio_write() when DIO to write
2845 * to an uninitialized extent. 2845 * to an uninitialized extent.
2846 * 2846 *
2847 * Writing to an uninitized extent may result in splitting the uninitialized 2847 * Writing to an uninitialized extent may result in splitting the uninitialized
2848 * extent into multiple /initialized uninitialized extents (up to three) 2848 * extent into multiple /initialized uninitialized extents (up to three)
2849 * There are three possibilities: 2849 * There are three possibilities:
2850 * a> There is no split required: Entire extent should be uninitialized 2850 * a> There is no split required: Entire extent should be uninitialized
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3174 * that this IO needs to convertion to written when IO is 3174 * that this IO needs to convertion to written when IO is
3175 * completed 3175 * completed
3176 */ 3176 */
3177 if (io) 3177 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3178 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3179 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3180 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3181 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3182 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3183 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3463 * that we need to perform convertion when IO is done. 3464 * that we need to perform convertion when IO is done.
3464 */ 3465 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3466 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3467 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3468 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3469 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3470 } else
3469 ext4_set_inode_state(inode, 3471 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3472 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3473 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81 } 120 }
82 121
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
134 }
135
136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index eb9097aec6f0..78b79e1bd7ed 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1042,7 +1042,7 @@ got:
1042 if (err) 1042 if (err)
1043 goto fail_free_drop; 1043 goto fail_free_drop;
1044 1044
1045 err = ext4_init_security(handle, inode, dir); 1045 err = ext4_init_security(handle, inode, dir, qstr);
1046 if (err) 1046 if (err)
1047 goto fail_free_drop; 1047 goto fail_free_drop;
1048 1048
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..9297ad46c465 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3903,7 +3903,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3903 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3904 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3905 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3906 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3907 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3908 .bmap = ext4_bmap,
@@ -3919,7 +3918,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3918 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3919 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3920 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3921 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3922 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3923 .bmap = ext4_bmap,
@@ -3935,7 +3933,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3933 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3934 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3935 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3936 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3937 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3938 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3948,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3948 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3949 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3950 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3951 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3952 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3953 .bmap = ext4_bmap,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647e..a84faa110bcd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d1fe09aea73d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
342/* We create slab caches for groupinfo data structures based on the 342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for 343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */ 344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \ 345#define NR_GRPINFO_CACHES 8
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348 347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350 ext4_group_t group); 355 ext4_group_t group);
351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
2414 return -ENOMEM; 2419 return -ENOMEM;
2415} 2420}
2416 2421
2422static void ext4_groupinfo_destroy_slabs(void)
2423{
2424 int i;
2425
2426 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2427 if (ext4_groupinfo_caches[i])
2428 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2429 ext4_groupinfo_caches[i] = NULL;
2430 }
2431}
2432
2433static int ext4_groupinfo_create_slab(size_t size)
2434{
2435 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2436 int slab_size;
2437 int blocksize_bits = order_base_2(size);
2438 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2439 struct kmem_cache *cachep;
2440
2441 if (cache_index >= NR_GRPINFO_CACHES)
2442 return -EINVAL;
2443
2444 if (unlikely(cache_index < 0))
2445 cache_index = 0;
2446
2447 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2448 if (ext4_groupinfo_caches[cache_index]) {
2449 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2450 return 0; /* Already created */
2451 }
2452
2453 slab_size = offsetof(struct ext4_group_info,
2454 bb_counters[blocksize_bits + 2]);
2455
2456 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2457 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2458 NULL);
2459
2460 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2461 if (!cachep) {
2462 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2463 return -ENOMEM;
2464 }
2465
2466 ext4_groupinfo_caches[cache_index] = cachep;
2467
2468 return 0;
2469}
2470
2417int ext4_mb_init(struct super_block *sb, int needs_recovery) 2471int ext4_mb_init(struct super_block *sb, int needs_recovery)
2418{ 2472{
2419 struct ext4_sb_info *sbi = EXT4_SB(sb); 2473 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2421 unsigned offset; 2475 unsigned offset;
2422 unsigned max; 2476 unsigned max;
2423 int ret; 2477 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2427 2478
2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2479 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2429 2480
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2440 goto out; 2491 goto out;
2441 } 2492 }
2442 2493
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2494 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2444 cachep = ext4_groupinfo_caches[cache_index]; 2495 if (ret < 0)
2445 if (!cachep) { 2496 goto out;
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2466 }
2467 2497
2468 /* order 0 is regular bitmap */ 2498 /* order 0 is regular bitmap */
2469 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2499 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
2520 if (ret) { 2550 if (ret) {
2521 kfree(sbi->s_mb_offsets); 2551 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs); 2552 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 } 2553 }
2525 return ret; 2554 return ret;
2526} 2555}
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
2734 2763
2735void ext4_exit_mballoc(void) 2764void ext4_exit_mballoc(void)
2736{ 2765{
2737 int i;
2738 /* 2766 /*
2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2767 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2740 * before destroying the slab cache. 2768 * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
2743 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2744 kmem_cache_destroy(ext4_ac_cachep); 2772 kmem_cache_destroy(ext4_ac_cachep);
2745 kmem_cache_destroy(ext4_free_ext_cachep); 2773 kmem_cache_destroy(ext4_free_ext_cachep);
2746 2774 ext4_groupinfo_destroy_slabs();
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2755 ext4_remove_debugfs_entry(); 2775 ext4_remove_debugfs_entry();
2756} 2776}
2757 2777
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..e781b7ea5630 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2304
2305 dquot_initialize(dir); 2305 dquot_initialize(dir);
2306 2306
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2307retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2308 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2309 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..e2cd90e4bb7c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
190 struct inode *inode; 191 struct inode *inode;
191 unsigned long flags; 192 unsigned long flags;
192 int i; 193 int i;
194 sector_t bi_sector = bio->bi_sector;
193 195
194 BUG_ON(!io_end); 196 BUG_ON(!io_end);
195 bio->bi_private = NULL; 197 bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
207 if (error) 209 if (error)
208 SetPageError(page); 210 SetPageError(page);
209 BUG_ON(!head); 211 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE) 212 if (head->b_size != PAGE_CACHE_SIZE) {
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset; 213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size; 214 loff_t io_end_offset = io_end->offset + io_end->size;
215 215
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
221 if (error) 221 if (error)
222 buffer_io_error(bh); 222 buffer_io_error(bh);
223 223
224 clear_buffer_dirty(bh);
225 } 224 }
226 if (buffer_delay(bh)) 225 if (buffer_delay(bh))
227 partial_write = 1; 226 partial_write = 1;
@@ -257,7 +256,7 @@ static void ext4_end_bio(struct bio *bio, int error)
257 (unsigned long long) io_end->offset, 256 (unsigned long long) io_end->offset,
258 (long) io_end->size, 257 (long) io_end->size,
259 (unsigned long long) 258 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
261 } 260 }
262 261
263 /* Add the io_end to per-inode completed io list*/ 262 /* Add the io_end to per-inode completed io list*/
@@ -311,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io,
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 310 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312 311
313 io->io_bio = bio; 312 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 313 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr; 314 io->io_next_block = bh->b_blocknr;
317 return 0; 315 return 0;
318} 316}
@@ -380,6 +378,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 378
381 blocksize = 1 << inode->i_blkbits; 379 blocksize = 1 << inode->i_blkbits;
382 380
381 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 382 BUG_ON(PageWriteback(page));
384 set_page_writeback(page); 383 set_page_writeback(page);
385 ClearPageError(page); 384 ClearPageError(page);
@@ -397,12 +396,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
397 for (bh = head = page_buffers(page), block_start = 0; 396 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 397 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) { 398 block_start = block_end, bh = bh->b_this_page) {
399
400 block_end = block_start + blocksize; 400 block_end = block_start + blocksize;
401 if (block_start >= len) { 401 if (block_start >= len) {
402 clear_buffer_dirty(bh); 402 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh); 403 set_buffer_uptodate(bh);
404 continue; 404 continue;
405 } 405 }
406 clear_buffer_dirty(bh);
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 407 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) { 408 if (ret) {
408 /* 409 /*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..203f9e4a70be 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static void ext4_destroy_lazyinit_thread(void); 78static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 79static void ext4_unregister_li_request(struct super_block *sb);
80static void ext4_clear_request_list(void);
80 81
81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 82#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
82static struct file_system_type ext3_fs_type = { 83static struct file_system_type ext3_fs_type = {
@@ -832,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
832 ei->i_sync_tid = 0; 833 ei->i_sync_tid = 0;
833 ei->i_datasync_tid = 0; 834 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0); 835 atomic_set(&ei->i_ioend_count, 0);
836 atomic_set(&ei->i_aiodio_unwritten, 0);
835 837
836 return &ei->vfs_inode; 838 return &ei->vfs_inode;
837} 839}
@@ -2716,6 +2718,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
2716 mutex_unlock(&ext4_li_info->li_list_mtx); 2718 mutex_unlock(&ext4_li_info->li_list_mtx);
2717} 2719}
2718 2720
2721static struct task_struct *ext4_lazyinit_task;
2722
2719/* 2723/*
2720 * This is the function where ext4lazyinit thread lives. It walks 2724 * This is the function where ext4lazyinit thread lives. It walks
2721 * through the request list searching for next scheduled filesystem. 2725 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2788,10 @@ cont_thread:
2784 if (time_before(jiffies, next_wakeup)) 2788 if (time_before(jiffies, next_wakeup))
2785 schedule(); 2789 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait); 2790 finish_wait(&eli->li_wait_daemon, &wait);
2791 if (kthread_should_stop()) {
2792 ext4_clear_request_list();
2793 goto exit_thread;
2794 }
2787 } 2795 }
2788 2796
2789exit_thread: 2797exit_thread:
@@ -2808,6 +2816,7 @@ exit_thread:
2808 wake_up(&eli->li_wait_task); 2816 wake_up(&eli->li_wait_task);
2809 2817
2810 kfree(ext4_li_info); 2818 kfree(ext4_li_info);
2819 ext4_lazyinit_task = NULL;
2811 ext4_li_info = NULL; 2820 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx); 2821 mutex_unlock(&ext4_li_mtx);
2813 2822
@@ -2830,11 +2839,10 @@ static void ext4_clear_request_list(void)
2830 2839
2831static int ext4_run_lazyinit_thread(void) 2840static int ext4_run_lazyinit_thread(void)
2832{ 2841{
2833 struct task_struct *t; 2842 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2834 2843 ext4_li_info, "ext4lazyinit");
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); 2844 if (IS_ERR(ext4_lazyinit_task)) {
2836 if (IS_ERR(t)) { 2845 int err = PTR_ERR(ext4_lazyinit_task);
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list(); 2846 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer); 2847 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info); 2848 kfree(ext4_li_info);
@@ -2985,16 +2993,10 @@ static void ext4_destroy_lazyinit_thread(void)
2985 * If thread exited earlier 2993 * If thread exited earlier
2986 * there's nothing to be done. 2994 * there's nothing to be done.
2987 */ 2995 */
2988 if (!ext4_li_info) 2996 if (!ext4_li_info || !ext4_lazyinit_task)
2989 return; 2997 return;
2990 2998
2991 ext4_clear_request_list(); 2999 kthread_stop(ext4_lazyinit_task);
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998} 3000}
2999 3001
3000static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3002static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3413,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3413 sb->s_qcop = &ext4_qctl_operations; 3415 sb->s_qcop = &ext4_qctl_operations;
3414 sb->dq_op = &ext4_quota_operations; 3416 sb->dq_op = &ext4_quota_operations;
3415#endif 3417#endif
3418 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3419
3416 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3420 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3417 mutex_init(&sbi->s_orphan_lock); 3421 mutex_init(&sbi->s_orphan_lock);
3418 mutex_init(&sbi->s_resize_lock); 3422 mutex_init(&sbi->s_resize_lock);
@@ -3507,7 +3511,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3511 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3508 3512
3509no_journal: 3513no_journal:
3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3514 /*
3515 * The maximum number of concurrent works can be high and
3516 * concurrency isn't really necessary. Limit it to 1.
3517 */
3518 EXT4_SB(sb)->dio_unwritten_wq =
3519 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
3511 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3520 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3521 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3513 goto failed_mount_wq; 3522 goto failed_mount_wq;
@@ -4768,7 +4777,7 @@ static struct file_system_type ext4_fs_type = {
4768 .fs_flags = FS_REQUIRES_DEV, 4777 .fs_flags = FS_REQUIRES_DEV,
4769}; 4778};
4770 4779
4771int __init ext4_init_feat_adverts(void) 4780static int __init ext4_init_feat_adverts(void)
4772{ 4781{
4773 struct ext4_features *ef; 4782 struct ext4_features *ef;
4774 int ret = -ENOMEM; 4783 int ret = -ENOMEM;
@@ -4792,23 +4801,44 @@ out:
4792 return ret; 4801 return ret;
4793} 4802}
4794 4803
4804static void ext4_exit_feat_adverts(void)
4805{
4806 kobject_put(&ext4_feat->f_kobj);
4807 wait_for_completion(&ext4_feat->f_kobj_unregister);
4808 kfree(ext4_feat);
4809}
4810
4811/* Shared across all ext4 file systems */
4812wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4813struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4814
4795static int __init ext4_init_fs(void) 4815static int __init ext4_init_fs(void)
4796{ 4816{
4797 int err; 4817 int i, err;
4798 4818
4799 ext4_check_flag_values(); 4819 ext4_check_flag_values();
4820
4821 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4822 mutex_init(&ext4__aio_mutex[i]);
4823 init_waitqueue_head(&ext4__ioend_wq[i]);
4824 }
4825
4800 err = ext4_init_pageio(); 4826 err = ext4_init_pageio();
4801 if (err) 4827 if (err)
4802 return err; 4828 return err;
4803 err = ext4_init_system_zone(); 4829 err = ext4_init_system_zone();
4804 if (err) 4830 if (err)
4805 goto out5; 4831 goto out7;
4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4832 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4807 if (!ext4_kset) 4833 if (!ext4_kset)
4808 goto out4; 4834 goto out6;
4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4835 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4836 if (!ext4_proc_root)
4837 goto out5;
4810 4838
4811 err = ext4_init_feat_adverts(); 4839 err = ext4_init_feat_adverts();
4840 if (err)
4841 goto out4;
4812 4842
4813 err = ext4_init_mballoc(); 4843 err = ext4_init_mballoc();
4814 if (err) 4844 if (err)
@@ -4838,12 +4868,14 @@ out1:
4838out2: 4868out2:
4839 ext4_exit_mballoc(); 4869 ext4_exit_mballoc();
4840out3: 4870out3:
4841 kfree(ext4_feat); 4871 ext4_exit_feat_adverts();
4872out4:
4842 remove_proc_entry("fs/ext4", NULL); 4873 remove_proc_entry("fs/ext4", NULL);
4874out5:
4843 kset_unregister(ext4_kset); 4875 kset_unregister(ext4_kset);
4844out4: 4876out6:
4845 ext4_exit_system_zone(); 4877 ext4_exit_system_zone();
4846out5: 4878out7:
4847 ext4_exit_pageio(); 4879 ext4_exit_pageio();
4848 return err; 4880 return err;
4849} 4881}
@@ -4857,6 +4889,7 @@ static void __exit ext4_exit_fs(void)
4857 destroy_inodecache(); 4889 destroy_inodecache();
4858 ext4_exit_xattr(); 4890 ext4_exit_xattr();
4859 ext4_exit_mballoc(); 4891 ext4_exit_mballoc();
4892 ext4_exit_feat_adverts();
4860 remove_proc_entry("fs/ext4", NULL); 4893 remove_proc_entry("fs/ext4", NULL);
4861 kset_unregister(ext4_kset); 4894 kset_unregister(ext4_kset);
4862 ext4_exit_system_zone(); 4895 ext4_exit_system_zone();
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b950..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..8d68690bdcf1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
236 .readpages = fat_readpages, 236 .readpages = fat_readpages,
237 .writepage = fat_writepage, 237 .writepage = fat_writepage,
238 .writepages = fat_writepages, 238 .writepages = fat_writepages,
239 .sync_page = block_sync_page,
240 .write_begin = fat_write_begin, 239 .write_begin = fat_write_begin,
241 .write_end = fat_write_end, 240 .write_end = fat_write_end,
242 .direct_IO = fat_direct_IO, 241 .direct_IO = fat_direct_IO,
@@ -757,8 +756,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 756 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 757 u32 ipos_h, ipos_m, ipos_l;
759 758
760 if (len < 5) 759 if (len < 5) {
760 *lenp = 5;
761 return 255; /* no room */ 761 return 255; /* no room */
762 }
762 763
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 764 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 765 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU) 46 if (nd && nd->flags & LOOKUP_RCU)
47 return -ECHILD; 47 return -ECHILD;
48 48
49 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
54 54
55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
56{ 56{
57 if (nd->flags & LOOKUP_RCU) 57 if (nd && nd->flags & LOOKUP_RCU)
58 return -ECHILD; 58 return -ECHILD;
59 59
60 /* 60 /*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..22764c7c8382 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
159 159
160 /* O_NOATIME can only be set by the owner or superuser */ 160 /* O_NOATIME can only be set by the owner or superuser */
161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) 161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
162 if (!is_owner_or_cap(inode)) 162 if (!inode_owner_or_capable(inode))
163 return -EPERM; 163 return -EPERM;
164 164
165 /* required for strict SunOS emulation */ 165 /* required for strict SunOS emulation */
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..bf93ad2bee07
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,265 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <asm/uaccess.h>
11#include "internal.h"
12
13static long do_sys_name_to_handle(struct path *path,
14 struct file_handle __user *ufh,
15 int __user *mnt_id)
16{
17 long retval;
18 struct file_handle f_handle;
19 int handle_dwords, handle_bytes;
20 struct file_handle *handle = NULL;
21
22 /*
23 * We need t make sure wether the file system
24 * support decoding of the file handle
25 */
26 if (!path->mnt->mnt_sb->s_export_op ||
27 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
28 return -EOPNOTSUPP;
29
30 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
31 return -EFAULT;
32
33 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
34 return -EINVAL;
35
36 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
37 GFP_KERNEL);
38 if (!handle)
39 return -ENOMEM;
40
41 /* convert handle size to multiple of sizeof(u32) */
42 handle_dwords = f_handle.handle_bytes >> 2;
43
44 /* we ask for a non connected handle */
45 retval = exportfs_encode_fh(path->dentry,
46 (struct fid *)handle->f_handle,
47 &handle_dwords, 0);
48 handle->handle_type = retval;
49 /* convert handle size to bytes */
50 handle_bytes = handle_dwords * sizeof(u32);
51 handle->handle_bytes = handle_bytes;
52 if ((handle->handle_bytes > f_handle.handle_bytes) ||
53 (retval == 255) || (retval == -ENOSPC)) {
54 /* As per old exportfs_encode_fh documentation
55 * we could return ENOSPC to indicate overflow
56 * But file system returned 255 always. So handle
57 * both the values
58 */
59 /*
60 * set the handle size to zero so we copy only
61 * non variable part of the file_handle
62 */
63 handle_bytes = 0;
64 retval = -EOVERFLOW;
65 } else
66 retval = 0;
67 /* copy the mount id */
68 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
69 copy_to_user(ufh, handle,
70 sizeof(struct file_handle) + handle_bytes))
71 retval = -EFAULT;
72 kfree(handle);
73 return retval;
74}
75
76/**
77 * sys_name_to_handle_at: convert name to handle
78 * @dfd: directory relative to which name is interpreted if not absolute
79 * @name: name that should be converted to handle.
80 * @handle: resulting file handle
81 * @mnt_id: mount id of the file system containing the file
82 * @flag: flag value to indicate whether to follow symlink or not
83 *
84 * @handle->handle_size indicate the space available to store the
85 * variable part of the file handle in bytes. If there is not
86 * enough space, the field is updated to return the minimum
87 * value required.
88 */
89SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
90 struct file_handle __user *, handle, int __user *, mnt_id,
91 int, flag)
92{
93 struct path path;
94 int lookup_flags;
95 int err;
96
97 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
98 return -EINVAL;
99
100 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
101 if (flag & AT_EMPTY_PATH)
102 lookup_flags |= LOOKUP_EMPTY;
103 err = user_path_at(dfd, name, lookup_flags, &path);
104 if (!err) {
105 err = do_sys_name_to_handle(&path, handle, mnt_id);
106 path_put(&path);
107 }
108 return err;
109}
110
111static struct vfsmount *get_vfsmount_from_fd(int fd)
112{
113 struct path path;
114
115 if (fd == AT_FDCWD) {
116 struct fs_struct *fs = current->fs;
117 spin_lock(&fs->lock);
118 path = fs->pwd;
119 mntget(path.mnt);
120 spin_unlock(&fs->lock);
121 } else {
122 int fput_needed;
123 struct file *file = fget_light(fd, &fput_needed);
124 if (!file)
125 return ERR_PTR(-EBADF);
126 path = file->f_path;
127 mntget(path.mnt);
128 fput_light(file, fput_needed);
129 }
130 return path.mnt;
131}
132
133static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
134{
135 return 1;
136}
137
138static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
139 struct path *path)
140{
141 int retval = 0;
142 int handle_dwords;
143
144 path->mnt = get_vfsmount_from_fd(mountdirfd);
145 if (IS_ERR(path->mnt)) {
146 retval = PTR_ERR(path->mnt);
147 goto out_err;
148 }
149 /* change the handle size to multiple of sizeof(u32) */
150 handle_dwords = handle->handle_bytes >> 2;
151 path->dentry = exportfs_decode_fh(path->mnt,
152 (struct fid *)handle->f_handle,
153 handle_dwords, handle->handle_type,
154 vfs_dentry_acceptable, NULL);
155 if (IS_ERR(path->dentry)) {
156 retval = PTR_ERR(path->dentry);
157 goto out_mnt;
158 }
159 return 0;
160out_mnt:
161 mntput(path->mnt);
162out_err:
163 return retval;
164}
165
166static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
167 struct path *path)
168{
169 int retval = 0;
170 struct file_handle f_handle;
171 struct file_handle *handle = NULL;
172
173 /*
174 * With handle we don't look at the execute bit on the
175 * the directory. Ideally we would like CAP_DAC_SEARCH.
176 * But we don't have that
177 */
178 if (!capable(CAP_DAC_READ_SEARCH)) {
179 retval = -EPERM;
180 goto out_err;
181 }
182 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
183 retval = -EFAULT;
184 goto out_err;
185 }
186 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
187 (f_handle.handle_bytes == 0)) {
188 retval = -EINVAL;
189 goto out_err;
190 }
191 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
192 GFP_KERNEL);
193 if (!handle) {
194 retval = -ENOMEM;
195 goto out_err;
196 }
197 /* copy the full handle */
198 if (copy_from_user(handle, ufh,
199 sizeof(struct file_handle) +
200 f_handle.handle_bytes)) {
201 retval = -EFAULT;
202 goto out_handle;
203 }
204
205 retval = do_handle_to_path(mountdirfd, handle, path);
206
207out_handle:
208 kfree(handle);
209out_err:
210 return retval;
211}
212
213long do_handle_open(int mountdirfd,
214 struct file_handle __user *ufh, int open_flag)
215{
216 long retval = 0;
217 struct path path;
218 struct file *file;
219 int fd;
220
221 retval = handle_to_path(mountdirfd, ufh, &path);
222 if (retval)
223 return retval;
224
225 fd = get_unused_fd_flags(open_flag);
226 if (fd < 0) {
227 path_put(&path);
228 return fd;
229 }
230 file = file_open_root(path.dentry, path.mnt, "", open_flag);
231 if (IS_ERR(file)) {
232 put_unused_fd(fd);
233 retval = PTR_ERR(file);
234 } else {
235 retval = fd;
236 fsnotify_open(file);
237 fd_install(fd, file);
238 }
239 path_put(&path);
240 return retval;
241}
242
243/**
244 * sys_open_by_handle_at: Open the file handle
245 * @mountdirfd: directory file descriptor
246 * @handle: file handle to be opened
247 * @flag: open flags.
248 *
249 * @mountdirfd indicate the directory file descriptor
250 * of the mount point. file handle is decoded relative
251 * to the vfsmount pointed by the @mountdirfd. @flags
252 * value is same as the open(2) flags.
253 */
254SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
255 struct file_handle __user *, handle,
256 int, flags)
257{
258 long ret;
259
260 if (force_o_largefile())
261 flags |= O_LARGEFILE;
262
263 ret = do_handle_open(mountdirfd, handle, flags);
264 return ret;
265}
diff --git a/fs/fifo.c b/fs/fifo.c
index 4e303c22d5ee..b1a524d798e7 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -66,8 +66,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
66 /* suppress POLLHUP until we have 66 /* suppress POLLHUP until we have
67 * seen a writer */ 67 * seen a writer */
68 filp->f_version = pipe->w_counter; 68 filp->f_version = pipe->w_counter;
69 } else 69 } else {
70 {
71 wait_for_partner(inode, &pipe->w_counter); 70 wait_for_partner(inode, &pipe->w_counter);
72 if(signal_pending(current)) 71 if(signal_pending(current))
73 goto err_rd; 72 goto err_rd;
diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c0..01e4c1e8e6b6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
190 file_take_write(file); 190 file_take_write(file);
191 WARN_ON(mnt_clone_write(path->mnt)); 191 WARN_ON(mnt_clone_write(path->mnt));
192 } 192 }
193 ima_counts_get(file); 193 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
194 i_readcount_inc(path->dentry->d_inode);
194 return file; 195 return file;
195} 196}
196EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
@@ -246,11 +247,15 @@ static void __fput(struct file *file)
246 file->f_op->release(inode, file); 247 file->f_op->release(inode, file);
247 security_file_free(file); 248 security_file_free(file);
248 ima_file_free(file); 249 ima_file_free(file);
249 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) {
250 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
253 }
251 fops_put(file->f_op); 254 fops_put(file->f_op);
252 put_pid(file->f_owner.pid); 255 put_pid(file->f_owner.pid);
253 file_sb_list_del(file); 256 file_sb_list_del(file);
257 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
258 i_readcount_dec(inode);
254 if (file->f_mode & FMODE_WRITE) 259 if (file->f_mode & FMODE_WRITE)
255 drop_file_write_access(file); 260 drop_file_write_access(file);
256 file->f_path.dentry = NULL; 261 file->f_path.dentry = NULL;
@@ -276,11 +281,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 281 rcu_read_lock();
277 file = fcheck_files(files, fd); 282 file = fcheck_files(files, fd);
278 if (file) { 283 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 284 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 285 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 286 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 287 file = NULL;
283 }
284 } 288 }
285 rcu_read_unlock(); 289 rcu_read_unlock();
286 290
@@ -289,6 +293,25 @@ struct file *fget(unsigned int fd)
289 293
290EXPORT_SYMBOL(fget); 294EXPORT_SYMBOL(fget);
291 295
296struct file *fget_raw(unsigned int fd)
297{
298 struct file *file;
299 struct files_struct *files = current->files;
300
301 rcu_read_lock();
302 file = fcheck_files(files, fd);
303 if (file) {
304 /* File object ref couldn't be taken */
305 if (!atomic_long_inc_not_zero(&file->f_count))
306 file = NULL;
307 }
308 rcu_read_unlock();
309
310 return file;
311}
312
313EXPORT_SYMBOL(fget_raw);
314
292/* 315/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 316 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 317 *
@@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 336 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 337 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 338 file = fcheck_files(files, fd);
339 if (file && (file->f_mode & FMODE_PATH))
340 file = NULL;
341 } else {
342 rcu_read_lock();
343 file = fcheck_files(files, fd);
344 if (file) {
345 if (!(file->f_mode & FMODE_PATH) &&
346 atomic_long_inc_not_zero(&file->f_count))
347 *fput_needed = 1;
348 else
349 /* Didn't get the reference, someone's freed */
350 file = NULL;
351 }
352 rcu_read_unlock();
353 }
354
355 return file;
356}
357
358struct file *fget_raw_light(unsigned int fd, int *fput_needed)
359{
360 struct file *file;
361 struct files_struct *files = current->files;
362
363 *fput_needed = 0;
364 if (atomic_read(&files->count) == 1) {
365 file = fcheck_files(files, fd);
316 } else { 366 } else {
317 rcu_read_lock(); 367 rcu_read_lock();
318 file = fcheck_files(files, fd); 368 file = fcheck_files(files, fd);
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t vxfs_bmap(struct address_space *, sector_t);
44const struct address_space_operations vxfs_aops = { 44const struct address_space_operations vxfs_aops = {
45 .readpage = vxfs_readpage, 45 .readpage = vxfs_readpage,
46 .bmap = vxfs_bmap, 46 .bmap = vxfs_bmap,
47 .sync_page = block_sync_page,
48}; 47};
49 48
50inline void 49inline void
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..b5ed541fb137 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
176} 176}
177 177
178/* 178/*
179 * Remove the inode from the writeback list it is on.
180 */
181void inode_wb_list_del(struct inode *inode)
182{
183 spin_lock(&inode_wb_list_lock);
184 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock);
186}
187
188
189/*
179 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
180 * furthest end of its superblock's dirty-inode list. 191 * furthest end of its superblock's dirty-inode list.
181 * 192 *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
188{ 199{
189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
190 201
202 assert_spin_locked(&inode_wb_list_lock);
191 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
192 struct inode *tail; 204 struct inode *tail;
193 205
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
205{ 217{
206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
207 219
220 assert_spin_locked(&inode_wb_list_lock);
208 list_move(&inode->i_wb_list, &wb->b_more_io); 221 list_move(&inode->i_wb_list, &wb->b_more_io);
209} 222}
210 223
211static void inode_sync_complete(struct inode *inode) 224static void inode_sync_complete(struct inode *inode)
212{ 225{
213 /* 226 /*
214 * Prevent speculative execution through spin_unlock(&inode_lock); 227 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock);
215 */ 229 */
230
216 smp_mb(); 231 smp_mb();
217 wake_up_bit(&inode->i_state, __I_SYNC); 232 wake_up_bit(&inode->i_state, __I_SYNC);
218} 233}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 */ 301 */
287static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
288{ 303{
304 assert_spin_locked(&inode_wb_list_lock);
289 list_splice_init(&wb->b_more_io, &wb->b_io); 305 list_splice_init(&wb->b_more_io, &wb->b_io);
290 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
291} 307}
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
306 wait_queue_head_t *wqh; 322 wait_queue_head_t *wqh;
307 323
308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309 while (inode->i_state & I_SYNC) { 325 while (inode->i_state & I_SYNC) {
310 spin_unlock(&inode_lock); 326 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock);
311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 spin_lock(&inode_lock); 329 spin_lock(&inode_wb_list_lock);
330 spin_lock(&inode->i_lock);
313 } 331 }
314} 332}
315 333
316/* 334/*
317 * Write out an inode's dirty pages. Called under inode_lock. Either the 335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
318 * caller has ref on the inode (either via __iget or via syscall against an fd) 336 * inode->i_lock. Either the caller has an active reference on the inode or
319 * or the inode has I_WILL_FREE set (via generic_forget_inode) 337 * the inode has I_WILL_FREE set.
320 * 338 *
321 * If `wait' is set, wait on the writeout. 339 * If `wait' is set, wait on the writeout.
322 * 340 *
323 * The whole writeout design is quite complex and fragile. We want to avoid 341 * The whole writeout design is quite complex and fragile. We want to avoid
324 * starvation of particular inodes when others are being redirtied, prevent 342 * starvation of particular inodes when others are being redirtied, prevent
325 * livelocks, etc. 343 * livelocks, etc.
326 *
327 * Called under inode_lock.
328 */ 344 */
329static int 345static int
330writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 346writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
333 unsigned dirty; 349 unsigned dirty;
334 int ret; 350 int ret;
335 351
352 assert_spin_locked(&inode_wb_list_lock);
353 assert_spin_locked(&inode->i_lock);
354
336 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
337 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 else 357 else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 382 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
365 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
366 spin_unlock(&inode_lock); 385 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock);
367 387
368 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
369 389
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 * due to delalloc, clear dirty metadata flags right before 403 * due to delalloc, clear dirty metadata flags right before
384 * write_inode() 404 * write_inode()
385 */ 405 */
386 spin_lock(&inode_lock); 406 spin_lock(&inode->i_lock);
387 dirty = inode->i_state & I_DIRTY; 407 dirty = inode->i_state & I_DIRTY;
388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
389 spin_unlock(&inode_lock); 409 spin_unlock(&inode->i_lock);
390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 410 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
392 int err = write_inode(inode, wbc); 412 int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394 ret = err; 414 ret = err;
395 } 415 }
396 416
397 spin_lock(&inode_lock); 417 spin_lock(&inode_wb_list_lock);
418 spin_lock(&inode->i_lock);
398 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
399 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
400 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
506 * kind does not need peridic writeout yet, and for the latter 527 * kind does not need peridic writeout yet, and for the latter
507 * kind writeout is handled by the freer. 528 * kind writeout is handled by the freer.
508 */ 529 */
530 spin_lock(&inode->i_lock);
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
532 spin_unlock(&inode->i_lock);
510 requeue_io(inode); 533 requeue_io(inode);
511 continue; 534 continue;
512 } 535 }
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
515 * Was this inode dirtied after sync_sb_inodes was called? 538 * Was this inode dirtied after sync_sb_inodes was called?
516 * This keeps sync from extra jobs and livelock. 539 * This keeps sync from extra jobs and livelock.
517 */ 540 */
518 if (inode_dirtied_after(inode, wbc->wb_start)) 541 if (inode_dirtied_after(inode, wbc->wb_start)) {
542 spin_unlock(&inode->i_lock);
519 return 1; 543 return 1;
544 }
520 545
521 __iget(inode); 546 __iget(inode);
547
522 pages_skipped = wbc->pages_skipped; 548 pages_skipped = wbc->pages_skipped;
523 writeback_single_inode(inode, wbc); 549 writeback_single_inode(inode, wbc);
524 if (wbc->pages_skipped != pages_skipped) { 550 if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
528 */ 554 */
529 redirty_tail(inode); 555 redirty_tail(inode);
530 } 556 }
531 spin_unlock(&inode_lock); 557 spin_unlock(&inode->i_lock);
558 spin_unlock(&inode_wb_list_lock);
532 iput(inode); 559 iput(inode);
533 cond_resched(); 560 cond_resched();
534 spin_lock(&inode_lock); 561 spin_lock(&inode_wb_list_lock);
535 if (wbc->nr_to_write <= 0) { 562 if (wbc->nr_to_write <= 0) {
536 wbc->more_io = 1; 563 wbc->more_io = 1;
537 return 1; 564 return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
550 577
551 if (!wbc->wb_start) 578 if (!wbc->wb_start)
552 wbc->wb_start = jiffies; /* livelock avoidance */ 579 wbc->wb_start = jiffies; /* livelock avoidance */
553 spin_lock(&inode_lock); 580 spin_lock(&inode_wb_list_lock);
554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 581 if (!wbc->for_kupdate || list_empty(&wb->b_io))
555 queue_io(wb, wbc->older_than_this); 582 queue_io(wb, wbc->older_than_this);
556 583
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
568 if (ret) 595 if (ret)
569 break; 596 break;
570 } 597 }
571 spin_unlock(&inode_lock); 598 spin_unlock(&inode_wb_list_lock);
572 /* Leave any unwritten inodes on b_io */ 599 /* Leave any unwritten inodes on b_io */
573} 600}
574 601
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
577{ 604{
578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 605 WARN_ON(!rwsem_is_locked(&sb->s_umount));
579 606
580 spin_lock(&inode_lock); 607 spin_lock(&inode_wb_list_lock);
581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 608 if (!wbc->for_kupdate || list_empty(&wb->b_io))
582 queue_io(wb, wbc->older_than_this); 609 queue_io(wb, wbc->older_than_this);
583 writeback_sb_inodes(sb, wb, wbc, true); 610 writeback_sb_inodes(sb, wb, wbc, true);
584 spin_unlock(&inode_lock); 611 spin_unlock(&inode_wb_list_lock);
585} 612}
586 613
587/* 614/*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
720 * become available for writeback. Otherwise 747 * become available for writeback. Otherwise
721 * we'll just busyloop. 748 * we'll just busyloop.
722 */ 749 */
723 spin_lock(&inode_lock); 750 spin_lock(&inode_wb_list_lock);
724 if (!list_empty(&wb->b_more_io)) { 751 if (!list_empty(&wb->b_more_io)) {
725 inode = wb_inode(wb->b_more_io.prev); 752 inode = wb_inode(wb->b_more_io.prev);
726 trace_wbc_writeback_wait(&wbc, wb->bdi); 753 trace_wbc_writeback_wait(&wbc, wb->bdi);
754 spin_lock(&inode->i_lock);
727 inode_wait_for_writeback(inode); 755 inode_wait_for_writeback(inode);
756 spin_unlock(&inode->i_lock);
728 } 757 }
729 spin_unlock(&inode_lock); 758 spin_unlock(&inode_wb_list_lock);
730 } 759 }
731 760
732 return wrote; 761 return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
992{ 1021{
993 struct super_block *sb = inode->i_sb; 1022 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1023 struct backing_dev_info *bdi = NULL;
995 bool wakeup_bdi = false;
996 1024
997 /* 1025 /*
998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1026 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1016 if (unlikely(block_dump)) 1044 if (unlikely(block_dump))
1017 block_dump___mark_inode_dirty(inode); 1045 block_dump___mark_inode_dirty(inode);
1018 1046
1019 spin_lock(&inode_lock); 1047 spin_lock(&inode->i_lock);
1020 if ((inode->i_state & flags) != flags) { 1048 if ((inode->i_state & flags) != flags) {
1021 const int was_dirty = inode->i_state & I_DIRTY; 1049 const int was_dirty = inode->i_state & I_DIRTY;
1022 1050
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 * superblock list, based upon its state. 1056 * superblock list, based upon its state.
1029 */ 1057 */
1030 if (inode->i_state & I_SYNC) 1058 if (inode->i_state & I_SYNC)
1031 goto out; 1059 goto out_unlock_inode;
1032 1060
1033 /* 1061 /*
1034 * Only add valid (hashed) inodes to the superblock's 1062 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
1101 */ 1135 */
1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1136 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1103 1137
1104 spin_lock(&inode_lock); 1138 spin_lock(&inode_sb_list_lock);
1105 1139
1106 /* 1140 /*
1107 * Data integrity sync. Must wait for all pages under writeback, 1141 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
1111 * we still have to wait for that writeout. 1145 * we still have to wait for that writeout.
1112 */ 1146 */
1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1147 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114 struct address_space *mapping; 1148 struct address_space *mapping = inode->i_mapping;
1115 1149
1116 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1150 spin_lock(&inode->i_lock);
1117 continue; 1151 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1118 mapping = inode->i_mapping; 1152 (mapping->nrpages == 0)) {
1119 if (mapping->nrpages == 0) 1153 spin_unlock(&inode->i_lock);
1120 continue; 1154 continue;
1155 }
1121 __iget(inode); 1156 __iget(inode);
1122 spin_unlock(&inode_lock); 1157 spin_unlock(&inode->i_lock);
1158 spin_unlock(&inode_sb_list_lock);
1159
1123 /* 1160 /*
1124 * We hold a reference to 'inode' so it couldn't have 1161 * We hold a reference to 'inode' so it couldn't have been
1125 * been removed from s_inodes list while we dropped the 1162 * removed from s_inodes list while we dropped the
1126 * inode_lock. We cannot iput the inode now as we can 1163 * inode_sb_list_lock. We cannot iput the inode now as we can
1127 * be holding the last reference and we cannot iput it 1164 * be holding the last reference and we cannot iput it under
1128 * under inode_lock. So we keep the reference and iput 1165 * inode_sb_list_lock. So we keep the reference and iput it
1129 * it later. 1166 * later.
1130 */ 1167 */
1131 iput(old_inode); 1168 iput(old_inode);
1132 old_inode = inode; 1169 old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
1135 1172
1136 cond_resched(); 1173 cond_resched();
1137 1174
1138 spin_lock(&inode_lock); 1175 spin_lock(&inode_sb_list_lock);
1139 } 1176 }
1140 spin_unlock(&inode_lock); 1177 spin_unlock(&inode_sb_list_lock);
1141 iput(old_inode); 1178 iput(old_inode);
1142} 1179}
1143 1180
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
1271 wbc.nr_to_write = 0; 1308 wbc.nr_to_write = 0;
1272 1309
1273 might_sleep(); 1310 might_sleep();
1274 spin_lock(&inode_lock); 1311 spin_lock(&inode_wb_list_lock);
1312 spin_lock(&inode->i_lock);
1275 ret = writeback_single_inode(inode, &wbc); 1313 ret = writeback_single_inode(inode, &wbc);
1276 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_wb_list_lock);
1277 if (sync) 1316 if (sync)
1278 inode_sync_wait(inode); 1317 inode_sync_wait(inode);
1279 return ret; 1318 return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1295{ 1334{
1296 int ret; 1335 int ret;
1297 1336
1298 spin_lock(&inode_lock); 1337 spin_lock(&inode_wb_list_lock);
1338 spin_lock(&inode->i_lock);
1299 ret = writeback_single_inode(inode, wbc); 1339 ret = writeback_single_inode(inode, wbc);
1300 spin_unlock(&inode_lock); 1340 spin_unlock(&inode->i_lock);
1341 spin_unlock(&inode_wb_list_lock);
1301 return ret; 1342 return ret;
1302} 1343}
1303EXPORT_SYMBOL(sync_inode); 1344EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837d..b6cca47f7b07 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -305,7 +305,7 @@ static void cuse_gendev_release(struct device *dev)
305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
306{ 306{
307 struct cuse_conn *cc = fc_to_cc(fc); 307 struct cuse_conn *cc = fc_to_cc(fc);
308 struct cuse_init_out *arg = &req->misc.cuse_init_out; 308 struct cuse_init_out *arg = req->out.args[0].value;
309 struct page *page = req->pages[0]; 309 struct page *page = req->pages[0];
310 struct cuse_devinfo devinfo = { }; 310 struct cuse_devinfo devinfo = { };
311 struct device *dev; 311 struct device *dev;
@@ -384,6 +384,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
384 dev_set_uevent_suppress(dev, 0); 384 dev_set_uevent_suppress(dev, 0);
385 kobject_uevent(&dev->kobj, KOBJ_ADD); 385 kobject_uevent(&dev->kobj, KOBJ_ADD);
386out: 386out:
387 kfree(arg);
387 __free_page(page); 388 __free_page(page);
388 return; 389 return;
389 390
@@ -405,6 +406,7 @@ static int cuse_send_init(struct cuse_conn *cc)
405 struct page *page; 406 struct page *page;
406 struct fuse_conn *fc = &cc->fc; 407 struct fuse_conn *fc = &cc->fc;
407 struct cuse_init_in *arg; 408 struct cuse_init_in *arg;
409 void *outarg;
408 410
409 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); 411 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
410 412
@@ -419,6 +421,10 @@ static int cuse_send_init(struct cuse_conn *cc)
419 if (!page) 421 if (!page)
420 goto err_put_req; 422 goto err_put_req;
421 423
424 outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
425 if (!outarg)
426 goto err_free_page;
427
422 arg = &req->misc.cuse_init_in; 428 arg = &req->misc.cuse_init_in;
423 arg->major = FUSE_KERNEL_VERSION; 429 arg->major = FUSE_KERNEL_VERSION;
424 arg->minor = FUSE_KERNEL_MINOR_VERSION; 430 arg->minor = FUSE_KERNEL_MINOR_VERSION;
@@ -429,7 +435,7 @@ static int cuse_send_init(struct cuse_conn *cc)
429 req->in.args[0].value = arg; 435 req->in.args[0].value = arg;
430 req->out.numargs = 2; 436 req->out.numargs = 2;
431 req->out.args[0].size = sizeof(struct cuse_init_out); 437 req->out.args[0].size = sizeof(struct cuse_init_out);
432 req->out.args[0].value = &req->misc.cuse_init_out; 438 req->out.args[0].value = outarg;
433 req->out.args[1].size = CUSE_INIT_INFO_MAX; 439 req->out.args[1].size = CUSE_INIT_INFO_MAX;
434 req->out.argvar = 1; 440 req->out.argvar = 1;
435 req->out.argpages = 1; 441 req->out.argpages = 1;
@@ -440,6 +446,8 @@ static int cuse_send_init(struct cuse_conn *cc)
440 446
441 return 0; 447 return 0;
442 448
449err_free_page:
450 __free_page(page);
443err_put_req: 451err_put_req:
444 fuse_put_request(fc, req); 452 fuse_put_request(fc, req);
445err: 453err:
@@ -458,7 +466,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
458 * @file: file struct being opened 466 * @file: file struct being opened
459 * 467 *
460 * Userland CUSE server can create a CUSE device by opening /dev/cuse 468 * Userland CUSE server can create a CUSE device by opening /dev/cuse
461 * and replying to the initilaization request kernel sends. This 469 * and replying to the initialization request kernel sends. This
462 * function is responsible for handling CUSE device initialization. 470 * function is responsible for handling CUSE device initialization.
463 * Because the fd opened by this function is used during 471 * Because the fd opened by this function is used during
464 * initialization, this function only creates cuse_conn and sends 472 * initialization, this function only creates cuse_conn and sends
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cf8d28d1fbad..640fc229df10 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -737,14 +737,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
737 if (WARN_ON(PageMlocked(oldpage))) 737 if (WARN_ON(PageMlocked(oldpage)))
738 goto out_fallback_unlock; 738 goto out_fallback_unlock;
739 739
740 remove_from_page_cache(oldpage); 740 err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
741 page_cache_release(oldpage);
742
743 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
744 if (err) { 741 if (err) {
745 printk(KERN_WARNING "fuse_try_move_page: failed to add page"); 742 unlock_page(newpage);
746 goto out_fallback_unlock; 743 return err;
747 } 744 }
745
748 page_cache_get(newpage); 746 page_cache_get(newpage);
749 747
750 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 748 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -1910,6 +1908,21 @@ __acquires(fc->lock)
1910 kfree(dequeue_forget(fc, 1, NULL)); 1908 kfree(dequeue_forget(fc, 1, NULL));
1911} 1909}
1912 1910
1911static void end_polls(struct fuse_conn *fc)
1912{
1913 struct rb_node *p;
1914
1915 p = rb_first(&fc->polled_files);
1916
1917 while (p) {
1918 struct fuse_file *ff;
1919 ff = rb_entry(p, struct fuse_file, polled_node);
1920 wake_up_interruptible_all(&ff->poll_wait);
1921
1922 p = rb_next(p);
1923 }
1924}
1925
1913/* 1926/*
1914 * Abort all requests. 1927 * Abort all requests.
1915 * 1928 *
@@ -1937,6 +1950,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1937 fc->blocked = 0; 1950 fc->blocked = 0;
1938 end_io_requests(fc); 1951 end_io_requests(fc);
1939 end_queued_requests(fc); 1952 end_queued_requests(fc);
1953 end_polls(fc);
1940 wake_up_all(&fc->waitq); 1954 wake_up_all(&fc->waitq);
1941 wake_up_all(&fc->blocked_waitq); 1955 wake_up_all(&fc->blocked_waitq);
1942 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1956 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1953,6 +1967,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1953 fc->connected = 0; 1967 fc->connected = 0;
1954 fc->blocked = 0; 1968 fc->blocked = 0;
1955 end_queued_requests(fc); 1969 end_queued_requests(fc);
1970 end_polls(fc);
1956 wake_up_all(&fc->blocked_waitq); 1971 wake_up_all(&fc->blocked_waitq);
1957 spin_unlock(&fc->lock); 1972 spin_unlock(&fc->lock);
1958 fuse_conn_put(fc); 1973 fuse_conn_put(fc);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..c6ba49bd95b3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,10 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU) 161 inode = ACCESS_ONCE(entry->d_inode);
162 return -ECHILD;
163
164 inode = entry->d_inode;
165 if (inode && is_bad_inode(inode)) 162 if (inode && is_bad_inode(inode))
166 return 0; 163 return 0;
167 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 164 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -177,6 +174,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
177 if (!inode) 174 if (!inode)
178 return 0; 175 return 0;
179 176
177 if (nd->flags & LOOKUP_RCU)
178 return -ECHILD;
179
180 fc = get_fuse_conn(inode); 180 fc = get_fuse_conn(inode);
181 req = fuse_get_req(fc); 181 req = fuse_get_req(fc);
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
@@ -970,6 +970,14 @@ static int fuse_access(struct inode *inode, int mask)
970 return err; 970 return err;
971} 971}
972 972
973static int fuse_perm_getattr(struct inode *inode, int flags)
974{
975 if (flags & IPERM_FLAG_RCU)
976 return -ECHILD;
977
978 return fuse_do_getattr(inode, NULL, NULL);
979}
980
973/* 981/*
974 * Check permission. The two basic access models of FUSE are: 982 * Check permission. The two basic access models of FUSE are:
975 * 983 *
@@ -989,9 +997,6 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
989 bool refreshed = false; 997 bool refreshed = false;
990 int err = 0; 998 int err = 0;
991 999
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
995 if (!fuse_allow_task(fc, current)) 1000 if (!fuse_allow_task(fc, current))
996 return -EACCES; 1001 return -EACCES;
997 1002
@@ -1000,9 +1005,15 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1000 */ 1005 */
1001 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || 1006 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
1002 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { 1007 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
1003 err = fuse_update_attributes(inode, NULL, NULL, &refreshed); 1008 struct fuse_inode *fi = get_fuse_inode(inode);
1004 if (err) 1009
1005 return err; 1010 if (fi->i_time < get_jiffies_64()) {
1011 refreshed = true;
1012
1013 err = fuse_perm_getattr(inode, flags);
1014 if (err)
1015 return err;
1016 }
1006 } 1017 }
1007 1018
1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1019 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1012,7 +1023,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1012 attributes. This is also needed, because the root 1023 attributes. This is also needed, because the root
1013 node will at first have no permissions */ 1024 node will at first have no permissions */
1014 if (err == -EACCES && !refreshed) { 1025 if (err == -EACCES && !refreshed) {
1015 err = fuse_do_getattr(inode, NULL, NULL); 1026 err = fuse_perm_getattr(inode, flags);
1016 if (!err) 1027 if (!err)
1017 err = generic_permission(inode, mask, 1028 err = generic_permission(inode, mask,
1018 flags, NULL); 1029 flags, NULL);
@@ -1023,13 +1034,16 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1023 noticed immediately, only after the attribute 1034 noticed immediately, only after the attribute
1024 timeout has expired */ 1035 timeout has expired */
1025 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { 1036 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1037 if (flags & IPERM_FLAG_RCU)
1038 return -ECHILD;
1039
1026 err = fuse_access(inode, mask); 1040 err = fuse_access(inode, mask);
1027 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1041 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1028 if (!(inode->i_mode & S_IXUGO)) { 1042 if (!(inode->i_mode & S_IXUGO)) {
1029 if (refreshed) 1043 if (refreshed)
1030 return -EACCES; 1044 return -EACCES;
1031 1045
1032 err = fuse_do_getattr(inode, NULL, NULL); 1046 err = fuse_perm_getattr(inode, flags);
1033 if (!err && !(inode->i_mode & S_IXUGO)) 1047 if (!err && !(inode->i_mode & S_IXUGO))
1034 return -EACCES; 1048 return -EACCES;
1035 } 1049 }
@@ -1283,8 +1297,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1283 if (err) 1297 if (err)
1284 return err; 1298 return err;
1285 1299
1286 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1300 if (attr->ia_valid & ATTR_OPEN) {
1287 return 0; 1301 if (fc->atomic_o_trunc)
1302 return 0;
1303 file = NULL;
1304 }
1288 1305
1289 if (attr->ia_valid & ATTR_SIZE) 1306 if (attr->ia_valid & ATTR_SIZE)
1290 is_truncate = true; 1307 is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..6ea00734984e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
86 return ff; 86 return ff;
87} 87}
88 88
89static void fuse_release_async(struct work_struct *work)
90{
91 struct fuse_req *req;
92 struct fuse_conn *fc;
93 struct path path;
94
95 req = container_of(work, struct fuse_req, misc.release.work);
96 path = req->misc.release.path;
97 fc = get_fuse_conn(path.dentry->d_inode);
98
99 fuse_put_request(fc, req);
100 path_put(&path);
101}
102
89static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 103static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90{ 104{
91 path_put(&req->misc.release.path); 105 if (fc->destroy_req) {
106 /*
107 * If this is a fuseblk mount, then it's possible that
108 * releasing the path will result in releasing the
109 * super block and sending the DESTROY request. If
110 * the server is single threaded, this would hang.
111 * For this reason do the path_put() in a separate
112 * thread.
113 */
114 atomic_inc(&req->count);
115 INIT_WORK(&req->misc.release.work, fuse_release_async);
116 schedule_work(&req->misc.release.work);
117 } else {
118 path_put(&req->misc.release.path);
119 }
92} 120}
93 121
94static void fuse_file_put(struct fuse_file *ff) 122static void fuse_file_put(struct fuse_file *ff, bool sync)
95{ 123{
96 if (atomic_dec_and_test(&ff->count)) { 124 if (atomic_dec_and_test(&ff->count)) {
97 struct fuse_req *req = ff->reserved_req; 125 struct fuse_req *req = ff->reserved_req;
98 126
99 req->end = fuse_release_end; 127 if (sync) {
100 fuse_request_send_background(ff->fc, req); 128 fuse_request_send(ff->fc, req);
129 path_put(&req->misc.release.path);
130 fuse_put_request(ff->fc, req);
131 } else {
132 req->end = fuse_release_end;
133 fuse_request_send_background(ff->fc, req);
134 }
101 kfree(ff); 135 kfree(ff);
102 } 136 }
103} 137}
@@ -188,7 +222,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
188 rb_erase(&ff->polled_node, &fc->polled_files); 222 rb_erase(&ff->polled_node, &fc->polled_files);
189 spin_unlock(&fc->lock); 223 spin_unlock(&fc->lock);
190 224
191 wake_up_interruptible_sync(&ff->poll_wait); 225 wake_up_interruptible_all(&ff->poll_wait);
192 226
193 inarg->fh = ff->fh; 227 inarg->fh = ff->fh;
194 inarg->flags = flags; 228 inarg->flags = flags;
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
219 * Normally this will send the RELEASE request, however if 253 * Normally this will send the RELEASE request, however if
220 * some asynchronous READ or WRITE requests are outstanding, 254 * some asynchronous READ or WRITE requests are outstanding,
221 * the sending will be delayed. 255 * the sending will be delayed.
256 *
257 * Make the release synchronous if this is a fuseblk mount,
258 * synchronous RELEASE is allowed (and desirable) in this case
259 * because the server can be trusted not to screw up.
222 */ 260 */
223 fuse_file_put(ff); 261 fuse_file_put(ff, ff->fc->destroy_req != NULL);
224} 262}
225 263
226static int fuse_open(struct inode *inode, struct file *file) 264static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
558 page_cache_release(page); 596 page_cache_release(page);
559 } 597 }
560 if (req->ff) 598 if (req->ff)
561 fuse_file_put(req->ff); 599 fuse_file_put(req->ff, false);
562} 600}
563 601
564static void fuse_send_readpages(struct fuse_req *req, struct file *file) 602static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1137static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1175static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1138{ 1176{
1139 __free_page(req->pages[0]); 1177 __free_page(req->pages[0]);
1140 fuse_file_put(req->ff); 1178 fuse_file_put(req->ff, false);
1141} 1179}
1142 1180
1143static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1181static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..b788becada76 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h>
24 25
25/** Max number of pages that can be used in a single read request */ 26/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 27#define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,13 +263,15 @@ struct fuse_req {
262 /** Data for asynchronous requests */ 263 /** Data for asynchronous requests */
263 union { 264 union {
264 struct { 265 struct {
265 struct fuse_release_in in; 266 union {
267 struct fuse_release_in in;
268 struct work_struct work;
269 };
266 struct path path; 270 struct path path;
267 } release; 271 } release;
268 struct fuse_init_in init_in; 272 struct fuse_init_in init_in;
269 struct fuse_init_out init_out; 273 struct fuse_init_out init_out;
270 struct cuse_init_in cuse_init_in; 274 struct cuse_init_in cuse_init_in;
271 struct cuse_init_out cuse_init_out;
272 struct { 275 struct {
273 struct fuse_read_in in; 276 struct fuse_read_in in;
274 u64 attr_ver; 277 u64 attr_ver;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..cc6ec4b2f0ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
@@ -868,7 +870,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
868 870
869 fc->bdi.name = "fuse"; 871 fc->bdi.name = "fuse";
870 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
871 fc->bdi.unplug_io_fn = default_unplug_io_fn;
872 /* fuse does it's own writeback accounting */ 873 /* fuse does it's own writeback accounting */
873 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 874 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
874 875
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 06c48a891832..8f26d1a58912 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
74 return -EINVAL; 74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
77 if (!is_owner_or_cap(inode)) 77 if (!inode_owner_or_capable(inode))
78 return -EPERM; 78 return -EPERM;
79 if (value) { 79 if (value) {
80 acl = posix_acl_from_xattr(value, size); 80 acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 21f7e46da4c0..f3d23ef4e876 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS := -I$(src) 1ccflags-y := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a9..cbc07155b1a0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU) 83 if (flags & IPERM_FLAG_RCU) {
84 return -ECHILD; 84 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
85 return -ECHILD;
86 return -EAGAIN;
87 }
85 88
86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 89 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
87 if (IS_ERR(acl)) 90 if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..c71995b111bf 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
695 if (error == 0) 695 if (error == 0)
696 return 0; 696 return 0;
697 697
698 unlock_page(page);
698 page_cache_release(page); 699 page_cache_release(page);
699 700
700 gfs2_trans_end(sdp); 701 gfs2_trans_end(sdp);
@@ -1116,7 +1117,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
1116 .writepages = gfs2_writeback_writepages, 1117 .writepages = gfs2_writeback_writepages,
1117 .readpage = gfs2_readpage, 1118 .readpage = gfs2_readpage,
1118 .readpages = gfs2_readpages, 1119 .readpages = gfs2_readpages,
1119 .sync_page = block_sync_page,
1120 .write_begin = gfs2_write_begin, 1120 .write_begin = gfs2_write_begin,
1121 .write_end = gfs2_write_end, 1121 .write_end = gfs2_write_end,
1122 .bmap = gfs2_bmap, 1122 .bmap = gfs2_bmap,
@@ -1132,7 +1132,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
1132 .writepage = gfs2_ordered_writepage, 1132 .writepage = gfs2_ordered_writepage,
1133 .readpage = gfs2_readpage, 1133 .readpage = gfs2_readpage,
1134 .readpages = gfs2_readpages, 1134 .readpages = gfs2_readpages,
1135 .sync_page = block_sync_page,
1136 .write_begin = gfs2_write_begin, 1135 .write_begin = gfs2_write_begin,
1137 .write_end = gfs2_write_end, 1136 .write_end = gfs2_write_end,
1138 .set_page_dirty = gfs2_set_page_dirty, 1137 .set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1149,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
1150 .writepages = gfs2_jdata_writepages, 1149 .writepages = gfs2_jdata_writepages,
1151 .readpage = gfs2_readpage, 1150 .readpage = gfs2_readpage,
1152 .readpages = gfs2_readpages, 1151 .readpages = gfs2_readpages,
1153 .sync_page = block_sync_page,
1154 .write_begin = gfs2_write_begin, 1152 .write_begin = gfs2_write_begin,
1155 .write_end = gfs2_write_end, 1153 .write_end = gfs2_write_end,
1156 .set_page_dirty = gfs2_set_page_dirty, 1154 .set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef1..ef3dc4b9fae2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
21#include "meta_io.h" 21#include "meta_io.h"
22#include "quota.h" 22#include "quota.h"
23#include "rgrp.h" 23#include "rgrp.h"
24#include "super.h"
24#include "trans.h" 25#include "trans.h"
25#include "dir.h" 26#include "dir.h"
26#include "util.h" 27#include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
757 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
758 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
759 u64 bn, bstart; 760 u64 bn, bstart;
760 u32 blen; 761 u32 blen, btotal;
761 __be64 *p; 762 __be64 *p;
762 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
763 int metadata; 764 int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
839 840
840 bstart = 0; 841 bstart = 0;
841 blen = 0; 842 blen = 0;
843 btotal = 0;
842 844
843 for (p = top; p < bottom; p++) { 845 for (p = top; p < bottom; p++) {
844 if (!*p) 846 if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
851 else { 853 else {
852 if (bstart) { 854 if (bstart) {
853 if (metadata) 855 if (metadata)
854 gfs2_free_meta(ip, bstart, blen); 856 __gfs2_free_meta(ip, bstart, blen);
855 else 857 else
856 gfs2_free_data(ip, bstart, blen); 858 __gfs2_free_data(ip, bstart, blen);
859
860 btotal += blen;
857 } 861 }
858 862
859 bstart = bn; 863 bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
865 } 869 }
866 if (bstart) { 870 if (bstart) {
867 if (metadata) 871 if (metadata)
868 gfs2_free_meta(ip, bstart, blen); 872 __gfs2_free_meta(ip, bstart, blen);
869 else 873 else
870 gfs2_free_data(ip, bstart, blen); 874 __gfs2_free_data(ip, bstart, blen);
875
876 btotal += blen;
871 } 877 }
872 878
879 gfs2_statfs_change(sdp, 0, +btotal, 0);
880 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
881 ip->i_inode.i_gid);
882
873 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 883 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
874 884
875 gfs2_dinode_out(ip, dibh->b_data); 885 gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
44 int error; 44 int error;
45 int had_lock = 0; 45 int had_lock = 0;
46 46
47 if (nd->flags & LOOKUP_RCU) 47 if (nd && nd->flags & LOOKUP_RCU)
48 return -ECHILD; 48 return -ECHILD;
49 49
50 parent = dget_parent(dentry); 50 parent = dget_parent(dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb913363..b2682e073eee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 goto out_drop_write; 221 goto out_drop_write;
222 222
223 error = -EACCES; 223 error = -EACCES;
224 if (!is_owner_or_cap(inode)) 224 if (!inode_owner_or_capable(inode))
225 goto out; 225 goto out;
226 226
227 error = 0; 227 error = 0;
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
448{ 448{
449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
450 450
451 if (!(file->f_flags & O_NOATIME)) { 451 if (!(file->f_flags & O_NOATIME) &&
452 !IS_NOATIME(&ip->i_inode)) {
452 struct gfs2_holder i_gh; 453 struct gfs2_holder i_gh;
453 int error; 454 int error;
454 455
455 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 456 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
456 error = gfs2_glock_nq(&i_gh); 457 error = gfs2_glock_nq(&i_gh);
457 file_accessed(file); 458 if (error == 0) {
458 if (error == 0) 459 file_accessed(file);
459 gfs2_glock_dq_uninit(&i_gh); 460 gfs2_glock_dq(&i_gh);
461 }
462 gfs2_holder_uninit(&i_gh);
463 if (error)
464 return error;
460 } 465 }
461 vma->vm_ops = &gfs2_vm_ops; 466 vma->vm_ops = &gfs2_vm_ops;
462 vma->vm_flags |= VM_CAN_NONLINEAR; 467 vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
617{ 622{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619 624
620 page_zero_new_buffers(page, from, to); 625 zero_user(page, from, to-from);
621 flush_dcache_page(page);
622 mark_page_accessed(page); 626 mark_page_accessed(page);
623 627
624 if (!gfs2_is_writeback(ip)) 628 if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
627 block_commit_write(page, from, to); 631 block_commit_write(page, from, to);
628} 632}
629 633
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 634static int needs_empty_write(sector_t block, struct inode *inode)
631{ 635{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error; 636 int error;
637 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
635 638
636 if (!page_has_buffers(page)) { 639 bh_map.b_size = 1 << inode->i_blkbits;
637 error = __block_write_begin(page, from, to - from, gfs2_block_map); 640 error = gfs2_block_map(inode, block, &bh_map, 0);
638 if (unlikely(error)) 641 if (unlikely(error))
639 return error; 642 return error;
643 return !buffer_mapped(&bh_map);
644}
640 645
641 empty_write_end(page, from, to); 646static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
642 return 0; 647{
643 } 648 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize;
650 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
651 int ret;
644 652
645 bh = head = page_buffers(page); 653 blksize = 1 << inode->i_blkbits;
646 next = end = 0; 654 next = end = 0;
647 while (next < from) { 655 while (next < from) {
648 next += bh->b_size; 656 next += blksize;
649 bh = bh->b_this_page; 657 block++;
650 } 658 }
651 start = next; 659 start = next;
652 do { 660 do {
653 next += bh->b_size; 661 next += blksize;
654 if (buffer_mapped(bh)) { 662 ret = needs_empty_write(block, inode);
663 if (unlikely(ret < 0))
664 return ret;
665 if (ret == 0) {
655 if (end) { 666 if (end) {
656 error = __block_write_begin(page, start, end - start, 667 ret = __block_write_begin(page, start, end - start,
657 gfs2_block_map); 668 gfs2_block_map);
658 if (unlikely(error)) 669 if (unlikely(ret))
659 return error; 670 return ret;
660 empty_write_end(page, start, end); 671 empty_write_end(page, start, end);
661 end = 0; 672 end = 0;
662 } 673 }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
664 } 675 }
665 else 676 else
666 end = next; 677 end = next;
667 bh = bh->b_this_page; 678 block++;
668 } while (next < to); 679 } while (next < to);
669 680
670 if (end) { 681 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map); 682 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error)) 683 if (unlikely(ret))
673 return error; 684 return ret;
674 empty_write_end(page, start, end); 685 empty_write_end(page, start, end);
675 } 686 }
676 687
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
976 987
977 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
978 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
979 if (fl_gh->gh_gl) 990 if (fl_gh->gh_gl) {
980 gfs2_glock_dq_uninit(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh);
993 }
981 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
982} 995}
983 996
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..e2431313491f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/rcupdate.h>
30#include <linux/rculist_bl.h>
31#include <linux/bit_spinlock.h>
29 32
30#include "gfs2.h" 33#include "gfs2.h"
31#include "incore.h" 34#include "incore.h"
@@ -41,10 +44,6 @@
41#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
42#include "trace_gfs2.h" 45#include "trace_gfs2.h"
43 46
44struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list;
46};
47
48struct gfs2_glock_iter { 47struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 48 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 49 struct gfs2_sbd *sdp; /* incore superblock */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
54 53
55typedef void (*glock_examiner) (struct gfs2_glock * gl); 54typedef void (*glock_examiner) (struct gfs2_glock * gl);
56 55
57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 56static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 57#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
70#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 68#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
71#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 69#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
72 70
73static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 71static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
74static struct dentry *gfs2_root; 72static struct dentry *gfs2_root;
75 73
76/*
77 * Despite what you might think, the numbers below are not arbitrary :-)
78 * They are taken from the ipv4 routing hash code, which is well tested
79 * and thus should be nearly optimal. Later on we might tweek the numbers
80 * but for now this should be fine.
81 *
82 * The reason for putting the locks in a separate array from the list heads
83 * is that we can have fewer locks than list heads and save memory. We use
84 * the same hash function for both, but with a different hash mask.
85 */
86#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
87 defined(CONFIG_PROVE_LOCKING)
88
89#ifdef CONFIG_LOCKDEP
90# define GL_HASH_LOCK_SZ 256
91#else
92# if NR_CPUS >= 32
93# define GL_HASH_LOCK_SZ 4096
94# elif NR_CPUS >= 16
95# define GL_HASH_LOCK_SZ 2048
96# elif NR_CPUS >= 8
97# define GL_HASH_LOCK_SZ 1024
98# elif NR_CPUS >= 4
99# define GL_HASH_LOCK_SZ 512
100# else
101# define GL_HASH_LOCK_SZ 256
102# endif
103#endif
104
105/* We never want more locks than chains */
106#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
107# undef GL_HASH_LOCK_SZ
108# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
109#endif
110
111static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
112
113static inline rwlock_t *gl_lock_addr(unsigned int x)
114{
115 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
116}
117#else /* not SMP, so no spinlocks required */
118static inline rwlock_t *gl_lock_addr(unsigned int x)
119{
120 return NULL;
121}
122#endif
123
124/** 74/**
125 * gl_hash() - Turn glock number into hash bucket number 75 * gl_hash() - Turn glock number into hash bucket number
126 * @lock: The glock number 76 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
141 return h; 91 return h;
142} 92}
143 93
144/** 94static inline void spin_lock_bucket(unsigned int hash)
145 * glock_free() - Perform a few checks and then release struct gfs2_glock 95{
146 * @gl: The glock to release 96 struct hlist_bl_head *bl = &gl_hash_table[hash];
147 * 97 bit_spin_lock(0, (unsigned long *)bl);
148 * Also calls lock module to release its internal structure for this glock. 98}
149 *
150 */
151 99
152static void glock_free(struct gfs2_glock *gl) 100static inline void spin_unlock_bucket(unsigned int hash)
101{
102 struct hlist_bl_head *bl = &gl_hash_table[hash];
103 __bit_spin_unlock(0, (unsigned long *)bl);
104}
105
106static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109
110 if (gl->gl_ops->go_flags & GLOF_ASPACE)
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116void gfs2_glock_free(struct gfs2_glock *gl)
153{ 117{
154 struct gfs2_sbd *sdp = gl->gl_sbd; 118 struct gfs2_sbd *sdp = gl->gl_sbd;
155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
157 119
158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 120 call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
159 trace_gfs2_glock_put(gl); 121 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
160 if (mapping) 122 wake_up(&sdp->sd_glock_wait);
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 123}
164 124
165/** 125/**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
185{ 145{
186 const struct gfs2_glock_operations *glops = gl->gl_ops; 146 const struct gfs2_glock_operations *glops = gl->gl_ops;
187 147
148 /* assert_spin_locked(&gl->gl_spin); */
149
188 if (gl->gl_state == LM_ST_UNLOCKED) 150 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0; 151 return 0;
190 if (!list_empty(&gl->gl_holders)) 152 if (test_bit(GLF_LFLUSH, &gl->gl_flags))
153 return 0;
154 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
155 !list_empty(&gl->gl_holders))
191 return 0; 156 return 0;
192 if (glops->go_demote_ok) 157 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl); 158 return glops->go_demote_ok(gl);
194 return 1; 159 return 1;
195} 160}
196 161
162
197/** 163/**
198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 164 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
199 * @gl: the glock 165 * @gl: the glock
200 * 166 *
167 * If the glock is demotable, then we add it (or move it) to the end
168 * of the glock LRU list.
201 */ 169 */
202 170
203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 171static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
204{ 172{
205 int may_reclaim; 173 if (demote_ok(gl)) {
206 may_reclaim = (demote_ok(gl) && 174 spin_lock(&lru_lock);
207 (atomic_read(&gl->gl_ref) == 1 || 175
208 (gl->gl_name.ln_type == LM_TYPE_INODE && 176 if (!list_empty(&gl->gl_lru))
209 atomic_read(&gl->gl_ref) <= 2))); 177 list_del_init(&gl->gl_lru);
210 spin_lock(&lru_lock); 178 else
211 if (list_empty(&gl->gl_lru) && may_reclaim) { 179 atomic_inc(&lru_count);
180
212 list_add_tail(&gl->gl_lru, &lru_list); 181 list_add_tail(&gl->gl_lru, &lru_list);
213 atomic_inc(&lru_count); 182 spin_unlock(&lru_lock);
214 } 183 }
215 spin_unlock(&lru_lock); 184}
185
186void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
187{
188 spin_lock(&gl->gl_spin);
189 __gfs2_glock_schedule_for_reclaim(gl);
190 spin_unlock(&gl->gl_spin);
216} 191}
217 192
218/** 193/**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{ 202{
228 if (atomic_dec_and_test(&gl->gl_ref)) 203 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1); 204 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231} 205}
232 206
233/** 207/**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
236 * 210 *
237 */ 211 */
238 212
239int gfs2_glock_put(struct gfs2_glock *gl) 213void gfs2_glock_put(struct gfs2_glock *gl)
240{ 214{
241 int rv = 0; 215 struct gfs2_sbd *sdp = gl->gl_sbd;
216 struct address_space *mapping = gfs2_glock2aspace(gl);
242 217
243 write_lock(gl_lock_addr(gl->gl_hash)); 218 if (atomic_dec_and_test(&gl->gl_ref)) {
244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { 219 spin_lock_bucket(gl->gl_hash);
245 hlist_del(&gl->gl_list); 220 hlist_bl_del_rcu(&gl->gl_list);
221 spin_unlock_bucket(gl->gl_hash);
222 spin_lock(&lru_lock);
246 if (!list_empty(&gl->gl_lru)) { 223 if (!list_empty(&gl->gl_lru)) {
247 list_del_init(&gl->gl_lru); 224 list_del_init(&gl->gl_lru);
248 atomic_dec(&lru_count); 225 atomic_dec(&lru_count);
249 } 226 }
250 spin_unlock(&lru_lock); 227 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 228 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
253 glock_free(gl); 229 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
254 rv = 1; 230 trace_gfs2_glock_put(gl);
255 goto out; 231 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
256 } 232 }
257 spin_lock(&gl->gl_spin);
258 gfs2_glock_schedule_for_reclaim(gl);
259 spin_unlock(&gl->gl_spin);
260 write_unlock(gl_lock_addr(gl->gl_hash));
261out:
262 return rv;
263} 233}
264 234
265/** 235/**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
275 const struct lm_lockname *name) 245 const struct lm_lockname *name)
276{ 246{
277 struct gfs2_glock *gl; 247 struct gfs2_glock *gl;
278 struct hlist_node *h; 248 struct hlist_bl_node *h;
279 249
280 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { 250 hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
281 if (!lm_name_equal(&gl->gl_name, name)) 251 if (!lm_name_equal(&gl->gl_name, name))
282 continue; 252 continue;
283 if (gl->gl_sbd != sdp) 253 if (gl->gl_sbd != sdp)
284 continue; 254 continue;
285 255 if (atomic_inc_not_zero(&gl->gl_ref))
286 atomic_inc(&gl->gl_ref); 256 return gl;
287
288 return gl;
289 } 257 }
290 258
291 return NULL; 259 return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
743 struct gfs2_glock *gl, *tmp; 711 struct gfs2_glock *gl, *tmp;
744 unsigned int hash = gl_hash(sdp, &name); 712 unsigned int hash = gl_hash(sdp, &name);
745 struct address_space *mapping; 713 struct address_space *mapping;
714 struct kmem_cache *cachep;
746 715
747 read_lock(gl_lock_addr(hash)); 716 rcu_read_lock();
748 gl = search_bucket(hash, sdp, &name); 717 gl = search_bucket(hash, sdp, &name);
749 read_unlock(gl_lock_addr(hash)); 718 rcu_read_unlock();
750 719
751 *glp = gl; 720 *glp = gl;
752 if (gl) 721 if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
755 return -ENOENT; 724 return -ENOENT;
756 725
757 if (glops->go_flags & GLOF_ASPACE) 726 if (glops->go_flags & GLOF_ASPACE)
758 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); 727 cachep = gfs2_glock_aspace_cachep;
759 else 728 else
760 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 729 cachep = gfs2_glock_cachep;
730 gl = kmem_cache_alloc(cachep, GFP_KERNEL);
761 if (!gl) 731 if (!gl)
762 return -ENOMEM; 732 return -ENOMEM;
763 733
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
790 mapping->writeback_index = 0; 760 mapping->writeback_index = 0;
791 } 761 }
792 762
793 write_lock(gl_lock_addr(hash)); 763 spin_lock_bucket(hash);
794 tmp = search_bucket(hash, sdp, &name); 764 tmp = search_bucket(hash, sdp, &name);
795 if (tmp) { 765 if (tmp) {
796 write_unlock(gl_lock_addr(hash)); 766 spin_unlock_bucket(hash);
797 glock_free(gl); 767 kmem_cache_free(cachep, gl);
768 atomic_dec(&sdp->sd_glock_disposal);
798 gl = tmp; 769 gl = tmp;
799 } else { 770 } else {
800 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); 771 hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
801 write_unlock(gl_lock_addr(hash)); 772 spin_unlock_bucket(hash);
802 } 773 }
803 774
804 *glp = gl; 775 *glp = gl;
@@ -1007,13 +978,13 @@ fail:
1007 insert_pt = &gh2->gh_list; 978 insert_pt = &gh2->gh_list;
1008 } 979 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags); 980 set_bit(GLF_QUEUED, &gl->gl_flags);
981 trace_gfs2_glock_queue(gh, 1);
1010 if (likely(insert_pt == NULL)) { 982 if (likely(insert_pt == NULL)) {
1011 list_add_tail(&gh->gh_list, &gl->gl_holders); 983 list_add_tail(&gh->gh_list, &gl->gl_holders);
1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 984 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
1013 goto do_cancel; 985 goto do_cancel;
1014 return; 986 return;
1015 } 987 }
1016 trace_gfs2_glock_queue(gh, 1);
1017 list_add_tail(&gh->gh_list, insert_pt); 988 list_add_tail(&gh->gh_list, insert_pt);
1018do_cancel: 989do_cancel:
1019 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 990 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1113 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1084 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1114 fast_path = 1; 1085 fast_path = 1;
1115 } 1086 }
1087 __gfs2_glock_schedule_for_reclaim(gl);
1116 trace_gfs2_glock_queue(gh, 0); 1088 trace_gfs2_glock_queue(gh, 0);
1117 spin_unlock(&gl->gl_spin); 1089 spin_unlock(&gl->gl_spin);
1118 if (likely(fast_path)) 1090 if (likely(fast_path))
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1276 1248
1277void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1249void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1278{ 1250{
1279 unsigned int x; 1251 while (num_gh--)
1280 1252 gfs2_glock_dq(&ghs[num_gh]);
1281 for (x = 0; x < num_gh; x++)
1282 gfs2_glock_dq(&ghs[x]);
1283} 1253}
1284 1254
1285/** 1255/**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1291 1261
1292void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) 1262void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1293{ 1263{
1294 unsigned int x; 1264 while (num_gh--)
1295 1265 gfs2_glock_dq_uninit(&ghs[num_gh]);
1296 for (x = 0; x < num_gh; x++)
1297 gfs2_glock_dq_uninit(&ghs[x]);
1298} 1266}
1299 1267
1300void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) 1268void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
1440 * @sdp: the filesystem 1408 * @sdp: the filesystem
1441 * @bucket: the bucket 1409 * @bucket: the bucket
1442 * 1410 *
1443 * Returns: 1 if the bucket has entries
1444 */ 1411 */
1445 1412
1446static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, 1413static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
1447 unsigned int hash) 1414 unsigned int hash)
1448{ 1415{
1449 struct gfs2_glock *gl, *prev = NULL; 1416 struct gfs2_glock *gl;
1450 int has_entries = 0; 1417 struct hlist_bl_head *head = &gl_hash_table[hash];
1451 struct hlist_head *head = &gl_hash_table[hash].hb_list; 1418 struct hlist_bl_node *pos;
1452 1419
1453 read_lock(gl_lock_addr(hash)); 1420 rcu_read_lock();
1454 /* Can't use hlist_for_each_entry - don't want prefetch here */ 1421 hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
1455 if (hlist_empty(head)) 1422 if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
1456 goto out;
1457 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1458 while(1) {
1459 if (!sdp || gl->gl_sbd == sdp) {
1460 gfs2_glock_hold(gl);
1461 read_unlock(gl_lock_addr(hash));
1462 if (prev)
1463 gfs2_glock_put(prev);
1464 prev = gl;
1465 examiner(gl); 1423 examiner(gl);
1466 has_entries = 1;
1467 read_lock(gl_lock_addr(hash));
1468 }
1469 if (gl->gl_list.next == NULL)
1470 break;
1471 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1472 } 1424 }
1473out: 1425 rcu_read_unlock();
1474 read_unlock(gl_lock_addr(hash));
1475 if (prev)
1476 gfs2_glock_put(prev);
1477 cond_resched(); 1426 cond_resched();
1478 return has_entries; 1427}
1428
1429static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(examiner, sdp, x);
1479} 1435}
1480 1436
1481 1437
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
1529 1485
1530void gfs2_glock_thaw(struct gfs2_sbd *sdp) 1486void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1531{ 1487{
1532 unsigned x; 1488 glock_hash_walk(thaw_glock, sdp);
1489}
1533 1490
1534 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) 1491static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1535 examine_bucket(thaw_glock, sdp, x); 1492{
1493 int ret;
1494 spin_lock(&gl->gl_spin);
1495 ret = __dump_glock(seq, gl);
1496 spin_unlock(&gl->gl_spin);
1497 return ret;
1498}
1499
1500static void dump_glock_func(struct gfs2_glock *gl)
1501{
1502 dump_glock(NULL, gl);
1536} 1503}
1537 1504
1538/** 1505/**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1545 1512
1546void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1513void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1547{ 1514{
1548 unsigned int x; 1515 glock_hash_walk(clear_glock, sdp);
1549
1550 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1551 examine_bucket(clear_glock, sdp, x);
1552 flush_workqueue(glock_workqueue); 1516 flush_workqueue(glock_workqueue);
1553 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1517 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1554 gfs2_dump_lockstate(sdp); 1518 glock_hash_walk(dump_glock_func, sdp);
1555} 1519}
1556 1520
1557void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1521void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,73 +1681,22 @@ out:
1717 return error; 1681 return error;
1718} 1682}
1719 1683
1720static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1721{
1722 int ret;
1723 spin_lock(&gl->gl_spin);
1724 ret = __dump_glock(seq, gl);
1725 spin_unlock(&gl->gl_spin);
1726 return ret;
1727}
1728 1684
1729/**
1730 * gfs2_dump_lockstate - print out the current lockstate
1731 * @sdp: the filesystem
1732 * @ub: the buffer to copy the information into
1733 *
1734 * If @ub is NULL, dump the lockstate to the console.
1735 *
1736 */
1737
1738static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1739{
1740 struct gfs2_glock *gl;
1741 struct hlist_node *h;
1742 unsigned int x;
1743 int error = 0;
1744
1745 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1746
1747 read_lock(gl_lock_addr(x));
1748
1749 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
1750 if (gl->gl_sbd != sdp)
1751 continue;
1752
1753 error = dump_glock(NULL, gl);
1754 if (error)
1755 break;
1756 }
1757
1758 read_unlock(gl_lock_addr(x));
1759
1760 if (error)
1761 break;
1762 }
1763
1764
1765 return error;
1766}
1767 1685
1768 1686
1769int __init gfs2_glock_init(void) 1687int __init gfs2_glock_init(void)
1770{ 1688{
1771 unsigned i; 1689 unsigned i;
1772 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { 1690 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
1773 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); 1691 INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
1774 }
1775#ifdef GL_HASH_LOCK_SZ
1776 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
1777 rwlock_init(&gl_hash_locks[i]);
1778 } 1692 }
1779#endif
1780 1693
1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | 1694 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1695 WQ_HIGHPRI | WQ_FREEZABLE, 0);
1783 if (IS_ERR(glock_workqueue)) 1696 if (IS_ERR(glock_workqueue))
1784 return PTR_ERR(glock_workqueue); 1697 return PTR_ERR(glock_workqueue);
1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", 1698 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 1699 WQ_MEM_RECLAIM | WQ_FREEZABLE,
1787 0); 1700 0);
1788 if (IS_ERR(gfs2_delete_workqueue)) { 1701 if (IS_ERR(gfs2_delete_workqueue)) {
1789 destroy_workqueue(glock_workqueue); 1702 destroy_workqueue(glock_workqueue);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
1802 destroy_workqueue(gfs2_delete_workqueue); 1715 destroy_workqueue(gfs2_delete_workqueue);
1803} 1716}
1804 1717
1718static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
1719{
1720 return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
1721 struct gfs2_glock, gl_list);
1722}
1723
1724static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
1725{
1726 return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
1727 struct gfs2_glock, gl_list);
1728}
1729
1805static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1730static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1806{ 1731{
1807 struct gfs2_glock *gl; 1732 struct gfs2_glock *gl;
1808 1733
1809restart: 1734 do {
1810 read_lock(gl_lock_addr(gi->hash)); 1735 gl = gi->gl;
1811 gl = gi->gl; 1736 if (gl) {
1812 if (gl) { 1737 gi->gl = glock_hash_next(gl);
1813 gi->gl = hlist_entry(gl->gl_list.next, 1738 } else {
1814 struct gfs2_glock, gl_list); 1739 gi->gl = glock_hash_chain(gi->hash);
1815 } else { 1740 }
1816 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, 1741 while (gi->gl == NULL) {
1817 struct gfs2_glock, gl_list); 1742 gi->hash++;
1818 } 1743 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1819 if (gi->gl) 1744 rcu_read_unlock();
1820 gfs2_glock_hold(gi->gl); 1745 return 1;
1821 read_unlock(gl_lock_addr(gi->hash)); 1746 }
1822 if (gl) 1747 gi->gl = glock_hash_chain(gi->hash);
1823 gfs2_glock_put(gl); 1748 }
1824 while (gi->gl == NULL) { 1749 /* Skip entries for other sb and dead entries */
1825 gi->hash++; 1750 } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
1826 if (gi->hash >= GFS2_GL_HASH_SIZE)
1827 return 1;
1828 read_lock(gl_lock_addr(gi->hash));
1829 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1830 struct gfs2_glock, gl_list);
1831 if (gi->gl)
1832 gfs2_glock_hold(gi->gl);
1833 read_unlock(gl_lock_addr(gi->hash));
1834 }
1835
1836 if (gi->sdp != gi->gl->gl_sbd)
1837 goto restart;
1838 1751
1839 return 0; 1752 return 0;
1840} 1753}
1841 1754
1842static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
1843{
1844 if (gi->gl)
1845 gfs2_glock_put(gi->gl);
1846 gi->gl = NULL;
1847}
1848
1849static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) 1755static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
1850{ 1756{
1851 struct gfs2_glock_iter *gi = seq->private; 1757 struct gfs2_glock_iter *gi = seq->private;
1852 loff_t n = *pos; 1758 loff_t n = *pos;
1853 1759
1854 gi->hash = 0; 1760 gi->hash = 0;
1761 rcu_read_lock();
1855 1762
1856 do { 1763 do {
1857 if (gfs2_glock_iter_next(gi)) { 1764 if (gfs2_glock_iter_next(gi))
1858 gfs2_glock_iter_free(gi);
1859 return NULL; 1765 return NULL;
1860 }
1861 } while (n--); 1766 } while (n--);
1862 1767
1863 return gi->gl; 1768 return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1870 1775
1871 (*pos)++; 1776 (*pos)++;
1872 1777
1873 if (gfs2_glock_iter_next(gi)) { 1778 if (gfs2_glock_iter_next(gi))
1874 gfs2_glock_iter_free(gi);
1875 return NULL; 1779 return NULL;
1876 }
1877 1780
1878 return gi->gl; 1781 return gi->gl;
1879} 1782}
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1881static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) 1784static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1882{ 1785{
1883 struct gfs2_glock_iter *gi = seq->private; 1786 struct gfs2_glock_iter *gi = seq->private;
1884 gfs2_glock_iter_free(gi); 1787
1788 if (gi->gl)
1789 rcu_read_unlock();
1790 gi->gl = NULL;
1885} 1791}
1886 1792
1887static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1793static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb615..aea160690e94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
119 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
120 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct gfs2_glock *gl);
122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
123 unsigned int flags); 123 unsigned int flags);
124 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
174 int create, struct gfs2_glock **glp); 174 int create, struct gfs2_glock **glp);
175void gfs2_glock_hold(struct gfs2_glock *gl); 175void gfs2_glock_hold(struct gfs2_glock *gl);
176void gfs2_glock_put_nolock(struct gfs2_glock *gl); 176void gfs2_glock_put_nolock(struct gfs2_glock *gl);
177int gfs2_glock_put(struct gfs2_glock *gl); 177void gfs2_glock_put(struct gfs2_glock *gl);
178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
179 struct gfs2_holder *gh); 179 struct gfs2_holder *gh);
180void gfs2_holder_reinit(unsigned int state, unsigned flags, 180void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
223 return error; 223 return error;
224} 224}
225 225
226/* Lock Value Block functions */ 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228int gfs2_lvb_hold(struct gfs2_glock *gl); 228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229void gfs2_lvb_unhold(struct gfs2_glock *gl); 229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230 230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
233void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 233extern void gfs2_glock_free(struct gfs2_glock *gl);
234void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 234
235void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 235extern int __init gfs2_glock_init(void);
236void gfs2_glock_thaw(struct gfs2_sbd *sdp); 236extern void gfs2_glock_exit(void);
237 237
238int __init gfs2_glock_init(void); 238extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
239void gfs2_glock_exit(void); 239extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
240 240extern int gfs2_register_debugfs(void);
241int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 241extern void gfs2_unregister_debugfs(void);
242void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
243int gfs2_register_debugfs(void);
244void gfs2_unregister_debugfs(void);
245 242
246extern const struct lm_lockops gfs2_dlm_ops; 243extern const struct lm_lockops gfs2_dlm_ops;
247 244
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a50..3754e3cbf02b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 BUG_ON(current->journal_info); 56 BUG_ON(current->journal_info);
57 current->journal_info = &tr; 57 current->journal_info = &tr;
58 58
59 gfs2_log_lock(sdp); 59 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 60 while (!list_empty(head)) {
61 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
62 bd_ail_gl_list); 62 bd_ail_gl_list);
63 bh = bd->bd_bh; 63 bh = bd->bd_bh;
64 gfs2_remove_from_ail(bd); 64 gfs2_remove_from_ail(bd);
65 spin_unlock(&sdp->sd_ail_lock);
66
65 bd->bd_bh = NULL; 67 bd->bd_bh = NULL;
66 bh->b_private = NULL; 68 bh->b_private = NULL;
67 bd->bd_blkno = bh->b_blocknr; 69 bd->bd_blkno = bh->b_blocknr;
70 gfs2_log_lock(sdp);
68 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 71 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
69 gfs2_trans_add_revoke(sdp, bd); 72 gfs2_trans_add_revoke(sdp, bd);
73 gfs2_log_unlock(sdp);
74
75 spin_lock(&sdp->sd_ail_lock);
70 } 76 }
71 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
72 gfs2_log_unlock(sdp); 78 spin_unlock(&sdp->sd_ail_lock);
73 79
74 gfs2_trans_end(sdp); 80 gfs2_trans_end(sdp);
75 gfs2_log_flush(sdp, NULL); 81 gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
206static int inode_go_demote_ok(const struct gfs2_glock *gl) 212static int inode_go_demote_ok(const struct gfs2_glock *gl)
207{ 213{
208 struct gfs2_sbd *sdp = gl->gl_sbd; 214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 struct gfs2_holder *gh;
216
209 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) 217 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
210 return 0; 218 return 0;
219
220 if (!list_empty(&gl->gl_holders)) {
221 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
222 if (gh->gh_list.next != &gl->gl_holders)
223 return 0;
224 }
225
211 return 1; 226 return 1;
212} 227}
213 228
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
272} 287}
273 288
274/** 289/**
275 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
276 * @gl: the glock
277 *
278 * Returns: 1 if it's ok
279 */
280
281static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
282{
283 const struct address_space *mapping = (const struct address_space *)(gl + 1);
284 return !mapping->nrpages;
285}
286
287/**
288 * rgrp_go_lock - operation done after an rgrp lock is locked by 290 * rgrp_go_lock - operation done after an rgrp lock is locked by
289 * a first holder on this node. 291 * a first holder on this node.
290 * @gl: the glock 292 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
410const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
411 .go_xmote_th = rgrp_go_sync, 413 .go_xmote_th = rgrp_go_sync,
412 .go_inval = rgrp_go_inval, 414 .go_inval = rgrp_go_inval,
413 .go_demote_ok = rgrp_go_demote_ok,
414 .go_lock = rgrp_go_lock, 415 .go_lock = rgrp_go_lock,
415 .go_unlock = rgrp_go_unlock, 416 .go_unlock = rgrp_go_unlock,
416 .go_dump = gfs2_rgrp_dump, 417 .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c06275..870a89d6d4dc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/dlm.h> 16#include <linux/dlm.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h>
18 20
19#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
20#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
@@ -201,7 +203,7 @@ enum {
201}; 203};
202 204
203struct gfs2_glock { 205struct gfs2_glock {
204 struct hlist_node gl_list; 206 struct hlist_bl_node gl_list;
205 unsigned long gl_flags; /* GLF_... */ 207 unsigned long gl_flags; /* GLF_... */
206 struct lm_lockname gl_name; 208 struct lm_lockname gl_name;
207 atomic_t gl_ref; 209 atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
234 atomic_t gl_ail_count; 236 atomic_t gl_ail_count;
235 struct delayed_work gl_work; 237 struct delayed_work gl_work;
236 struct work_struct gl_delete; 238 struct work_struct gl_delete;
239 struct rcu_head gl_rcu;
237}; 240};
238 241
239#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 242#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
314 QDF_USER = 0, 317 QDF_USER = 0,
315 QDF_CHANGE = 1, 318 QDF_CHANGE = 1,
316 QDF_LOCKED = 2, 319 QDF_LOCKED = 2,
320 QDF_REFRESH = 3,
317}; 321};
318 322
319struct gfs2_quota_data { 323struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
647 unsigned int sd_log_flush_head; 651 unsigned int sd_log_flush_head;
648 u64 sd_log_flush_wrapped; 652 u64 sd_log_flush_wrapped;
649 653
654 spinlock_t sd_ail_lock;
650 struct list_head sd_ail1_list; 655 struct list_head sd_ail1_list;
651 struct list_head sd_ail2_list; 656 struct list_head sd_ail2_list;
652 u64 sd_ail_sync_gen; 657 u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7aa7d4f8984a..97d54a28776a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -763,14 +763,15 @@ fail:
763 return error; 763 return error;
764} 764}
765 765
766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) 766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
767 const struct qstr *qstr)
767{ 768{
768 int err; 769 int err;
769 size_t len; 770 size_t len;
770 void *value; 771 void *value;
771 char *name; 772 char *name;
772 773
773 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 774 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
774 &name, &value, &len); 775 &name, &value, &len);
775 776
776 if (err) { 777 if (err) {
@@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
854 if (error) 855 if (error)
855 goto fail_gunlock2; 856 goto fail_gunlock2;
856 857
857 error = gfs2_security_init(dip, GFS2_I(inode)); 858 error = gfs2_security_init(dip, GFS2_I(inode), name);
858 if (error) 859 if (error)
859 goto fail_gunlock2; 860 goto fail_gunlock2;
860 861
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f8..98c80d8c2a62 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
22{ 22{
23 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
24 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
26 25
27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 26 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
28 27
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
31 30
32 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
34 if (gl->gl_ops->go_flags & GLOF_ASPACE) 33 gfs2_glock_free(gl);
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
40 return; 34 return;
41 case -DLM_ECANCEL: /* Cancel while getting lock */ 35 case -DLM_ECANCEL: /* Cancel while getting lock */
42 ret |= LM_OUT_CANCELED; 36 ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 158 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
165} 159}
166 160
167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 161static void gdlm_put_lock(struct gfs2_glock *gl)
168{ 162{
169 struct gfs2_sbd *sdp = gl->gl_sbd; 163 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 164 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 165 int error;
172 166
173 if (gl->gl_lksb.sb_lkid == 0) { 167 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 168 gfs2_glock_free(gl);
175 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
176 wake_up(&sdp->sd_glock_wait);
177 return; 169 return;
178 } 170 }
179 171
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..5b102c1887fd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
67 * @mapping: The associated mapping (maybe NULL) 67 * @mapping: The associated mapping (maybe NULL)
68 * @bd: The gfs2_bufdata to remove 68 * @bd: The gfs2_bufdata to remove
69 * 69 *
70 * The log lock _must_ be held when calling this function 70 * The ail lock _must_ be held when calling this function
71 * 71 *
72 */ 72 */
73 73
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
88 */ 88 */
89 89
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
91__releases(&sdp->sd_log_lock) 91__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_log_lock) 92__acquires(&sdp->sd_ail_lock)
93{ 93{
94 struct gfs2_bufdata *bd, *s; 94 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 95 struct buffer_head *bh;
@@ -117,16 +117,16 @@ __acquires(&sdp->sd_log_lock)
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 118
119 get_bh(bh); 119 get_bh(bh);
120 gfs2_log_unlock(sdp); 120 spin_unlock(&sdp->sd_ail_lock);
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
124 submit_bh(WRITE_SYNC_PLUG, bh); 124 submit_bh(WRITE_SYNC, bh);
125 } else { 125 } else {
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
128 } 128 }
129 gfs2_log_lock(sdp); 129 spin_lock(&sdp->sd_ail_lock);
130 130
131 retry = 1; 131 retry = 1;
132 break; 132 break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
175 struct gfs2_ail *ai; 175 struct gfs2_ail *ai;
176 int done = 0; 176 int done = 0;
177 177
178 gfs2_log_lock(sdp); 178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list; 179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) { 180 if (list_empty(head)) {
181 gfs2_log_unlock(sdp); 181 spin_unlock(&sdp->sd_ail_lock);
182 return; 182 return;
183 } 183 }
184 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
189 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
190 continue; 190 continue;
191 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ 192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0; 193 done = 0;
194 break; 194 break;
195 } 195 }
196 } 196 }
197 197
198 gfs2_log_unlock(sdp); 198 spin_unlock(&sdp->sd_ail_lock);
199} 199}
200 200
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
203 struct gfs2_ail *ai, *s; 203 struct gfs2_ail *ai, *s;
204 int ret; 204 int ret;
205 205
206 gfs2_log_lock(sdp); 206 spin_lock(&sdp->sd_ail_lock);
207 207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 209 if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214 214
215 ret = list_empty(&sdp->sd_ail1_list); 215 ret = list_empty(&sdp->sd_ail1_list);
216 216
217 gfs2_log_unlock(sdp); 217 spin_unlock(&sdp->sd_ail_lock);
218 218
219 return ret; 219 return ret;
220} 220}
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
247 int wrap = (new_tail < old_tail); 247 int wrap = (new_tail < old_tail);
248 int a, b, rm; 248 int a, b, rm;
249 249
250 gfs2_log_lock(sdp); 250 spin_lock(&sdp->sd_ail_lock);
251 251
252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { 252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
253 a = (old_tail <= ai->ai_first); 253 a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
263 kfree(ai); 263 kfree(ai);
264 } 264 }
265 265
266 gfs2_log_unlock(sdp); 266 spin_unlock(&sdp->sd_ail_lock);
267} 267}
268 268
269/** 269/**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
421 struct gfs2_ail *ai; 421 struct gfs2_ail *ai;
422 unsigned int tail; 422 unsigned int tail;
423 423
424 gfs2_log_lock(sdp); 424 spin_lock(&sdp->sd_ail_lock);
425 425
426 if (list_empty(&sdp->sd_ail1_list)) { 426 if (list_empty(&sdp->sd_ail1_list)) {
427 tail = sdp->sd_log_head; 427 tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
430 tail = ai->ai_first; 430 tail = ai->ai_first;
431 } 431 }
432 432
433 gfs2_log_unlock(sdp); 433 spin_unlock(&sdp->sd_ail_lock);
434 434
435 return tail; 435 return tail;
436} 436}
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
647 lock_buffer(bh); 647 lock_buffer(bh);
648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
649 bh->b_end_io = end_buffer_write_sync; 649 bh->b_end_io = end_buffer_write_sync;
650 submit_bh(WRITE_SYNC_PLUG, bh); 650 submit_bh(WRITE_SYNC, bh);
651 } else { 651 } else {
652 unlock_buffer(bh); 652 unlock_buffer(bh);
653 brelse(bh); 653 brelse(bh);
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
743 sdp->sd_log_commited_databuf = 0; 743 sdp->sd_log_commited_databuf = 0;
744 sdp->sd_log_commited_revoke = 0; 744 sdp->sd_log_commited_revoke = 0;
745 745
746 spin_lock(&sdp->sd_ail_lock);
746 if (!list_empty(&ai->ai_ail1_list)) { 747 if (!list_empty(&ai->ai_ail1_list)) {
747 list_add(&ai->ai_list, &sdp->sd_ail1_list); 748 list_add(&ai->ai_list, &sdp->sd_ail1_list);
748 ai = NULL; 749 ai = NULL;
749 } 750 }
751 spin_unlock(&sdp->sd_ail_lock);
750 gfs2_log_unlock(sdp); 752 gfs2_log_unlock(sdp);
751 trace_gfs2_log_flush(sdp, 0); 753 trace_gfs2_log_flush(sdp, 0);
752 up_write(&sdp->sd_log_flush_lock); 754 up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..51d27f00ebb4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 /* If this buffer is in the AIL and it has already been written 51 /* If this buffer is in the AIL and it has already been written
52 * to in-place disk block, remove it from the AIL. 52 * to in-place disk block, remove it from the AIL.
53 */ 53 */
54 spin_lock(&sdp->sd_ail_lock);
54 if (bd->bd_ail) 55 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 56 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
57 spin_unlock(&sdp->sd_ail_lock);
56 get_bh(bh); 58 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned); 59 atomic_inc(&sdp->sd_log_pinned);
58 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
80 mark_buffer_dirty(bh); 82 mark_buffer_dirty(bh);
81 clear_buffer_pinned(bh); 83 clear_buffer_pinned(bh);
82 84
83 gfs2_log_lock(sdp); 85 spin_lock(&sdp->sd_ail_lock);
84 if (bd->bd_ail) { 86 if (bd->bd_ail) {
85 list_del(&bd->bd_ail_st_list); 87 list_del(&bd->bd_ail_st_list);
86 brelse(bh); 88 brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
91 } 93 }
92 bd->bd_ail = ai; 94 bd->bd_ail = ai;
93 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
94 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 96 spin_unlock(&sdp->sd_ail_lock);
97
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
95 trace_gfs2_pin(bd, 0); 100 trace_gfs2_pin(bd, 0);
96 gfs2_log_unlock(sdp);
97 unlock_buffer(bh); 101 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned); 102 atomic_dec(&sdp->sd_log_pinned);
99} 103}
@@ -200,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
200 } 204 }
201 205
202 gfs2_log_unlock(sdp); 206 gfs2_log_unlock(sdp);
203 submit_bh(WRITE_SYNC_PLUG, bh); 207 submit_bh(WRITE_SYNC, bh);
204 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
205 209
206 n = 0; 210 n = 0;
@@ -210,7 +214,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
210 gfs2_log_unlock(sdp); 214 gfs2_log_unlock(sdp);
211 lock_buffer(bd2->bd_bh); 215 lock_buffer(bd2->bd_bh);
212 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
213 submit_bh(WRITE_SYNC_PLUG, bh); 217 submit_bh(WRITE_SYNC, bh);
214 gfs2_log_lock(sdp); 218 gfs2_log_lock(sdp);
215 if (++n >= num) 219 if (++n >= num)
216 break; 220 break;
@@ -352,7 +356,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
352 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
353 357
354 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
355 submit_bh(WRITE_SYNC_PLUG, bh); 359 submit_bh(WRITE_SYNC, bh);
356 360
357 bh = gfs2_log_get_buf(sdp); 361 bh = gfs2_log_get_buf(sdp);
358 mh = (struct gfs2_meta_header *)bh->b_data; 362 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +373,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
369 } 373 }
370 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
371 375
372 submit_bh(WRITE_SYNC_PLUG, bh); 376 submit_bh(WRITE_SYNC, bh);
373} 377}
374 378
375static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 379static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +575,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
571 ptr = bh_log_ptr(bh); 575 ptr = bh_log_ptr(bh);
572 576
573 get_bh(bh); 577 get_bh(bh);
574 submit_bh(WRITE_SYNC_PLUG, bh); 578 submit_bh(WRITE_SYNC, bh);
575 gfs2_log_lock(sdp); 579 gfs2_log_lock(sdp);
576 while(!list_empty(list)) { 580 while(!list_empty(list)) {
577 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +601,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
597 } else { 601 } else {
598 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
599 } 603 }
600 submit_bh(WRITE_SYNC_PLUG, bh1); 604 submit_bh(WRITE_SYNC, bh1);
601 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
602 ptr += 2; 606 ptr += 2;
603 } 607 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..888a5f5a1a58 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h>
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18 20
19#include "gfs2.h" 21#include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
45{ 47{
46 struct gfs2_glock *gl = foo; 48 struct gfs2_glock *gl = foo;
47 49
48 INIT_HLIST_NODE(&gl->gl_list); 50 INIT_HLIST_BL_NODE(&gl->gl_list);
49 spin_lock_init(&gl->gl_spin); 51 spin_lock_init(&gl->gl_spin);
50 INIT_LIST_HEAD(&gl->gl_holders); 52 INIT_LIST_HEAD(&gl->gl_holders);
51 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
@@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
59 struct address_space *mapping = (struct address_space *)(gl + 1); 61 struct address_space *mapping = (struct address_space *)(gl + 1);
60 62
61 gfs2_init_glock_once(gl); 63 gfs2_init_glock_once(gl);
62 memset(mapping, 0, sizeof(*mapping)); 64 address_space_init_once(mapping);
63 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
64 spin_lock_init(&mapping->tree_lock);
65 spin_lock_init(&mapping->i_mmap_lock);
66 INIT_LIST_HEAD(&mapping->private_list);
67 spin_lock_init(&mapping->private_lock);
68 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
69 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
70} 65}
71 66
72/** 67/**
@@ -144,7 +139,7 @@ static int __init init_gfs2_fs(void)
144 139
145 error = -ENOMEM; 140 error = -ENOMEM;
146 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 141 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); 142 WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
148 if (!gfs_recovery_wq) 143 if (!gfs_recovery_wq)
149 goto fail_wq; 144 goto fail_wq;
150 145
@@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
198 unregister_filesystem(&gfs2meta_fs_type); 193 unregister_filesystem(&gfs2meta_fs_type);
199 destroy_workqueue(gfs_recovery_wq); 194 destroy_workqueue(gfs_recovery_wq);
200 195
196 rcu_barrier();
197
201 kmem_cache_destroy(gfs2_quotad_cachep); 198 kmem_cache_destroy(gfs2_quotad_cachep);
202 kmem_cache_destroy(gfs2_rgrpd_cachep); 199 kmem_cache_destroy(gfs2_rgrpd_cachep);
203 kmem_cache_destroy(gfs2_bufdata_cachep); 200 kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..675349b5a133 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = REQ_META | 39 int write_op = REQ_META |
40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
94const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
95 .writepage = gfs2_aspace_writepage, 95 .writepage = gfs2_aspace_writepage,
96 .releasepage = gfs2_releasepage, 96 .releasepage = gfs2_releasepage,
97 .sync_page = block_sync_page,
98}; 97};
99 98
100/** 99/**
@@ -326,6 +325,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
326 brelse(bh); 325 brelse(bh);
327 } 326 }
328 if (bd) { 327 if (bd) {
328 spin_lock(&sdp->sd_ail_lock);
329 if (bd->bd_ail) { 329 if (bd->bd_ail) {
330 gfs2_remove_from_ail(bd); 330 gfs2_remove_from_ail(bd);
331 bh->b_private = NULL; 331 bh->b_private = NULL;
@@ -333,6 +333,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
333 bd->bd_blkno = bh->b_blocknr; 333 bd->bd_blkno = bh->b_blocknr;
334 gfs2_trans_add_revoke(sdp, bd); 334 gfs2_trans_add_revoke(sdp, bd);
335 } 335 }
336 spin_unlock(&sdp->sd_ail_lock);
336 } 337 }
337 clear_buffer_dirty(bh); 338 clear_buffer_dirty(bh);
338 clear_buffer_uptodate(bh); 339 clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f79..42ef24355afb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
99 99
100 init_waitqueue_head(&sdp->sd_log_waitq); 100 init_waitqueue_head(&sdp->sd_log_waitq);
101 init_waitqueue_head(&sdp->sd_logd_waitq); 101 init_waitqueue_head(&sdp->sd_logd_waitq);
102 spin_lock_init(&sdp->sd_ail_lock);
102 INIT_LIST_HEAD(&sdp->sd_ail1_list); 103 INIT_LIST_HEAD(&sdp->sd_ail1_list);
103 INIT_LIST_HEAD(&sdp->sd_ail2_list); 104 INIT_LIST_HEAD(&sdp->sd_ail2_list);
104 105
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
928 { Opt_err, NULL }, 929 { Opt_err, NULL },
929}; 930};
930 931
931static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
932{
933 struct gfs2_sbd *sdp = gl->gl_sbd;
934 kmem_cache_free(cachep, gl);
935 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
936 wake_up(&sdp->sd_glock_wait);
937}
938
939static const struct lm_lockops nolock_ops = { 932static const struct lm_lockops nolock_ops = {
940 .lm_proto_name = "lock_nolock", 933 .lm_proto_name = "lock_nolock",
941 .lm_put_lock = nolock_put_lock, 934 .lm_put_lock = gfs2_glock_free,
942 .lm_tokens = &nolock_tokens, 935 .lm_tokens = &nolock_tokens,
943}; 936};
944 937
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20b..09e436a50723 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1026 1026
1027/** 1027/**
1028 * gfs2_permission - 1028 * gfs2_permission -
1029 * @inode: 1029 * @inode: The inode
1030 * @mask: 1030 * @mask: The mask to be tested
1031 * @nd: passed from Linux VFS, ignored by us 1031 * @flags: Indicates whether this is an RCU path walk or not
1032 * 1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the 1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only 1034 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1044 int error; 1044 int error;
1045 int unlock = 0; 1045 int unlock = 0;
1046 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049 1047
1050 ip = GFS2_I(inode); 1048 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
1054 return error; 1054 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963de..e23d9864c418 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
834 goto out_end_trans; 834 goto out_end_trans;
835 835
836 do_qc(qd, -qd->qd_change_sync); 836 do_qc(qd, -qd->qd_change_sync);
837 set_bit(QDF_REFRESH, &qd->qd_flags);
837 } 838 }
838 839
839 error = 0; 840 error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
929{ 930{
930 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 931 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
931 struct gfs2_alloc *al = ip->i_alloc; 932 struct gfs2_alloc *al = ip->i_alloc;
933 struct gfs2_quota_data *qd;
932 unsigned int x; 934 unsigned int x;
933 int error = 0; 935 int error = 0;
934 936
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
942 sort_qd, NULL); 944 sort_qd, NULL);
943 945
944 for (x = 0; x < al->al_qd_num; x++) { 946 for (x = 0; x < al->al_qd_num; x++) {
945 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); 947 int force = NO_FORCE;
948 qd = al->al_qd[x];
949 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
950 force = FORCE;
951 error = do_glock(qd, force, &al->al_qd_ghs[x]);
946 if (error) 952 if (error)
947 break; 953 break;
948 } 954 }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1587 1593
1588 offset = qd2offset(qd); 1594 offset = qd2offset(qd);
1589 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); 1595 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1596 if (gfs2_is_stuffed(ip))
1597 alloc_required = 1;
1590 if (alloc_required) { 1598 if (alloc_required) {
1591 al = gfs2_alloc_get(ip); 1599 al = gfs2_alloc_get(ip);
1592 if (al == NULL) 1600 if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1600 blocks += gfs2_rg_blocks(al); 1608 blocks += gfs2_rg_blocks(al);
1601 } 1609 }
1602 1610
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1611 /* Some quotas span block boundaries and can update two blocks,
1612 adding an extra block to the transaction to handle such quotas */
1613 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
1604 if (error) 1614 if (error)
1605 goto out_release; 1615 goto out_release;
1606 1616
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020c..cf930cd9664a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
1602 * 1602 *
1603 */ 1603 */
1604 1604
1605void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) 1605void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1606{ 1606{
1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1608 struct gfs2_rgrpd *rgd; 1608 struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1618
1619 gfs2_trans_add_rg(rgd); 1619 gfs2_trans_add_rg(rgd);
1620}
1620 1621
1622/**
1623 * gfs2_free_data - free a contiguous run of data block(s)
1624 * @ip: the inode these blocks are being freed from
1625 * @bstart: first block of a run of contiguous blocks
1626 * @blen: the length of the block run
1627 *
1628 */
1629
1630void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1631{
1632 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1633
1634 __gfs2_free_data(ip, bstart, blen);
1621 gfs2_statfs_change(sdp, 0, +blen, 0); 1635 gfs2_statfs_change(sdp, 0, +blen, 0);
1622 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1636 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1623} 1637}
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1630 * 1644 *
1631 */ 1645 */
1632 1646
1633void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) 1647void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1634{ 1648{
1635 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1649 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1636 struct gfs2_rgrpd *rgd; 1650 struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1645 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1659 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1646 1660
1647 gfs2_trans_add_rg(rgd); 1661 gfs2_trans_add_rg(rgd);
1662 gfs2_meta_wipe(ip, bstart, blen);
1663}
1648 1664
1665/**
1666 * gfs2_free_meta - free a contiguous run of data block(s)
1667 * @ip: the inode these blocks are being freed from
1668 * @bstart: first block of a run of contiguous blocks
1669 * @blen: the length of the block run
1670 *
1671 */
1672
1673void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1674{
1675 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1676
1677 __gfs2_free_meta(ip, bstart, blen);
1649 gfs2_statfs_change(sdp, 0, +blen, 0); 1678 gfs2_statfs_change(sdp, 0, +blen, 0);
1650 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1651 gfs2_meta_wipe(ip, bstart, blen);
1652} 1680}
1653 1681
1654void gfs2_unlink_di(struct inode *inode) 1682void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369c..a80e3034ac47 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 54
55extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
55extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 56extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
56extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 58extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 59extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
58extern void gfs2_unlink_di(struct inode *inode); 60extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
238} 238}
239 239
240/* 240/*
241 * hfs_unlink() 241 * hfs_remove()
242 * 242 *
243 * This is the unlink() entry in the inode_operations structure for 243 * This serves as both unlink() and rmdir() in the inode_operations
244 * regular HFS directories. The purpose is to delete an existing 244 * structure for regular HFS directories. The purpose is to delete
245 * file, given the inode for the parent directory and the name 245 * an existing child, given the inode for the parent directory and
246 * (and its length) of the existing file. 246 * the name (and its length) of the existing directory.
247 */
248static int hfs_unlink(struct inode *dir, struct dentry *dentry)
249{
250 struct inode *inode;
251 int res;
252
253 inode = dentry->d_inode;
254 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
255 if (res)
256 return res;
257
258 drop_nlink(inode);
259 hfs_delete_inode(inode);
260 inode->i_ctime = CURRENT_TIME_SEC;
261 mark_inode_dirty(inode);
262
263 return res;
264}
265
266/*
267 * hfs_rmdir()
268 * 247 *
269 * This is the rmdir() entry in the inode_operations structure for 248 * HFS does not have hardlinks, so both rmdir and unlink set the
270 * regular HFS directories. The purpose is to delete an existing 249 * link count to 0. The only difference is the emptiness check.
271 * directory, given the inode for the parent directory and the name
272 * (and its length) of the existing directory.
273 */ 250 */
274static int hfs_rmdir(struct inode *dir, struct dentry *dentry) 251static int hfs_remove(struct inode *dir, struct dentry *dentry)
275{ 252{
276 struct inode *inode; 253 struct inode *inode = dentry->d_inode;
277 int res; 254 int res;
278 255
279 inode = dentry->d_inode; 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
280 if (inode->i_size != 2)
281 return -ENOTEMPTY; 257 return -ENOTEMPTY;
282 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
283 if (res) 259 if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
307 283
308 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
309 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
310 res = hfs_unlink(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
311 if (res) 287 if (res)
312 return res; 288 return res;
313 } 289 }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
332const struct inode_operations hfs_dir_inode_operations = { 308const struct inode_operations hfs_dir_inode_operations = {
333 .create = hfs_create, 309 .create = hfs_create,
334 .lookup = hfs_lookup, 310 .lookup = hfs_lookup,
335 .unlink = hfs_unlink, 311 .unlink = hfs_remove,
336 .mkdir = hfs_mkdir, 312 .mkdir = hfs_mkdir,
337 .rmdir = hfs_rmdir, 313 .rmdir = hfs_remove,
338 .rename = hfs_rename, 314 .rename = hfs_rename,
339 .setattr = hfs_inode_setattr, 315 .setattr = hfs_inode_setattr,
340}; 316};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
150const struct address_space_operations hfs_btree_aops = { 150const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .sync_page = block_sync_page,
154 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
155 .write_end = generic_write_end, 154 .write_end = generic_write_end,
156 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
160const struct address_space_operations hfs_aops = { 159const struct address_space_operations hfs_aops = {
161 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
162 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
163 .sync_page = block_sync_page,
164 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
165 .write_end = generic_write_end, 163 .write_end = generic_write_end,
166 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
397 u32 start, len, goal; 397 u32 start, len, goal;
398 int res; 398 int res;
399 399
400 if (sbi->total_blocks - sbi->free_blocks + 8 > 400 if (sbi->alloc_file->i_size * 8 <
401 sbi->alloc_file->i_size * 8) { 401 sbi->total_blocks - sbi->free_blocks + 8) {
402 /* extend alloc file */ 402 /* extend alloc file */
403 printk(KERN_ERR "hfs: extend alloc file! " 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n", 404 "(%llu,%u,%u)\n",
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
146const struct address_space_operations hfsplus_btree_aops = { 146const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .sync_page = block_sync_page,
150 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
151 .write_end = generic_write_end, 150 .write_end = generic_write_end,
152 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
156const struct address_space_operations hfsplus_aops = { 155const struct address_space_operations hfsplus_aops = {
157 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
158 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
159 .sync_page = block_sync_page,
160 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
161 .write_end = generic_write_end, 159 .write_end = generic_write_end,
162 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 508ce662ce12..fbaa6690c8e0 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
47 if (err) 47 if (err)
48 goto out; 48 goto out;
49 49
50 if (!is_owner_or_cap(inode)) { 50 if (!inode_owner_or_capable(inode)) {
51 err = -EACCES; 51 err = -EACCES;
52 goto out_drop_write; 52 goto out_drop_write;
53 } 53 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ); 135 data, READ);
136 if (res) 136 if (res)
137 return res; 137 goto out;
138 138
139 switch (be16_to_cpu(*((__be16 *)data))) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
140 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
147 res = -ENOENT; 147 res = -ENOENT;
148 break; 148 break;
149 } 149 }
150 150out:
151 kfree(data); 151 kfree(data);
152 return res; 152 return res;
153} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
338 struct inode *root, *inode; 338 struct inode *root, *inode;
339 struct qstr str; 339 struct qstr str;
340 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
341 int err = -EINVAL; 341 int err;
342 342
343 err = -EINVAL;
343 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
344 if (!sbi) 345 if (!sbi)
345 return -ENOMEM; 346 goto out;
346 347
347 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
348 mutex_init(&sbi->alloc_mutex); 349 mutex_init(&sbi->alloc_mutex);
349 mutex_init(&sbi->vh_mutex); 350 mutex_init(&sbi->vh_mutex);
350 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
351 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
352 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
353 err = -EINVAL; 356 goto out_unload_nls;
354 goto cleanup;
355 } 357 }
356 358
357 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
359 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
360 if (!sbi->nls) { 362 if (!sbi->nls) {
361 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
362 err = -EINVAL; 364 goto out_unload_nls;
363 goto cleanup;
364 } 365 }
365 366
366 /* Grab the volume header */ 367 /* Grab the volume header */
367 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
368 if (!silent) 369 if (!silent)
369 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
370 err = -EINVAL; 371 goto out_unload_nls;
371 goto cleanup;
372 } 372 }
373 vhdr = sbi->s_vhdr; 373 vhdr = sbi->s_vhdr;
374 374
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
379 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
380 goto cleanup; 380 goto out_free_vhdr;
381 } 381 }
382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
422 if (!sbi->ext_tree) { 422 if (!sbi->ext_tree) {
423 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
424 goto cleanup; 424 goto out_free_vhdr;
425 } 425 }
426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
427 if (!sbi->cat_tree) { 427 if (!sbi->cat_tree) {
428 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
429 goto cleanup; 429 goto out_close_ext_tree;
430 } 430 }
431 431
432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
433 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
434 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
435 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
436 goto cleanup; 436 goto out_close_cat_tree;
437 } 437 }
438 sbi->alloc_file = inode; 438 sbi->alloc_file = inode;
439 439
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
442 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
443 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto out_put_alloc_file;
446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
448 sb->s_root = d_alloc_root(root);
449 if (!sb->s_root) {
450 iput(root);
451 err = -ENOMEM;
452 goto cleanup;
453 } 446 }
454 447
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
459 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
460 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
461 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
462 goto cleanup; 455 goto out_put_root;
463 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
464 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
465 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
466 goto cleanup; 459 goto out_put_root;
467 } 460 }
468 sbi->hidden_dir = inode; 461 sbi->hidden_dir = inode;
469 } else 462 } else
470 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
471 464
472 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
473 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
474 476
475 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
476 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
477 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
478 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
479 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
480 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
483 hfsplus_sync_fs(sb, 1); 485 HFSPLUS_I_CAT_DIRTY);
484 486 }
485 if (!sbi->hidden_dir) {
486 mutex_lock(&sbi->vh_mutex);
487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
489 &str, sbi->hidden_dir);
490 mutex_unlock(&sbi->vh_mutex);
491
492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
493 } 487 }
494out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
495 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
496 sbi->nls = nls; 497 sbi->nls = nls;
497 return 0; 498 return 0;
498 499
499cleanup: 500out_put_hidden_dir:
500 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
503 iput(sbi->alloc_file);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
501 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
502 return err; 518 return err;
503} 519}
504 520
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
167 break; 167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC): 168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) 169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out; 170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; 171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; 172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size; 173 part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
179 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
180 */ 180 */
181 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
182 goto out; 182 goto out_free_backup_vhdr;
183 goto reread; 183 goto reread;
184 } 184 }
185 185
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f5632318..0c39dc3ef7d7 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix 4 depends on BROKEN || !PREEMPT
5 help 5 help
6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
7 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f7..b3d7c0ddb609 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
6 * directory VFS functions 6 * directory VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
13static int hpfs_dir_release(struct inode *inode, struct file *filp) 12static int hpfs_dir_release(struct inode *inode, struct file *filp)
14{ 13{
15 lock_kernel(); 14 hpfs_lock(inode->i_sb);
16 hpfs_del_pos(inode, &filp->f_pos); 15 hpfs_del_pos(inode, &filp->f_pos);
17 /*hpfs_write_if_changed(inode);*/ 16 /*hpfs_write_if_changed(inode);*/
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
30 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 29 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
31 struct super_block *s = i->i_sb; 30 struct super_block *s = i->i_sb;
32 31
33 lock_kernel(); 32 hpfs_lock(s);
34 33
35 /*printk("dir lseek\n");*/ 34 /*printk("dir lseek\n");*/
36 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 35 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
43 } 42 }
44 mutex_unlock(&i->i_mutex); 43 mutex_unlock(&i->i_mutex);
45ok: 44ok:
46 unlock_kernel(); 45 hpfs_unlock(s);
47 return filp->f_pos = new_off; 46 return filp->f_pos = new_off;
48fail: 47fail:
49 mutex_unlock(&i->i_mutex); 48 mutex_unlock(&i->i_mutex);
50 /*printk("illegal lseek: %016llx\n", new_off);*/ 49 /*printk("illegal lseek: %016llx\n", new_off);*/
51 unlock_kernel(); 50 hpfs_unlock(s);
52 return -ESPIPE; 51 return -ESPIPE;
53} 52}
54 53
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
64 int c1, c2 = 0; 63 int c1, c2 = 0;
65 int ret = 0; 64 int ret = 0;
66 65
67 lock_kernel(); 66 hpfs_lock(inode->i_sb);
68 67
69 if (hpfs_sb(inode->i_sb)->sb_chk) { 68 if (hpfs_sb(inode->i_sb)->sb_chk) {
70 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { 69 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
167 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
168 } 167 }
169out: 168out:
170 unlock_kernel(); 169 hpfs_unlock(inode->i_sb);
171 return ret; 170 return ret;
172} 171}
173 172
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct inode *result = NULL; 196 struct inode *result = NULL;
198 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
199 198
200 lock_kernel(); 199 hpfs_lock(dir->i_sb);
201 if ((err = hpfs_chk_name(name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
202 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
203 unlock_kernel(); 202 hpfs_unlock(dir->i_sb);
204 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
205 } 204 }
206 goto end_add; 205 goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 297
299 end: 298 end:
300 end_add: 299 end_add:
301 unlock_kernel(); 300 hpfs_unlock(dir->i_sb);
302 d_add(dentry, result); 301 d_add(dentry, result);
303 return NULL; 302 return NULL;
304 303
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
311 310
312 /*bail:*/ 311 /*bail:*/
313 312
314 unlock_kernel(); 313 hpfs_unlock(dir->i_sb);
315 return ERR_PTR(-ENOENT); 314 return ERR_PTR(-ENOENT);
316} 315}
317 316
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..9b9eb6933e43 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
6 * file VFS functions 6 * file VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12#define BLOCKS(size) (((size) + 511) >> 9) 11#define BLOCKS(size) (((size) + 511) >> 9)
13 12
14static int hpfs_file_release(struct inode *inode, struct file *file) 13static int hpfs_file_release(struct inode *inode, struct file *file)
15{ 14{
16 lock_kernel(); 15 hpfs_lock(inode->i_sb);
17 hpfs_write_if_changed(inode); 16 hpfs_write_if_changed(inode);
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
49static void hpfs_truncate(struct inode *i) 48static void hpfs_truncate(struct inode *i)
50{ 49{
51 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 50 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
52 lock_kernel(); 51 hpfs_lock(i->i_sb);
53 hpfs_i(i)->i_n_secs = 0; 52 hpfs_i(i)->i_n_secs = 0;
54 i->i_blocks = 1 + ((i->i_size + 511) >> 9); 53 i->i_blocks = 1 + ((i->i_size + 511) >> 9);
55 hpfs_i(i)->mmu_private = i->i_size; 54 hpfs_i(i)->mmu_private = i->i_size;
56 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); 55 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
57 hpfs_write_inode(i); 56 hpfs_write_inode(i);
58 hpfs_i(i)->i_n_secs = 0; 57 hpfs_i(i)->i_n_secs = 0;
59 unlock_kernel(); 58 hpfs_unlock(i->i_sb);
60} 59}
61 60
62static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 61static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
@@ -120,7 +119,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
120const struct address_space_operations hpfs_aops = { 119const struct address_space_operations hpfs_aops = {
121 .readpage = hpfs_readpage, 120 .readpage = hpfs_readpage,
122 .writepage = hpfs_writepage, 121 .writepage = hpfs_writepage,
123 .sync_page = block_sync_page,
124 .write_begin = hpfs_write_begin, 122 .write_begin = hpfs_write_begin,
125 .write_end = generic_write_end, 123 .write_end = generic_write_end,
126 .bmap = _hpfs_bmap 124 .bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e8..c15adbca07ff 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
342 extern struct timezone sys_tz; 342 extern struct timezone sys_tz;
343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; 343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
344} 344}
345
346/*
347 * Locking:
348 *
349 * hpfs_lock() is a leftover from the big kernel lock.
350 * Right now, these functions are empty and only left
351 * for documentation purposes. The file system no longer
352 * works on SMP systems, so the lock is not needed
353 * any more.
354 *
355 * If someone is interested in making it work again, this
356 * would be the place to start by adding a per-superblock
357 * mutex and fixing all the bugs and performance issues
358 * caused by that.
359 */
360static inline void hpfs_lock(struct super_block *s)
361{
362}
363
364static inline void hpfs_unlock(struct super_block *s)
365{
366}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539e..87f1f787e767 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
6 * inode VFS functions 6 * inode VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
267 struct inode *inode = dentry->d_inode; 266 struct inode *inode = dentry->d_inode;
268 int error = -EINVAL; 267 int error = -EINVAL;
269 268
270 lock_kernel(); 269 hpfs_lock(inode->i_sb);
271 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 270 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
272 goto out_unlock; 271 goto out_unlock;
273 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 272 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
290 hpfs_write_inode(inode); 289 hpfs_write_inode(inode);
291 290
292 out_unlock: 291 out_unlock:
293 unlock_kernel(); 292 hpfs_unlock(inode->i_sb);
294 return error; 293 return error;
295} 294}
296 295
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
307 truncate_inode_pages(&inode->i_data, 0); 306 truncate_inode_pages(&inode->i_data, 0);
308 end_writeback(inode); 307 end_writeback(inode);
309 if (!inode->i_nlink) { 308 if (!inode->i_nlink) {
310 lock_kernel(); 309 hpfs_lock(inode->i_sb);
311 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 310 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel(); 311 hpfs_unlock(inode->i_sb);
313 } 312 }
314} 313}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc4..d5f8c8a19023 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
6 * adding & removing files & directories 6 * adding & removing files & directories
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 11static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
25 struct hpfs_dirent dee; 24 struct hpfs_dirent dee;
26 int err; 25 int err;
27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 26 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 27 hpfs_lock(dir->i_sb);
29 err = -ENOSPC; 28 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 29 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
31 if (!fnode) 30 if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
103 } 102 }
104 d_instantiate(dentry, result); 103 d_instantiate(dentry, result);
105 mutex_unlock(&hpfs_i(dir)->i_mutex); 104 mutex_unlock(&hpfs_i(dir)->i_mutex);
106 unlock_kernel(); 105 hpfs_unlock(dir->i_sb);
107 return 0; 106 return 0;
108bail3: 107bail3:
109 mutex_unlock(&hpfs_i(dir)->i_mutex); 108 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
115 brelse(bh); 114 brelse(bh);
116 hpfs_free_sectors(dir->i_sb, fno, 1); 115 hpfs_free_sectors(dir->i_sb, fno, 1);
117bail: 116bail:
118 unlock_kernel(); 117 hpfs_unlock(dir->i_sb);
119 return err; 118 return err;
120} 119}
121 120
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
132 int err; 131 int err;
133 if ((err = hpfs_chk_name(name, &len))) 132 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 133 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 134 hpfs_lock(dir->i_sb);
136 err = -ENOSPC; 135 err = -ENOSPC;
137 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 136 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
138 if (!fnode) 137 if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
195 } 194 }
196 d_instantiate(dentry, result); 195 d_instantiate(dentry, result);
197 mutex_unlock(&hpfs_i(dir)->i_mutex); 196 mutex_unlock(&hpfs_i(dir)->i_mutex);
198 unlock_kernel(); 197 hpfs_unlock(dir->i_sb);
199 return 0; 198 return 0;
200 199
201bail2: 200bail2:
@@ -205,7 +204,7 @@ bail1:
205 brelse(bh); 204 brelse(bh);
206 hpfs_free_sectors(dir->i_sb, fno, 1); 205 hpfs_free_sectors(dir->i_sb, fno, 1);
207bail: 206bail:
208 unlock_kernel(); 207 hpfs_unlock(dir->i_sb);
209 return err; 208 return err;
210} 209}
211 210
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 223 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 224 if (!new_valid_dev(rdev))
226 return -EINVAL; 225 return -EINVAL;
227 lock_kernel(); 226 hpfs_lock(dir->i_sb);
228 err = -ENOSPC; 227 err = -ENOSPC;
229 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 228 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
230 if (!fnode) 229 if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
274 d_instantiate(dentry, result); 273 d_instantiate(dentry, result);
275 mutex_unlock(&hpfs_i(dir)->i_mutex); 274 mutex_unlock(&hpfs_i(dir)->i_mutex);
276 brelse(bh); 275 brelse(bh);
277 unlock_kernel(); 276 hpfs_unlock(dir->i_sb);
278 return 0; 277 return 0;
279bail2: 278bail2:
280 mutex_unlock(&hpfs_i(dir)->i_mutex); 279 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
283 brelse(bh); 282 brelse(bh);
284 hpfs_free_sectors(dir->i_sb, fno, 1); 283 hpfs_free_sectors(dir->i_sb, fno, 1);
285bail: 284bail:
286 unlock_kernel(); 285 hpfs_unlock(dir->i_sb);
287 return err; 286 return err;
288} 287}
289 288
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
299 struct inode *result; 298 struct inode *result;
300 int err; 299 int err;
301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 300 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 301 hpfs_lock(dir->i_sb);
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 302 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 303 hpfs_unlock(dir->i_sb);
305 return -EPERM; 304 return -EPERM;
306 } 305 }
307 err = -ENOSPC; 306 err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
354 hpfs_write_inode_nolock(result); 353 hpfs_write_inode_nolock(result);
355 d_instantiate(dentry, result); 354 d_instantiate(dentry, result);
356 mutex_unlock(&hpfs_i(dir)->i_mutex); 355 mutex_unlock(&hpfs_i(dir)->i_mutex);
357 unlock_kernel(); 356 hpfs_unlock(dir->i_sb);
358 return 0; 357 return 0;
359bail2: 358bail2:
360 mutex_unlock(&hpfs_i(dir)->i_mutex); 359 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
363 brelse(bh); 362 brelse(bh);
364 hpfs_free_sectors(dir->i_sb, fno, 1); 363 hpfs_free_sectors(dir->i_sb, fno, 1);
365bail: 364bail:
366 unlock_kernel(); 365 hpfs_unlock(dir->i_sb);
367 return err; 366 return err;
368} 367}
369 368
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
380 int rep = 0; 379 int rep = 0;
381 int err; 380 int err;
382 381
383 lock_kernel(); 382 hpfs_lock(dir->i_sb);
384 hpfs_adjust_length(name, &len); 383 hpfs_adjust_length(name, &len);
385again: 384again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 385 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
416 dentry_unhash(dentry); 415 dentry_unhash(dentry);
417 if (!d_unhashed(dentry)) { 416 if (!d_unhashed(dentry)) {
418 dput(dentry); 417 dput(dentry);
419 unlock_kernel(); 418 hpfs_unlock(dir->i_sb);
420 return -ENOSPC; 419 return -ENOSPC;
421 } 420 }
422 if (generic_permission(inode, MAY_WRITE, 0, NULL) || 421 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
435 if (!err) 434 if (!err)
436 goto again; 435 goto again;
437 } 436 }
438 unlock_kernel(); 437 hpfs_unlock(dir->i_sb);
439 return -ENOSPC; 438 return -ENOSPC;
440 default: 439 default:
441 drop_nlink(inode); 440 drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
448out: 447out:
449 mutex_unlock(&hpfs_i(dir)->i_mutex); 448 mutex_unlock(&hpfs_i(dir)->i_mutex);
450 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 449 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
451 unlock_kernel(); 450 hpfs_unlock(dir->i_sb);
452 return err; 451 return err;
453} 452}
454 453
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
466 int r; 465 int r;
467 466
468 hpfs_adjust_length(name, &len); 467 hpfs_adjust_length(name, &len);
469 lock_kernel(); 468 hpfs_lock(dir->i_sb);
470 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 469 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
471 mutex_lock(&hpfs_i(dir)->i_mutex); 470 mutex_lock(&hpfs_i(dir)->i_mutex);
472 err = -ENOENT; 471 err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
508out: 507out:
509 mutex_unlock(&hpfs_i(dir)->i_mutex); 508 mutex_unlock(&hpfs_i(dir)->i_mutex);
510 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 509 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
511 unlock_kernel(); 510 hpfs_unlock(dir->i_sb);
512 return err; 511 return err;
513} 512}
514 513
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
521 int err; 520 int err;
522 521
523 err = -EIO; 522 err = -EIO;
524 lock_kernel(); 523 hpfs_lock(i->i_sb);
525 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) 524 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
526 goto fail; 525 goto fail;
527 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); 526 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
528 brelse(bh); 527 brelse(bh);
529 if (err) 528 if (err)
530 goto fail; 529 goto fail;
531 unlock_kernel(); 530 hpfs_unlock(i->i_sb);
532 SetPageUptodate(page); 531 SetPageUptodate(page);
533 kunmap(page); 532 kunmap(page);
534 unlock_page(page); 533 unlock_page(page);
535 return 0; 534 return 0;
536 535
537fail: 536fail:
538 unlock_kernel(); 537 hpfs_unlock(i->i_sb);
539 SetPageError(page); 538 SetPageError(page);
540 kunmap(page); 539 kunmap(page);
541 unlock_page(page); 540 unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
567 err = 0; 566 err = 0;
568 hpfs_adjust_length(old_name, &old_len); 567 hpfs_adjust_length(old_name, &old_len);
569 568
570 lock_kernel(); 569 hpfs_lock(i->i_sb);
571 /* order doesn't matter, due to VFS exclusion */ 570 /* order doesn't matter, due to VFS exclusion */
572 mutex_lock(&hpfs_i(i)->i_parent_mutex); 571 mutex_lock(&hpfs_i(i)->i_parent_mutex);
573 if (new_inode) 572 if (new_inode)
@@ -659,7 +658,7 @@ end1:
659 mutex_unlock(&hpfs_i(i)->i_parent_mutex); 658 mutex_unlock(&hpfs_i(i)->i_parent_mutex);
660 if (new_inode) 659 if (new_inode)
661 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); 660 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
662 unlock_kernel(); 661 hpfs_unlock(i->i_sb);
663 return err; 662 return err;
664} 663}
665 664
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc97..c89b40808587 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
19 18
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
103{ 102{
104 struct hpfs_sb_info *sbi = hpfs_sb(s); 103 struct hpfs_sb_info *sbi = hpfs_sb(s);
105 104
106 lock_kernel();
107
108 kfree(sbi->sb_cp_table); 105 kfree(sbi->sb_cp_table);
109 kfree(sbi->sb_bmp_dir); 106 kfree(sbi->sb_bmp_dir);
110 unmark_dirty(s); 107 unmark_dirty(s);
111 s->s_fs_info = NULL; 108 s->s_fs_info = NULL;
112 kfree(sbi); 109 kfree(sbi);
113
114 unlock_kernel();
115} 110}
116 111
117unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 112unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
143 struct super_block *s = dentry->d_sb; 138 struct super_block *s = dentry->d_sb;
144 struct hpfs_sb_info *sbi = hpfs_sb(s); 139 struct hpfs_sb_info *sbi = hpfs_sb(s);
145 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 140 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
146 lock_kernel(); 141 hpfs_lock(s);
147 142
148 /*if (sbi->sb_n_free == -1) {*/ 143 /*if (sbi->sb_n_free == -1) {*/
149 sbi->sb_n_free = count_bitmaps(s); 144 sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 buf->f_fsid.val[1] = (u32)(id >> 32); 155 buf->f_fsid.val[1] = (u32)(id >> 32);
161 buf->f_namelen = 254; 156 buf->f_namelen = 254;
162 157
163 unlock_kernel(); 158 hpfs_unlock(s);
164 159
165 return 0; 160 return 0;
166} 161}
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
406 401
407 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
408 403
409 lock_kernel(); 404 hpfs_lock(s);
410 lock_super(s); 405 lock_super(s);
411 uid = sbi->sb_uid; gid = sbi->sb_gid; 406 uid = sbi->sb_uid; gid = sbi->sb_gid;
412 umask = 0777 & ~sbi->sb_mode; 407 umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
441 replace_mount_options(s, new_opts); 436 replace_mount_options(s, new_opts);
442 437
443 unlock_super(s); 438 unlock_super(s);
444 unlock_kernel(); 439 hpfs_unlock(s);
445 return 0; 440 return 0;
446 441
447out_err: 442out_err:
448 unlock_super(s); 443 unlock_super(s);
449 unlock_kernel(); 444 hpfs_unlock(s);
450 kfree(new_opts); 445 kfree(new_opts);
451 return -EINVAL; 446 return -EINVAL;
452} 447}
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
484 479
485 int o; 480 int o;
486 481
487 lock_kernel(); 482 if (num_possible_cpus() > 1) {
483 printk(KERN_ERR "HPFS is not SMP safe\n");
484 return -EINVAL;
485 }
488 486
489 save_mount_options(s, options); 487 save_mount_options(s, options);
490 488
491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 489 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
492 if (!sbi) { 490 if (!sbi) {
493 unlock_kernel();
494 return -ENOMEM; 491 return -ENOMEM;
495 } 492 }
496 s->s_fs_info = sbi; 493 s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
677 root->i_blocks = 5; 674 root->i_blocks = 5;
678 hpfs_brelse4(&qbh); 675 hpfs_brelse4(&qbh);
679 } 676 }
680 unlock_kernel();
681 return 0; 677 return 0;
682 678
683bail4: brelse(bh2); 679bail4: brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
689 kfree(sbi->sb_cp_table); 685 kfree(sbi->sb_cp_table);
690 s->s_fs_info = NULL; 686 s->s_fs_info = NULL;
691 kfree(sbi); 687 kfree(sbi);
692 unlock_kernel();
693 return -EINVAL; 688 return -EINVAL;
694} 689}
695 690
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9885082b470f..b9eeb1cd03ff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -332,8 +332,7 @@ static void truncate_huge_page(struct page *page)
332{ 332{
333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
334 ClearPageUptodate(page); 334 ClearPageUptodate(page);
335 remove_from_page_cache(page); 335 delete_from_page_cache(page);
336 put_page(page);
337} 336}
338 337
339static void truncate_hugepages(struct inode *inode, loff_t lstart) 338static void truncate_hugepages(struct inode *inode, loff_t lstart)
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..05a1f75ae791 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,39 @@
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/cred.h>
29#include "internal.h"
30
31/*
32 * inode locking rules.
33 *
34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget()
36 * inode_lru_lock protects:
37 * inode_lru, inode->i_lru
38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash
44 *
45 * Lock ordering:
46 *
47 * inode_sb_list_lock
48 * inode->i_lock
49 * inode_lru_lock
50 *
51 * inode_wb_list_lock
52 * inode->i_lock
53 *
54 * inode_hash_lock
55 * inode_sb_list_lock
56 * inode->i_lock
57 *
58 * iunique_lock
59 * inode_hash_lock
60 */
28 61
29/* 62/*
30 * This is needed for the following functions: 63 * This is needed for the following functions:
@@ -59,6 +92,8 @@
59 92
60static unsigned int i_hash_mask __read_mostly; 93static unsigned int i_hash_mask __read_mostly;
61static unsigned int i_hash_shift __read_mostly; 94static unsigned int i_hash_shift __read_mostly;
95static struct hlist_head *inode_hashtable __read_mostly;
96static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
62 97
63/* 98/*
64 * Each inode can be on two separate lists. One is 99 * Each inode can be on two separate lists. One is
@@ -73,27 +108,19 @@ static unsigned int i_hash_shift __read_mostly;
73 */ 108 */
74 109
75static LIST_HEAD(inode_lru); 110static LIST_HEAD(inode_lru);
76static struct hlist_head *inode_hashtable __read_mostly; 111static DEFINE_SPINLOCK(inode_lru_lock);
77 112
78/* 113__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
79 * A simple spinlock to protect the list manipulations. 114__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
80 *
81 * NOTE! You also have to own the lock if you change
82 * the i_state of an inode while it is in use..
83 */
84DEFINE_SPINLOCK(inode_lock);
85 115
86/* 116/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 117 * iprune_sem provides exclusion between the icache shrinking and the
88 * icache shrinking path, and the umount path. Without this exclusion, 118 * umount path.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 * 119 *
94 * We make this an rwsem because the fastpath is icache shrinking. In 120 * We don't actually need it to protect anything in the umount path,
95 * some cases a filesystem may be doing a significant amount of work in 121 * but only need to cycle through it to make sure any inode that
96 * its inode reclaim code, so this should improve parallelism. 122 * prune_icache took off the LRU list has been fully torn down by the
123 * time we are past evict_inodes.
97 */ 124 */
98static DECLARE_RWSEM(iprune_sem); 125static DECLARE_RWSEM(iprune_sem);
99 126
@@ -139,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write,
139} 166}
140#endif 167#endif
141 168
142static void wake_up_inode(struct inode *inode)
143{
144 /*
145 * Prevent speculative execution through spin_unlock(&inode_lock);
146 */
147 smp_mb();
148 wake_up_bit(&inode->i_state, __I_NEW);
149}
150
151/** 169/**
152 * inode_init_always - perform inode structure intialisation 170 * inode_init_always - perform inode structure intialisation
153 * @sb: superblock inode belongs to 171 * @sb: superblock inode belongs to
@@ -295,6 +313,20 @@ static void destroy_inode(struct inode *inode)
295 call_rcu(&inode->i_rcu, i_callback); 313 call_rcu(&inode->i_rcu, i_callback);
296} 314}
297 315
316void address_space_init_once(struct address_space *mapping)
317{
318 memset(mapping, 0, sizeof(*mapping));
319 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
320 spin_lock_init(&mapping->tree_lock);
321 spin_lock_init(&mapping->i_mmap_lock);
322 INIT_LIST_HEAD(&mapping->private_list);
323 spin_lock_init(&mapping->private_lock);
324 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
325 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
326 mutex_init(&mapping->unmap_mutex);
327}
328EXPORT_SYMBOL(address_space_init_once);
329
298/* 330/*
299 * These are initializations that only need to be done 331 * These are initializations that only need to be done
300 * once, because the fields are idempotent across use 332 * once, because the fields are idempotent across use
@@ -308,13 +340,7 @@ void inode_init_once(struct inode *inode)
308 INIT_LIST_HEAD(&inode->i_devices); 340 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list); 341 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru); 342 INIT_LIST_HEAD(&inode->i_lru);
311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 343 address_space_init_once(&inode->i_data);
312 spin_lock_init(&inode->i_data.tree_lock);
313 spin_lock_init(&inode->i_data.i_mmap_lock);
314 INIT_LIST_HEAD(&inode->i_data.private_list);
315 spin_lock_init(&inode->i_data.private_lock);
316 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
317 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
318 i_size_ordered_init(inode); 344 i_size_ordered_init(inode);
319#ifdef CONFIG_FSNOTIFY 345#ifdef CONFIG_FSNOTIFY
320 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 346 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -330,7 +356,7 @@ static void init_once(void *foo)
330} 356}
331 357
332/* 358/*
333 * inode_lock must be held 359 * inode->i_lock must be held
334 */ 360 */
335void __iget(struct inode *inode) 361void __iget(struct inode *inode)
336{ 362{
@@ -348,23 +374,22 @@ EXPORT_SYMBOL(ihold);
348 374
349static void inode_lru_list_add(struct inode *inode) 375static void inode_lru_list_add(struct inode *inode)
350{ 376{
377 spin_lock(&inode_lru_lock);
351 if (list_empty(&inode->i_lru)) { 378 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru); 379 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++; 380 inodes_stat.nr_unused++;
354 } 381 }
382 spin_unlock(&inode_lru_lock);
355} 383}
356 384
357static void inode_lru_list_del(struct inode *inode) 385static void inode_lru_list_del(struct inode *inode)
358{ 386{
387 spin_lock(&inode_lru_lock);
359 if (!list_empty(&inode->i_lru)) { 388 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru); 389 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--; 390 inodes_stat.nr_unused--;
362 } 391 }
363} 392 spin_unlock(&inode_lru_lock);
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368} 393}
369 394
370/** 395/**
@@ -373,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
373 */ 398 */
374void inode_sb_list_add(struct inode *inode) 399void inode_sb_list_add(struct inode *inode)
375{ 400{
376 spin_lock(&inode_lock); 401 spin_lock(&inode_sb_list_lock);
377 __inode_sb_list_add(inode); 402 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
378 spin_unlock(&inode_lock); 403 spin_unlock(&inode_sb_list_lock);
379} 404}
380EXPORT_SYMBOL_GPL(inode_sb_list_add); 405EXPORT_SYMBOL_GPL(inode_sb_list_add);
381 406
382static inline void __inode_sb_list_del(struct inode *inode) 407static inline void inode_sb_list_del(struct inode *inode)
383{ 408{
409 spin_lock(&inode_sb_list_lock);
384 list_del_init(&inode->i_sb_list); 410 list_del_init(&inode->i_sb_list);
411 spin_unlock(&inode_sb_list_lock);
385} 412}
386 413
387static unsigned long hash(struct super_block *sb, unsigned long hashval) 414static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -406,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{ 433{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 434 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408 435
409 spin_lock(&inode_lock); 436 spin_lock(&inode_hash_lock);
437 spin_lock(&inode->i_lock);
410 hlist_add_head(&inode->i_hash, b); 438 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock); 439 spin_unlock(&inode->i_lock);
440 spin_unlock(&inode_hash_lock);
412} 441}
413EXPORT_SYMBOL(__insert_inode_hash); 442EXPORT_SYMBOL(__insert_inode_hash);
414 443
415/** 444/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
425
426/**
427 * remove_inode_hash - remove an inode from the hash 445 * remove_inode_hash - remove an inode from the hash
428 * @inode: inode to unhash 446 * @inode: inode to unhash
429 * 447 *
@@ -431,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode)
431 */ 449 */
432void remove_inode_hash(struct inode *inode) 450void remove_inode_hash(struct inode *inode)
433{ 451{
434 spin_lock(&inode_lock); 452 spin_lock(&inode_hash_lock);
453 spin_lock(&inode->i_lock);
435 hlist_del_init(&inode->i_hash); 454 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock); 455 spin_unlock(&inode->i_lock);
456 spin_unlock(&inode_hash_lock);
437} 457}
438EXPORT_SYMBOL(remove_inode_hash); 458EXPORT_SYMBOL(remove_inode_hash);
439 459
@@ -450,10 +470,29 @@ void end_writeback(struct inode *inode)
450} 470}
451EXPORT_SYMBOL(end_writeback); 471EXPORT_SYMBOL(end_writeback);
452 472
473/*
474 * Free the inode passed in, removing it from the lists it is still connected
475 * to. We remove any pages still attached to the inode and wait for any IO that
476 * is still in progress before finally destroying the inode.
477 *
478 * An inode must already be marked I_FREEING so that we avoid the inode being
479 * moved back onto lists if we race with other code that manipulates the lists
480 * (e.g. writeback_single_inode). The caller is responsible for setting this.
481 *
482 * An inode must already be removed from the LRU list before being evicted from
483 * the cache. This should occur atomically with setting the I_FREEING state
484 * flag, so no inodes here should ever be on the LRU when being evicted.
485 */
453static void evict(struct inode *inode) 486static void evict(struct inode *inode)
454{ 487{
455 const struct super_operations *op = inode->i_sb->s_op; 488 const struct super_operations *op = inode->i_sb->s_op;
456 489
490 BUG_ON(!(inode->i_state & I_FREEING));
491 BUG_ON(!list_empty(&inode->i_lru));
492
493 inode_wb_list_del(inode);
494 inode_sb_list_del(inode);
495
457 if (op->evict_inode) { 496 if (op->evict_inode) {
458 op->evict_inode(inode); 497 op->evict_inode(inode);
459 } else { 498 } else {
@@ -465,6 +504,15 @@ static void evict(struct inode *inode)
465 bd_forget(inode); 504 bd_forget(inode);
466 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 505 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
467 cd_forget(inode); 506 cd_forget(inode);
507
508 remove_inode_hash(inode);
509
510 spin_lock(&inode->i_lock);
511 wake_up_bit(&inode->i_state, __I_NEW);
512 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
513 spin_unlock(&inode->i_lock);
514
515 destroy_inode(inode);
468} 516}
469 517
470/* 518/*
@@ -483,14 +531,6 @@ static void dispose_list(struct list_head *head)
483 list_del_init(&inode->i_lru); 531 list_del_init(&inode->i_lru);
484 532
485 evict(inode); 533 evict(inode);
486
487 spin_lock(&inode_lock);
488 __remove_inode_hash(inode);
489 __inode_sb_list_del(inode);
490 spin_unlock(&inode_lock);
491
492 wake_up_inode(inode);
493 destroy_inode(inode);
494 } 534 }
495} 535}
496 536
@@ -508,74 +548,77 @@ void evict_inodes(struct super_block *sb)
508 struct inode *inode, *next; 548 struct inode *inode, *next;
509 LIST_HEAD(dispose); 549 LIST_HEAD(dispose);
510 550
511 down_write(&iprune_sem); 551 spin_lock(&inode_sb_list_lock);
512
513 spin_lock(&inode_lock);
514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 552 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
515 if (atomic_read(&inode->i_count)) 553 if (atomic_read(&inode->i_count))
516 continue; 554 continue;
517 555
556 spin_lock(&inode->i_lock);
518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
519 WARN_ON(1); 558 spin_unlock(&inode->i_lock);
520 continue; 559 continue;
521 } 560 }
522 561
523 inode->i_state |= I_FREEING; 562 inode->i_state |= I_FREEING;
524 563 inode_lru_list_del(inode);
525 /* 564 spin_unlock(&inode->i_lock);
526 * Move the inode off the IO lists and LRU once I_FREEING is 565 list_add(&inode->i_lru, &dispose);
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
533 } 566 }
534 spin_unlock(&inode_lock); 567 spin_unlock(&inode_sb_list_lock);
535 568
536 dispose_list(&dispose); 569 dispose_list(&dispose);
570
571 /*
572 * Cycle through iprune_sem to make sure any inode that prune_icache
573 * moved off the list before we took the lock has been fully torn
574 * down.
575 */
576 down_write(&iprune_sem);
537 up_write(&iprune_sem); 577 up_write(&iprune_sem);
538} 578}
539 579
540/** 580/**
541 * invalidate_inodes - attempt to free all inodes on a superblock 581 * invalidate_inodes - attempt to free all inodes on a superblock
542 * @sb: superblock to operate on 582 * @sb: superblock to operate on
583 * @kill_dirty: flag to guide handling of dirty inodes
543 * 584 *
544 * Attempts to free all inodes for a given superblock. If there were any 585 * Attempts to free all inodes for a given superblock. If there were any
545 * busy inodes return a non-zero value, else zero. 586 * busy inodes return a non-zero value, else zero.
587 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
588 * them as busy.
546 */ 589 */
547int invalidate_inodes(struct super_block *sb) 590int invalidate_inodes(struct super_block *sb, bool kill_dirty)
548{ 591{
549 int busy = 0; 592 int busy = 0;
550 struct inode *inode, *next; 593 struct inode *inode, *next;
551 LIST_HEAD(dispose); 594 LIST_HEAD(dispose);
552 595
553 down_write(&iprune_sem); 596 spin_lock(&inode_sb_list_lock);
554
555 spin_lock(&inode_lock);
556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 597 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 598 spin_lock(&inode->i_lock);
599 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
600 spin_unlock(&inode->i_lock);
558 continue; 601 continue;
602 }
603 if (inode->i_state & I_DIRTY && !kill_dirty) {
604 spin_unlock(&inode->i_lock);
605 busy = 1;
606 continue;
607 }
559 if (atomic_read(&inode->i_count)) { 608 if (atomic_read(&inode->i_count)) {
609 spin_unlock(&inode->i_lock);
560 busy = 1; 610 busy = 1;
561 continue; 611 continue;
562 } 612 }
563 613
564 inode->i_state |= I_FREEING; 614 inode->i_state |= I_FREEING;
565 615 inode_lru_list_del(inode);
566 /* 616 spin_unlock(&inode->i_lock);
567 * Move the inode off the IO lists and LRU once I_FREEING is 617 list_add(&inode->i_lru, &dispose);
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 } 618 }
575 spin_unlock(&inode_lock); 619 spin_unlock(&inode_sb_list_lock);
576 620
577 dispose_list(&dispose); 621 dispose_list(&dispose);
578 up_write(&iprune_sem);
579 622
580 return busy; 623 return busy;
581} 624}
@@ -595,7 +638,7 @@ static int can_unuse(struct inode *inode)
595 638
596/* 639/*
597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 640 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
598 * temporary list and then are freed outside inode_lock by dispose_list(). 641 * temporary list and then are freed outside inode_lru_lock by dispose_list().
599 * 642 *
600 * Any inodes which are pinned purely because of attached pagecache have their 643 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 644 * pagecache removed. If the inode has metadata buffers attached to
@@ -616,7 +659,7 @@ static void prune_icache(int nr_to_scan)
616 unsigned long reap = 0; 659 unsigned long reap = 0;
617 660
618 down_read(&iprune_sem); 661 down_read(&iprune_sem);
619 spin_lock(&inode_lock); 662 spin_lock(&inode_lru_lock);
620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 663 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
621 struct inode *inode; 664 struct inode *inode;
622 665
@@ -626,53 +669,67 @@ static void prune_icache(int nr_to_scan)
626 inode = list_entry(inode_lru.prev, struct inode, i_lru); 669 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627 670
628 /* 671 /*
672 * we are inverting the inode_lru_lock/inode->i_lock here,
673 * so use a trylock. If we fail to get the lock, just move the
674 * inode to the back of the list so we don't spin on it.
675 */
676 if (!spin_trylock(&inode->i_lock)) {
677 list_move(&inode->i_lru, &inode_lru);
678 continue;
679 }
680
681 /*
629 * Referenced or dirty inodes are still in use. Give them 682 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we canot reclaim them now. 683 * another pass through the LRU as we canot reclaim them now.
631 */ 684 */
632 if (atomic_read(&inode->i_count) || 685 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) { 686 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru); 687 list_del_init(&inode->i_lru);
688 spin_unlock(&inode->i_lock);
635 inodes_stat.nr_unused--; 689 inodes_stat.nr_unused--;
636 continue; 690 continue;
637 } 691 }
638 692
639 /* recently referenced inodes get one more pass */ 693 /* recently referenced inodes get one more pass */
640 if (inode->i_state & I_REFERENCED) { 694 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED; 695 inode->i_state &= ~I_REFERENCED;
696 list_move(&inode->i_lru, &inode_lru);
697 spin_unlock(&inode->i_lock);
643 continue; 698 continue;
644 } 699 }
645 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 700 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
646 __iget(inode); 701 __iget(inode);
647 spin_unlock(&inode_lock); 702 spin_unlock(&inode->i_lock);
703 spin_unlock(&inode_lru_lock);
648 if (remove_inode_buffers(inode)) 704 if (remove_inode_buffers(inode))
649 reap += invalidate_mapping_pages(&inode->i_data, 705 reap += invalidate_mapping_pages(&inode->i_data,
650 0, -1); 706 0, -1);
651 iput(inode); 707 iput(inode);
652 spin_lock(&inode_lock); 708 spin_lock(&inode_lru_lock);
653 709
654 if (inode != list_entry(inode_lru.next, 710 if (inode != list_entry(inode_lru.next,
655 struct inode, i_lru)) 711 struct inode, i_lru))
656 continue; /* wrong inode or list_empty */ 712 continue; /* wrong inode or list_empty */
657 if (!can_unuse(inode)) 713 /* avoid lock inversions with trylock */
714 if (!spin_trylock(&inode->i_lock))
715 continue;
716 if (!can_unuse(inode)) {
717 spin_unlock(&inode->i_lock);
658 continue; 718 continue;
719 }
659 } 720 }
660 WARN_ON(inode->i_state & I_NEW); 721 WARN_ON(inode->i_state & I_NEW);
661 inode->i_state |= I_FREEING; 722 inode->i_state |= I_FREEING;
723 spin_unlock(&inode->i_lock);
662 724
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable); 725 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--; 726 inodes_stat.nr_unused--;
670 } 727 }
671 if (current_is_kswapd()) 728 if (current_is_kswapd())
672 __count_vm_events(KSWAPD_INODESTEAL, reap); 729 __count_vm_events(KSWAPD_INODESTEAL, reap);
673 else 730 else
674 __count_vm_events(PGINODESTEAL, reap); 731 __count_vm_events(PGINODESTEAL, reap);
675 spin_unlock(&inode_lock); 732 spin_unlock(&inode_lru_lock);
676 733
677 dispose_list(&freeable); 734 dispose_list(&freeable);
678 up_read(&iprune_sem); 735 up_read(&iprune_sem);
@@ -721,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb,
721 778
722repeat: 779repeat:
723 hlist_for_each_entry(inode, node, head, i_hash) { 780 hlist_for_each_entry(inode, node, head, i_hash) {
724 if (inode->i_sb != sb) 781 spin_lock(&inode->i_lock);
782 if (inode->i_sb != sb) {
783 spin_unlock(&inode->i_lock);
725 continue; 784 continue;
726 if (!test(inode, data)) 785 }
786 if (!test(inode, data)) {
787 spin_unlock(&inode->i_lock);
727 continue; 788 continue;
789 }
728 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 790 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
729 __wait_on_freeing_inode(inode); 791 __wait_on_freeing_inode(inode);
730 goto repeat; 792 goto repeat;
731 } 793 }
732 __iget(inode); 794 __iget(inode);
795 spin_unlock(&inode->i_lock);
733 return inode; 796 return inode;
734 } 797 }
735 return NULL; 798 return NULL;
@@ -747,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
747 810
748repeat: 811repeat:
749 hlist_for_each_entry(inode, node, head, i_hash) { 812 hlist_for_each_entry(inode, node, head, i_hash) {
750 if (inode->i_ino != ino) 813 spin_lock(&inode->i_lock);
814 if (inode->i_ino != ino) {
815 spin_unlock(&inode->i_lock);
751 continue; 816 continue;
752 if (inode->i_sb != sb) 817 }
818 if (inode->i_sb != sb) {
819 spin_unlock(&inode->i_lock);
753 continue; 820 continue;
821 }
754 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 822 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
755 __wait_on_freeing_inode(inode); 823 __wait_on_freeing_inode(inode);
756 goto repeat; 824 goto repeat;
757 } 825 }
758 __iget(inode); 826 __iget(inode);
827 spin_unlock(&inode->i_lock);
759 return inode; 828 return inode;
760 } 829 }
761 return NULL; 830 return NULL;
@@ -815,19 +884,26 @@ struct inode *new_inode(struct super_block *sb)
815{ 884{
816 struct inode *inode; 885 struct inode *inode;
817 886
818 spin_lock_prefetch(&inode_lock); 887 spin_lock_prefetch(&inode_sb_list_lock);
819 888
820 inode = alloc_inode(sb); 889 inode = alloc_inode(sb);
821 if (inode) { 890 if (inode) {
822 spin_lock(&inode_lock); 891 spin_lock(&inode->i_lock);
823 __inode_sb_list_add(inode);
824 inode->i_state = 0; 892 inode->i_state = 0;
825 spin_unlock(&inode_lock); 893 spin_unlock(&inode->i_lock);
894 inode_sb_list_add(inode);
826 } 895 }
827 return inode; 896 return inode;
828} 897}
829EXPORT_SYMBOL(new_inode); 898EXPORT_SYMBOL(new_inode);
830 899
900/**
901 * unlock_new_inode - clear the I_NEW state and wake up any waiters
902 * @inode: new inode to unlock
903 *
904 * Called when the inode is fully initialised to clear the new state of the
905 * inode and wake up anyone waiting for the inode to finish initialisation.
906 */
831void unlock_new_inode(struct inode *inode) 907void unlock_new_inode(struct inode *inode)
832{ 908{
833#ifdef CONFIG_DEBUG_LOCK_ALLOC 909#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -847,51 +923,67 @@ void unlock_new_inode(struct inode *inode)
847 } 923 }
848 } 924 }
849#endif 925#endif
850 /* 926 spin_lock(&inode->i_lock);
851 * This is special! We do not need the spinlock when clearing I_NEW,
852 * because we're guaranteed that nobody else tries to do anything about
853 * the state of the inode when it is locked, as we just created it (so
854 * there can be no old holders that haven't tested I_NEW).
855 * However we must emit the memory barrier so that other CPUs reliably
856 * see the clearing of I_NEW after the other inode initialisation has
857 * completed.
858 */
859 smp_mb();
860 WARN_ON(!(inode->i_state & I_NEW)); 927 WARN_ON(!(inode->i_state & I_NEW));
861 inode->i_state &= ~I_NEW; 928 inode->i_state &= ~I_NEW;
862 wake_up_inode(inode); 929 wake_up_bit(&inode->i_state, __I_NEW);
930 spin_unlock(&inode->i_lock);
863} 931}
864EXPORT_SYMBOL(unlock_new_inode); 932EXPORT_SYMBOL(unlock_new_inode);
865 933
866/* 934/**
867 * This is called without the inode lock held.. Be careful. 935 * iget5_locked - obtain an inode from a mounted file system
936 * @sb: super block of file system
937 * @hashval: hash value (usually inode number) to get
938 * @test: callback used for comparisons between inodes
939 * @set: callback used to initialize a new struct inode
940 * @data: opaque data pointer to pass to @test and @set
941 *
942 * Search for the inode specified by @hashval and @data in the inode cache,
943 * and if present it is return it with an increased reference count. This is
944 * a generalized version of iget_locked() for file systems where the inode
945 * number is not sufficient for unique identification of an inode.
868 * 946 *
869 * We no longer cache the sb_flags in i_flags - see fs.h 947 * If the inode is not in cache, allocate a new inode and return it locked,
870 * -- rmk@arm.uk.linux.org 948 * hashed, and with the I_NEW flag set. The file system gets to fill it in
949 * before unlocking it via unlock_new_inode().
950 *
951 * Note both @test and @set are called with the inode_hash_lock held, so can't
952 * sleep.
871 */ 953 */
872static struct inode *get_new_inode(struct super_block *sb, 954struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
873 struct hlist_head *head, 955 int (*test)(struct inode *, void *),
874 int (*test)(struct inode *, void *), 956 int (*set)(struct inode *, void *), void *data)
875 int (*set)(struct inode *, void *),
876 void *data)
877{ 957{
958 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
878 struct inode *inode; 959 struct inode *inode;
879 960
961 spin_lock(&inode_hash_lock);
962 inode = find_inode(sb, head, test, data);
963 spin_unlock(&inode_hash_lock);
964
965 if (inode) {
966 wait_on_inode(inode);
967 return inode;
968 }
969
880 inode = alloc_inode(sb); 970 inode = alloc_inode(sb);
881 if (inode) { 971 if (inode) {
882 struct inode *old; 972 struct inode *old;
883 973
884 spin_lock(&inode_lock); 974 spin_lock(&inode_hash_lock);
885 /* We released the lock, so.. */ 975 /* We released the lock, so.. */
886 old = find_inode(sb, head, test, data); 976 old = find_inode(sb, head, test, data);
887 if (!old) { 977 if (!old) {
888 if (set(inode, data)) 978 if (set(inode, data))
889 goto set_failed; 979 goto set_failed;
890 980
891 hlist_add_head(&inode->i_hash, head); 981 spin_lock(&inode->i_lock);
892 __inode_sb_list_add(inode);
893 inode->i_state = I_NEW; 982 inode->i_state = I_NEW;
894 spin_unlock(&inode_lock); 983 hlist_add_head(&inode->i_hash, head);
984 spin_unlock(&inode->i_lock);
985 inode_sb_list_add(inode);
986 spin_unlock(&inode_hash_lock);
895 987
896 /* Return the locked inode with I_NEW set, the 988 /* Return the locked inode with I_NEW set, the
897 * caller is responsible for filling in the contents 989 * caller is responsible for filling in the contents
@@ -904,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb,
904 * us. Use the old inode instead of the one we just 996 * us. Use the old inode instead of the one we just
905 * allocated. 997 * allocated.
906 */ 998 */
907 spin_unlock(&inode_lock); 999 spin_unlock(&inode_hash_lock);
908 destroy_inode(inode); 1000 destroy_inode(inode);
909 inode = old; 1001 inode = old;
910 wait_on_inode(inode); 1002 wait_on_inode(inode);
@@ -912,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb,
912 return inode; 1004 return inode;
913 1005
914set_failed: 1006set_failed:
915 spin_unlock(&inode_lock); 1007 spin_unlock(&inode_hash_lock);
916 destroy_inode(inode); 1008 destroy_inode(inode);
917 return NULL; 1009 return NULL;
918} 1010}
1011EXPORT_SYMBOL(iget5_locked);
919 1012
920/* 1013/**
921 * get_new_inode_fast is the fast path version of get_new_inode, see the 1014 * iget_locked - obtain an inode from a mounted file system
922 * comment at iget_locked for details. 1015 * @sb: super block of file system
1016 * @ino: inode number to get
1017 *
1018 * Search for the inode specified by @ino in the inode cache and if present
1019 * return it with an increased reference count. This is for file systems
1020 * where the inode number is sufficient for unique identification of an inode.
1021 *
1022 * If the inode is not in cache, allocate a new inode and return it locked,
1023 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1024 * before unlocking it via unlock_new_inode().
923 */ 1025 */
924static struct inode *get_new_inode_fast(struct super_block *sb, 1026struct inode *iget_locked(struct super_block *sb, unsigned long ino)
925 struct hlist_head *head, unsigned long ino)
926{ 1027{
1028 struct hlist_head *head = inode_hashtable + hash(sb, ino);
927 struct inode *inode; 1029 struct inode *inode;
928 1030
1031 spin_lock(&inode_hash_lock);
1032 inode = find_inode_fast(sb, head, ino);
1033 spin_unlock(&inode_hash_lock);
1034 if (inode) {
1035 wait_on_inode(inode);
1036 return inode;
1037 }
1038
929 inode = alloc_inode(sb); 1039 inode = alloc_inode(sb);
930 if (inode) { 1040 if (inode) {
931 struct inode *old; 1041 struct inode *old;
932 1042
933 spin_lock(&inode_lock); 1043 spin_lock(&inode_hash_lock);
934 /* We released the lock, so.. */ 1044 /* We released the lock, so.. */
935 old = find_inode_fast(sb, head, ino); 1045 old = find_inode_fast(sb, head, ino);
936 if (!old) { 1046 if (!old) {
937 inode->i_ino = ino; 1047 inode->i_ino = ino;
938 hlist_add_head(&inode->i_hash, head); 1048 spin_lock(&inode->i_lock);
939 __inode_sb_list_add(inode);
940 inode->i_state = I_NEW; 1049 inode->i_state = I_NEW;
941 spin_unlock(&inode_lock); 1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode->i_lock);
1052 inode_sb_list_add(inode);
1053 spin_unlock(&inode_hash_lock);
942 1054
943 /* Return the locked inode with I_NEW set, the 1055 /* Return the locked inode with I_NEW set, the
944 * caller is responsible for filling in the contents 1056 * caller is responsible for filling in the contents
@@ -951,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
951 * us. Use the old inode instead of the one we just 1063 * us. Use the old inode instead of the one we just
952 * allocated. 1064 * allocated.
953 */ 1065 */
954 spin_unlock(&inode_lock); 1066 spin_unlock(&inode_hash_lock);
955 destroy_inode(inode); 1067 destroy_inode(inode);
956 inode = old; 1068 inode = old;
957 wait_on_inode(inode); 1069 wait_on_inode(inode);
958 } 1070 }
959 return inode; 1071 return inode;
960} 1072}
1073EXPORT_SYMBOL(iget_locked);
961 1074
962/* 1075/*
963 * search the inode cache for a matching inode number. 1076 * search the inode cache for a matching inode number.
@@ -972,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
972 struct hlist_node *node; 1085 struct hlist_node *node;
973 struct inode *inode; 1086 struct inode *inode;
974 1087
1088 spin_lock(&inode_hash_lock);
975 hlist_for_each_entry(inode, node, b, i_hash) { 1089 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb) 1090 if (inode->i_ino == ino && inode->i_sb == sb) {
1091 spin_unlock(&inode_hash_lock);
977 return 0; 1092 return 0;
1093 }
978 } 1094 }
1095 spin_unlock(&inode_hash_lock);
979 1096
980 return 1; 1097 return 1;
981} 1098}
@@ -1005,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1005 static unsigned int counter; 1122 static unsigned int counter;
1006 ino_t res; 1123 ino_t res;
1007 1124
1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock); 1125 spin_lock(&iunique_lock);
1010 do { 1126 do {
1011 if (counter <= max_reserved) 1127 if (counter <= max_reserved)
@@ -1013,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1013 res = counter++; 1129 res = counter++;
1014 } while (!test_inode_iunique(sb, res)); 1130 } while (!test_inode_iunique(sb, res));
1015 spin_unlock(&iunique_lock); 1131 spin_unlock(&iunique_lock);
1016 spin_unlock(&inode_lock);
1017 1132
1018 return res; 1133 return res;
1019} 1134}
@@ -1021,116 +1136,50 @@ EXPORT_SYMBOL(iunique);
1021 1136
1022struct inode *igrab(struct inode *inode) 1137struct inode *igrab(struct inode *inode)
1023{ 1138{
1024 spin_lock(&inode_lock); 1139 spin_lock(&inode->i_lock);
1025 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1140 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1026 __iget(inode); 1141 __iget(inode);
1027 else 1142 spin_unlock(&inode->i_lock);
1143 } else {
1144 spin_unlock(&inode->i_lock);
1028 /* 1145 /*
1029 * Handle the case where s_op->clear_inode is not been 1146 * Handle the case where s_op->clear_inode is not been
1030 * called yet, and somebody is calling igrab 1147 * called yet, and somebody is calling igrab
1031 * while the inode is getting freed. 1148 * while the inode is getting freed.
1032 */ 1149 */
1033 inode = NULL; 1150 inode = NULL;
1034 spin_unlock(&inode_lock); 1151 }
1035 return inode; 1152 return inode;
1036} 1153}
1037EXPORT_SYMBOL(igrab); 1154EXPORT_SYMBOL(igrab);
1038 1155
1039/** 1156/**
1040 * ifind - internal function, you want ilookup5() or iget5().
1041 * @sb: super block of file system to search
1042 * @head: the head of the list to search
1043 * @test: callback used for comparisons between inodes
1044 * @data: opaque data pointer to pass to @test
1045 * @wait: if true wait for the inode to be unlocked, if false do not
1046 *
1047 * ifind() searches for the inode specified by @data in the inode
1048 * cache. This is a generalized version of ifind_fast() for file systems where
1049 * the inode number is not sufficient for unique identification of an inode.
1050 *
1051 * If the inode is in the cache, the inode is returned with an incremented
1052 * reference count.
1053 *
1054 * Otherwise NULL is returned.
1055 *
1056 * Note, @test is called with the inode_lock held, so can't sleep.
1057 */
1058static struct inode *ifind(struct super_block *sb,
1059 struct hlist_head *head, int (*test)(struct inode *, void *),
1060 void *data, const int wait)
1061{
1062 struct inode *inode;
1063
1064 spin_lock(&inode_lock);
1065 inode = find_inode(sb, head, test, data);
1066 if (inode) {
1067 spin_unlock(&inode_lock);
1068 if (likely(wait))
1069 wait_on_inode(inode);
1070 return inode;
1071 }
1072 spin_unlock(&inode_lock);
1073 return NULL;
1074}
1075
1076/**
1077 * ifind_fast - internal function, you want ilookup() or iget().
1078 * @sb: super block of file system to search
1079 * @head: head of the list to search
1080 * @ino: inode number to search for
1081 *
1082 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1083 * file systems where the inode number is sufficient for unique identification
1084 * of an inode.
1085 *
1086 * If the inode is in the cache, the inode is returned with an incremented
1087 * reference count.
1088 *
1089 * Otherwise NULL is returned.
1090 */
1091static struct inode *ifind_fast(struct super_block *sb,
1092 struct hlist_head *head, unsigned long ino)
1093{
1094 struct inode *inode;
1095
1096 spin_lock(&inode_lock);
1097 inode = find_inode_fast(sb, head, ino);
1098 if (inode) {
1099 spin_unlock(&inode_lock);
1100 wait_on_inode(inode);
1101 return inode;
1102 }
1103 spin_unlock(&inode_lock);
1104 return NULL;
1105}
1106
1107/**
1108 * ilookup5_nowait - search for an inode in the inode cache 1157 * ilookup5_nowait - search for an inode in the inode cache
1109 * @sb: super block of file system to search 1158 * @sb: super block of file system to search
1110 * @hashval: hash value (usually inode number) to search for 1159 * @hashval: hash value (usually inode number) to search for
1111 * @test: callback used for comparisons between inodes 1160 * @test: callback used for comparisons between inodes
1112 * @data: opaque data pointer to pass to @test 1161 * @data: opaque data pointer to pass to @test
1113 * 1162 *
1114 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1163 * Search for the inode specified by @hashval and @data in the inode cache.
1115 * @data in the inode cache. This is a generalized version of ilookup() for
1116 * file systems where the inode number is not sufficient for unique
1117 * identification of an inode.
1118 *
1119 * If the inode is in the cache, the inode is returned with an incremented 1164 * If the inode is in the cache, the inode is returned with an incremented
1120 * reference count. Note, the inode lock is not waited upon so you have to be 1165 * reference count.
1121 * very careful what you do with the returned inode. You probably should be
1122 * using ilookup5() instead.
1123 * 1166 *
1124 * Otherwise NULL is returned. 1167 * Note: I_NEW is not waited upon so you have to be very careful what you do
1168 * with the returned inode. You probably should be using ilookup5() instead.
1125 * 1169 *
1126 * Note, @test is called with the inode_lock held, so can't sleep. 1170 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1127 */ 1171 */
1128struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1172struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1129 int (*test)(struct inode *, void *), void *data) 1173 int (*test)(struct inode *, void *), void *data)
1130{ 1174{
1131 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1175 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1176 struct inode *inode;
1132 1177
1133 return ifind(sb, head, test, data, 0); 1178 spin_lock(&inode_hash_lock);
1179 inode = find_inode(sb, head, test, data);
1180 spin_unlock(&inode_hash_lock);
1181
1182 return inode;
1134} 1183}
1135EXPORT_SYMBOL(ilookup5_nowait); 1184EXPORT_SYMBOL(ilookup5_nowait);
1136 1185
@@ -1141,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1141 * @test: callback used for comparisons between inodes 1190 * @test: callback used for comparisons between inodes
1142 * @data: opaque data pointer to pass to @test 1191 * @data: opaque data pointer to pass to @test
1143 * 1192 *
1144 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1193 * Search for the inode specified by @hashval and @data in the inode cache,
1145 * @data in the inode cache. This is a generalized version of ilookup() for 1194 * and if the inode is in the cache, return the inode with an incremented
1146 * file systems where the inode number is not sufficient for unique 1195 * reference count. Waits on I_NEW before returning the inode.
1147 * identification of an inode.
1148 *
1149 * If the inode is in the cache, the inode lock is waited upon and the inode is
1150 * returned with an incremented reference count. 1196 * returned with an incremented reference count.
1151 * 1197 *
1152 * Otherwise NULL is returned. 1198 * This is a generalized version of ilookup() for file systems where the
1199 * inode number is not sufficient for unique identification of an inode.
1153 * 1200 *
1154 * Note, @test is called with the inode_lock held, so can't sleep. 1201 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1155 */ 1202 */
1156struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1203struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1157 int (*test)(struct inode *, void *), void *data) 1204 int (*test)(struct inode *, void *), void *data)
1158{ 1205{
1159 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1206 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1160 1207
1161 return ifind(sb, head, test, data, 1); 1208 if (inode)
1209 wait_on_inode(inode);
1210 return inode;
1162} 1211}
1163EXPORT_SYMBOL(ilookup5); 1212EXPORT_SYMBOL(ilookup5);
1164 1213
@@ -1167,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
1167 * @sb: super block of file system to search 1216 * @sb: super block of file system to search
1168 * @ino: inode number to search for 1217 * @ino: inode number to search for
1169 * 1218 *
1170 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1219 * Search for the inode @ino in the inode cache, and if the inode is in the
1171 * This is for file systems where the inode number is sufficient for unique 1220 * cache, the inode is returned with an incremented reference count.
1172 * identification of an inode.
1173 *
1174 * If the inode is in the cache, the inode is returned with an incremented
1175 * reference count.
1176 *
1177 * Otherwise NULL is returned.
1178 */ 1221 */
1179struct inode *ilookup(struct super_block *sb, unsigned long ino) 1222struct inode *ilookup(struct super_block *sb, unsigned long ino)
1180{ 1223{
1181 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1224 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1182
1183 return ifind_fast(sb, head, ino);
1184}
1185EXPORT_SYMBOL(ilookup);
1186
1187/**
1188 * iget5_locked - obtain an inode from a mounted file system
1189 * @sb: super block of file system
1190 * @hashval: hash value (usually inode number) to get
1191 * @test: callback used for comparisons between inodes
1192 * @set: callback used to initialize a new struct inode
1193 * @data: opaque data pointer to pass to @test and @set
1194 *
1195 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1196 * and @data in the inode cache and if present it is returned with an increased
1197 * reference count. This is a generalized version of iget_locked() for file
1198 * systems where the inode number is not sufficient for unique identification
1199 * of an inode.
1200 *
1201 * If the inode is not in cache, get_new_inode() is called to allocate a new
1202 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1203 * file system gets to fill it in before unlocking it via unlock_new_inode().
1204 *
1205 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1206 */
1207struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1208 int (*test)(struct inode *, void *),
1209 int (*set)(struct inode *, void *), void *data)
1210{
1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1212 struct inode *inode; 1225 struct inode *inode;
1213 1226
1214 inode = ifind(sb, head, test, data, 1); 1227 spin_lock(&inode_hash_lock);
1215 if (inode) 1228 inode = find_inode_fast(sb, head, ino);
1216 return inode; 1229 spin_unlock(&inode_hash_lock);
1217 /*
1218 * get_new_inode() will do the right thing, re-trying the search
1219 * in case it had to block at any point.
1220 */
1221 return get_new_inode(sb, head, test, set, data);
1222}
1223EXPORT_SYMBOL(iget5_locked);
1224
1225/**
1226 * iget_locked - obtain an inode from a mounted file system
1227 * @sb: super block of file system
1228 * @ino: inode number to get
1229 *
1230 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1231 * the inode cache and if present it is returned with an increased reference
1232 * count. This is for file systems where the inode number is sufficient for
1233 * unique identification of an inode.
1234 *
1235 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1236 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1237 * The file system gets to fill it in before unlocking it via
1238 * unlock_new_inode().
1239 */
1240struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1241{
1242 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1243 struct inode *inode;
1244 1230
1245 inode = ifind_fast(sb, head, ino);
1246 if (inode) 1231 if (inode)
1247 return inode; 1232 wait_on_inode(inode);
1248 /* 1233 return inode;
1249 * get_new_inode_fast() will do the right thing, re-trying the search
1250 * in case it had to block at any point.
1251 */
1252 return get_new_inode_fast(sb, head, ino);
1253} 1234}
1254EXPORT_SYMBOL(iget_locked); 1235EXPORT_SYMBOL(ilookup);
1255 1236
1256int insert_inode_locked(struct inode *inode) 1237int insert_inode_locked(struct inode *inode)
1257{ 1238{
@@ -1259,27 +1240,33 @@ int insert_inode_locked(struct inode *inode)
1259 ino_t ino = inode->i_ino; 1240 ino_t ino = inode->i_ino;
1260 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1241 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1261 1242
1262 inode->i_state |= I_NEW;
1263 while (1) { 1243 while (1) {
1264 struct hlist_node *node; 1244 struct hlist_node *node;
1265 struct inode *old = NULL; 1245 struct inode *old = NULL;
1266 spin_lock(&inode_lock); 1246 spin_lock(&inode_hash_lock);
1267 hlist_for_each_entry(old, node, head, i_hash) { 1247 hlist_for_each_entry(old, node, head, i_hash) {
1268 if (old->i_ino != ino) 1248 if (old->i_ino != ino)
1269 continue; 1249 continue;
1270 if (old->i_sb != sb) 1250 if (old->i_sb != sb)
1271 continue; 1251 continue;
1272 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1252 spin_lock(&old->i_lock);
1253 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1254 spin_unlock(&old->i_lock);
1273 continue; 1255 continue;
1256 }
1274 break; 1257 break;
1275 } 1258 }
1276 if (likely(!node)) { 1259 if (likely(!node)) {
1260 spin_lock(&inode->i_lock);
1261 inode->i_state |= I_NEW;
1277 hlist_add_head(&inode->i_hash, head); 1262 hlist_add_head(&inode->i_hash, head);
1278 spin_unlock(&inode_lock); 1263 spin_unlock(&inode->i_lock);
1264 spin_unlock(&inode_hash_lock);
1279 return 0; 1265 return 0;
1280 } 1266 }
1281 __iget(old); 1267 __iget(old);
1282 spin_unlock(&inode_lock); 1268 spin_unlock(&old->i_lock);
1269 spin_unlock(&inode_hash_lock);
1283 wait_on_inode(old); 1270 wait_on_inode(old);
1284 if (unlikely(!inode_unhashed(old))) { 1271 if (unlikely(!inode_unhashed(old))) {
1285 iput(old); 1272 iput(old);
@@ -1296,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1296 struct super_block *sb = inode->i_sb; 1283 struct super_block *sb = inode->i_sb;
1297 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1284 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1298 1285
1299 inode->i_state |= I_NEW;
1300
1301 while (1) { 1286 while (1) {
1302 struct hlist_node *node; 1287 struct hlist_node *node;
1303 struct inode *old = NULL; 1288 struct inode *old = NULL;
1304 1289
1305 spin_lock(&inode_lock); 1290 spin_lock(&inode_hash_lock);
1306 hlist_for_each_entry(old, node, head, i_hash) { 1291 hlist_for_each_entry(old, node, head, i_hash) {
1307 if (old->i_sb != sb) 1292 if (old->i_sb != sb)
1308 continue; 1293 continue;
1309 if (!test(old, data)) 1294 if (!test(old, data))
1310 continue; 1295 continue;
1311 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1296 spin_lock(&old->i_lock);
1297 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1298 spin_unlock(&old->i_lock);
1312 continue; 1299 continue;
1300 }
1313 break; 1301 break;
1314 } 1302 }
1315 if (likely(!node)) { 1303 if (likely(!node)) {
1304 spin_lock(&inode->i_lock);
1305 inode->i_state |= I_NEW;
1316 hlist_add_head(&inode->i_hash, head); 1306 hlist_add_head(&inode->i_hash, head);
1317 spin_unlock(&inode_lock); 1307 spin_unlock(&inode->i_lock);
1308 spin_unlock(&inode_hash_lock);
1318 return 0; 1309 return 0;
1319 } 1310 }
1320 __iget(old); 1311 __iget(old);
1321 spin_unlock(&inode_lock); 1312 spin_unlock(&old->i_lock);
1313 spin_unlock(&inode_hash_lock);
1322 wait_on_inode(old); 1314 wait_on_inode(old);
1323 if (unlikely(!inode_unhashed(old))) { 1315 if (unlikely(!inode_unhashed(old))) {
1324 iput(old); 1316 iput(old);
@@ -1363,47 +1355,35 @@ static void iput_final(struct inode *inode)
1363 const struct super_operations *op = inode->i_sb->s_op; 1355 const struct super_operations *op = inode->i_sb->s_op;
1364 int drop; 1356 int drop;
1365 1357
1358 WARN_ON(inode->i_state & I_NEW);
1359
1366 if (op && op->drop_inode) 1360 if (op && op->drop_inode)
1367 drop = op->drop_inode(inode); 1361 drop = op->drop_inode(inode);
1368 else 1362 else
1369 drop = generic_drop_inode(inode); 1363 drop = generic_drop_inode(inode);
1370 1364
1365 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1366 inode->i_state |= I_REFERENCED;
1367 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1368 inode_lru_list_add(inode);
1369 spin_unlock(&inode->i_lock);
1370 return;
1371 }
1372
1371 if (!drop) { 1373 if (!drop) {
1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1377 spin_unlock(&inode_lock);
1378 return;
1379 }
1380 WARN_ON(inode->i_state & I_NEW);
1381 inode->i_state |= I_WILL_FREE; 1374 inode->i_state |= I_WILL_FREE;
1382 spin_unlock(&inode_lock); 1375 spin_unlock(&inode->i_lock);
1383 write_inode_now(inode, 1); 1376 write_inode_now(inode, 1);
1384 spin_lock(&inode_lock); 1377 spin_lock(&inode->i_lock);
1385 WARN_ON(inode->i_state & I_NEW); 1378 WARN_ON(inode->i_state & I_NEW);
1386 inode->i_state &= ~I_WILL_FREE; 1379 inode->i_state &= ~I_WILL_FREE;
1387 __remove_inode_hash(inode);
1388 } 1380 }
1389 1381
1390 WARN_ON(inode->i_state & I_NEW);
1391 inode->i_state |= I_FREEING; 1382 inode->i_state |= I_FREEING;
1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode); 1383 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list); 1384 spin_unlock(&inode->i_lock);
1399 1385
1400 __inode_sb_list_del(inode);
1401 spin_unlock(&inode_lock);
1402 evict(inode); 1386 evict(inode);
1403 remove_inode_hash(inode);
1404 wake_up_inode(inode);
1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1406 destroy_inode(inode);
1407} 1387}
1408 1388
1409/** 1389/**
@@ -1420,7 +1400,7 @@ void iput(struct inode *inode)
1420 if (inode) { 1400 if (inode) {
1421 BUG_ON(inode->i_state & I_CLEAR); 1401 BUG_ON(inode->i_state & I_CLEAR);
1422 1402
1423 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1403 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1424 iput_final(inode); 1404 iput_final(inode);
1425 } 1405 }
1426} 1406}
@@ -1599,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait);
1599 * to recheck inode state. 1579 * to recheck inode state.
1600 * 1580 *
1601 * It doesn't matter if I_NEW is not set initially, a call to 1581 * It doesn't matter if I_NEW is not set initially, a call to
1602 * wake_up_inode() after removing from the hash list will DTRT. 1582 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1603 * 1583 * will DTRT.
1604 * This is called with inode_lock held.
1605 */ 1584 */
1606static void __wait_on_freeing_inode(struct inode *inode) 1585static void __wait_on_freeing_inode(struct inode *inode)
1607{ 1586{
@@ -1609,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1609 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1588 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1610 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1589 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1611 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1590 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1612 spin_unlock(&inode_lock); 1591 spin_unlock(&inode->i_lock);
1592 spin_unlock(&inode_hash_lock);
1613 schedule(); 1593 schedule();
1614 finish_wait(wq, &wait.wait); 1594 finish_wait(wq, &wait.wait);
1615 spin_lock(&inode_lock); 1595 spin_lock(&inode_hash_lock);
1616} 1596}
1617 1597
1618static __initdata unsigned long ihash_entries; 1598static __initdata unsigned long ihash_entries;
@@ -1704,7 +1684,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1704EXPORT_SYMBOL(init_special_inode); 1684EXPORT_SYMBOL(init_special_inode);
1705 1685
1706/** 1686/**
1707 * Init uid,gid,mode for new inode according to posix standards 1687 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
1708 * @inode: New inode 1688 * @inode: New inode
1709 * @dir: Directory inode 1689 * @dir: Directory inode
1710 * @mode: mode of the new inode 1690 * @mode: mode of the new inode
@@ -1722,3 +1702,22 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
1722 inode->i_mode = mode; 1702 inode->i_mode = mode;
1723} 1703}
1724EXPORT_SYMBOL(inode_init_owner); 1704EXPORT_SYMBOL(inode_init_owner);
1705
1706/**
1707 * inode_owner_or_capable - check current task permissions to inode
1708 * @inode: inode being checked
1709 *
1710 * Return true if current either has CAP_FOWNER to the inode, or
1711 * owns the file.
1712 */
1713bool inode_owner_or_capable(const struct inode *inode)
1714{
1715 struct user_namespace *ns = inode_userns(inode);
1716
1717 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1718 return true;
1719 if (ns_capable(ns, CAP_FOWNER))
1720 return true;
1721 return false;
1722}
1723EXPORT_SYMBOL(inode_owner_or_capable);
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
12#include <linux/lglock.h> 12#include <linux/lglock.h>
13 13
14struct super_block; 14struct super_block;
15struct file_system_type;
15struct linux_binprm; 16struct linux_binprm;
16struct path; 17struct path;
17 18
@@ -61,10 +62,9 @@ extern int check_unsafe_exec(struct linux_binprm *);
61extern int copy_mount_options(const void __user *, unsigned long *); 62extern int copy_mount_options(const void __user *, unsigned long *);
62extern int copy_mount_string(const void __user *, char **); 63extern int copy_mount_string(const void __user *, char **);
63 64
64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt); 65extern unsigned int mnt_get_count(struct vfsmount *mnt);
67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern struct vfsmount *lookup_mnt(struct path *);
68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
69 struct vfsmount *); 69 struct vfsmount *);
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
@@ -99,6 +99,8 @@ extern struct file *get_empty_filp(void);
99extern int do_remount_sb(struct super_block *, int, void *, int); 99extern int do_remount_sb(struct super_block *, int, void *, int);
100extern void __put_super(struct super_block *sb); 100extern void __put_super(struct super_block *sb);
101extern void put_super(struct super_block *sb); 101extern void put_super(struct super_block *sb);
102extern struct dentry *mount_fs(struct file_system_type *,
103 int, const char *, void *);
102 104
103/* 105/*
104 * open.c 106 * open.c
@@ -106,10 +108,30 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 108struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 109extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 110extern void release_open_intent(struct nameidata *);
111struct open_flags {
112 int open_flag;
113 int mode;
114 int acc_mode;
115 int intent;
116};
117extern struct file *do_filp_open(int dfd, const char *pathname,
118 const struct open_flags *op, int lookup_flags);
119extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
120 const char *, const struct open_flags *, int lookup_flags);
121
122extern long do_handle_open(int mountdirfd,
123 struct file_handle __user *ufh, int open_flag);
109 124
110/* 125/*
111 * inode.c 126 * inode.c
112 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
113extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *); 137extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295fa..1d9b9fcb2db4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
273 len = isize; 273 len = isize;
274 } 274 }
275 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
276 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
277 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
278 285
@@ -541,6 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
541{ 548{
542 int error = 0; 549 int error = 0;
543 int __user *argp = (int __user *)arg; 550 int __user *argp = (int __user *)arg;
551 struct inode *inode = filp->f_path.dentry->d_inode;
544 552
545 switch (cmd) { 553 switch (cmd) {
546 case FIOCLEX: 554 case FIOCLEX:
@@ -560,13 +568,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
560 break; 568 break;
561 569
562 case FIOQSIZE: 570 case FIOQSIZE:
563 if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) || 571 if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
564 S_ISREG(filp->f_path.dentry->d_inode->i_mode) || 572 S_ISLNK(inode->i_mode)) {
565 S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) { 573 loff_t res = inode_get_bytes(inode);
566 loff_t res = 574 error = copy_to_user(argp, &res, sizeof(res)) ?
567 inode_get_bytes(filp->f_path.dentry->d_inode); 575 -EFAULT : 0;
568 error = copy_to_user((loff_t __user *)arg, &res,
569 sizeof(res)) ? -EFAULT : 0;
570 } else 576 } else
571 error = -ENOTTY; 577 error = -ENOTTY;
572 break; 578 break;
@@ -583,14 +589,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
583 return ioctl_fiemap(filp, arg); 589 return ioctl_fiemap(filp, arg);
584 590
585 case FIGETBSZ: 591 case FIGETBSZ:
586 { 592 return put_user(inode->i_sb->s_blocksize, argp);
587 struct inode *inode = filp->f_path.dentry->d_inode;
588 int __user *p = (int __user *)arg;
589 return put_user(inode->i_sb->s_blocksize, p);
590 }
591 593
592 default: 594 default:
593 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 595 if (S_ISREG(inode->i_mode))
594 error = file_ioctl(filp, cmd, arg); 596 error = file_ioctl(filp, cmd, arg);
595 else 597 else
596 error = vfs_ioctl(filp, cmd, arg); 598 error = vfs_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1158
1159static const struct address_space_operations isofs_aops = { 1159static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1160 .readpage = isofs_readpage,
1161 .sync_page = block_sync_page,
1162 .bmap = _isofs_bmap 1161 .bmap = _isofs_bmap
1163}; 1162};
1164 1163
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..da871ee084d3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
294 int first_tag = 0; 295 int first_tag = 0;
295 int tag_flag; 296 int tag_flag;
296 int i; 297 int i;
297 int write_op = WRITE_SYNC; 298 struct blk_plug plug;
298 299
299 /* 300 /*
300 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
327 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
328 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
329 330
330 /*
331 * Use plugged writes here, since we want to submit several before
332 * we unplug the device. We don't do explicit unplugging in here,
333 * instead we rely on sync_buffer() doing the unplug for us.
334 */
335 if (commit_transaction->t_synchronous_commit)
336 write_op = WRITE_SYNC_PLUG;
337 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
418 * Now start flushing things to disk, in the order they appear 412 * Now start flushing things to disk, in the order they appear
419 * on the transaction lists. Data blocks go first. 413 * on the transaction lists. Data blocks go first.
420 */ 414 */
415 blk_start_plug(&plug);
421 err = journal_submit_data_buffers(journal, commit_transaction, 416 err = journal_submit_data_buffers(journal, commit_transaction,
422 write_op); 417 WRITE_SYNC);
418 blk_finish_plug(&plug);
423 419
424 /* 420 /*
425 * Wait for all previously submitted IO to complete. 421 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
480 err = 0; 476 err = 0;
481 } 477 }
482 478
483 journal_write_revoke_records(journal, commit_transaction, write_op); 479 blk_start_plug(&plug);
480
481 journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
484 482
485 /* 483 /*
486 * If we found any dirty or locked buffers, then we should have 484 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
650 clear_buffer_dirty(bh); 648 clear_buffer_dirty(bh);
651 set_buffer_uptodate(bh); 649 set_buffer_uptodate(bh);
652 bh->b_end_io = journal_end_buffer_io_sync; 650 bh->b_end_io = journal_end_buffer_io_sync;
653 submit_bh(write_op, bh); 651 submit_bh(WRITE_SYNC, bh);
654 } 652 }
655 cond_resched(); 653 cond_resched();
656 654
@@ -661,6 +659,8 @@ start_journal_io:
661 } 659 }
662 } 660 }
663 661
662 blk_finish_plug(&plug);
663
664 /* Lo and behold: we have just managed to send a transaction to 664 /* Lo and behold: we have just managed to send a transaction to
665 the log. Before we can commit it, wait for the IO so far to 665 the log. Before we can commit it, wait for the IO so far to
666 complete. Control buffers being written are on the 666 complete. Control buffers being written are on the
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..eb11601f2e00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
839 err = journal_bmap(journal, 0, &blocknr); 839 err = journal_bmap(journal, 0, &blocknr);
840 /* If that failed, give up */ 840 /* If that failed, give up */
841 if (err) { 841 if (err) {
842 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 842 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
843 __func__); 843 __func__);
844 goto out_err; 844 goto out_err;
845 } 845 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..fa36d7662b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal,
137 if (journal->j_flags & JBD2_BARRIER && 137 if (journal->j_flags & JBD2_BARRIER &&
138 !JBD2_HAS_INCOMPAT_FEATURE(journal, 138 !JBD2_HAS_INCOMPAT_FEATURE(journal,
139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); 140 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
141 else 141 else
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 142 ret = submit_bh(WRITE_SYNC, bh);
143 143
144 *cbh = bh; 144 *cbh = bh;
145 return ret; 145 return ret;
@@ -329,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
329 int tag_bytes = journal_tag_bytes(journal); 329 int tag_bytes = journal_tag_bytes(journal);
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
332 int write_op = WRITE_SYNC; 332 struct blk_plug plug;
333 333
334 /* 334 /*
335 * First job: lock down the current transaction and wait for 335 * First job: lock down the current transaction and wait for
@@ -363,13 +363,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
363 write_lock(&journal->j_state_lock); 363 write_lock(&journal->j_state_lock);
364 commit_transaction->t_state = T_LOCKED; 364 commit_transaction->t_state = T_LOCKED;
365 365
366 /*
367 * Use plugged writes here, since we want to submit several before
368 * we unplug the device. We don't do explicit unplugging in here,
369 * instead we rely on sync_buffer() doing the unplug for us.
370 */
371 if (commit_transaction->t_synchronous_commit)
372 write_op = WRITE_SYNC_PLUG;
373 trace_jbd2_commit_locking(journal, commit_transaction); 366 trace_jbd2_commit_locking(journal, commit_transaction);
374 stats.run.rs_wait = commit_transaction->t_max_wait; 367 stats.run.rs_wait = commit_transaction->t_max_wait;
375 stats.run.rs_locked = jiffies; 368 stats.run.rs_locked = jiffies;
@@ -469,8 +462,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
469 if (err) 462 if (err)
470 jbd2_journal_abort(journal, err); 463 jbd2_journal_abort(journal, err);
471 464
465 blk_start_plug(&plug);
472 jbd2_journal_write_revoke_records(journal, commit_transaction, 466 jbd2_journal_write_revoke_records(journal, commit_transaction,
473 write_op); 467 WRITE_SYNC);
468 blk_finish_plug(&plug);
474 469
475 jbd_debug(3, "JBD: commit phase 2\n"); 470 jbd_debug(3, "JBD: commit phase 2\n");
476 471
@@ -497,6 +492,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 err = 0; 492 err = 0;
498 descriptor = NULL; 493 descriptor = NULL;
499 bufs = 0; 494 bufs = 0;
495 blk_start_plug(&plug);
500 while (commit_transaction->t_buffers) { 496 while (commit_transaction->t_buffers) {
501 497
502 /* Find the next buffer to be journaled... */ 498 /* Find the next buffer to be journaled... */
@@ -658,7 +654,7 @@ start_journal_io:
658 clear_buffer_dirty(bh); 654 clear_buffer_dirty(bh);
659 set_buffer_uptodate(bh); 655 set_buffer_uptodate(bh);
660 bh->b_end_io = journal_end_buffer_io_sync; 656 bh->b_end_io = journal_end_buffer_io_sync;
661 submit_bh(write_op, bh); 657 submit_bh(WRITE_SYNC, bh);
662 } 658 }
663 cond_resched(); 659 cond_resched();
664 stats.run.rs_blocks_logged += bufs; 660 stats.run.rs_blocks_logged += bufs;
@@ -699,6 +695,8 @@ start_journal_io:
699 __jbd2_journal_abort_hard(journal); 695 __jbd2_journal_abort_hard(journal);
700 } 696 }
701 697
698 blk_finish_plug(&plug);
699
702 /* Lo and behold: we have just managed to send a transaction to 700 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to 701 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the 702 complete. Control buffers being written are on the
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..90407b8fece7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
473} 473}
474 474
475/* 475/*
476 * Called under j_state_lock. Returns true if a transaction commit was started. 476 * Called with j_state_lock locked for writing.
477 * Returns true if a transaction commit was started.
477 */ 478 */
478int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
479{ 480{
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
520{ 521{
521 transaction_t *transaction = NULL; 522 transaction_t *transaction = NULL;
522 tid_t tid; 523 tid_t tid;
524 int need_to_start = 0;
523 525
524 read_lock(&journal->j_state_lock); 526 read_lock(&journal->j_state_lock);
525 if (journal->j_running_transaction && !current->journal_info) { 527 if (journal->j_running_transaction && !current->journal_info) {
526 transaction = journal->j_running_transaction; 528 transaction = journal->j_running_transaction;
527 __jbd2_log_start_commit(journal, transaction->t_tid); 529 if (!tid_geq(journal->j_commit_request, transaction->t_tid))
530 need_to_start = 1;
528 } else if (journal->j_committing_transaction) 531 } else if (journal->j_committing_transaction)
529 transaction = journal->j_committing_transaction; 532 transaction = journal->j_committing_transaction;
530 533
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
535 538
536 tid = transaction->t_tid; 539 tid = transaction->t_tid;
537 read_unlock(&journal->j_state_lock); 540 read_unlock(&journal->j_state_lock);
541 if (need_to_start)
542 jbd2_log_start_commit(journal, tid);
538 jbd2_log_wait_commit(journal, tid); 543 jbd2_log_wait_commit(journal, tid);
539 return 1; 544 return 1;
540} 545}
@@ -986,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
986 err = jbd2_journal_bmap(journal, 0, &blocknr); 991 err = jbd2_journal_bmap(journal, 0, &blocknr);
987 /* If that failed, give up */ 992 /* If that failed, give up */
988 if (err) { 993 if (err) {
989 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 994 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
990 __func__); 995 __func__);
991 goto out_err; 996 goto out_err;
992 } 997 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..1d1191050f99 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
117static int start_this_handle(journal_t *journal, handle_t *handle, 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 118 int gfp_mask)
119{ 119{
120 transaction_t *transaction; 120 transaction_t *transaction, *new_transaction = NULL;
121 int needed; 121 tid_t tid;
122 int nblocks = handle->h_buffer_credits; 122 int needed, need_to_start;
123 transaction_t *new_transaction = NULL; 123 int nblocks = handle->h_buffer_credits;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
222 atomic_sub(nblocks, &transaction->t_outstanding_credits); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 __jbd2_log_start_commit(journal, transaction->t_tid); 225 tid = transaction->t_tid;
226 need_to_start = !tid_geq(journal->j_commit_request, tid);
226 read_unlock(&journal->j_state_lock); 227 read_unlock(&journal->j_state_lock);
228 if (need_to_start)
229 jbd2_log_start_commit(journal, tid);
227 schedule(); 230 schedule();
228 finish_wait(&journal->j_wait_transaction_locked, &wait); 231 finish_wait(&journal->j_wait_transaction_locked, &wait);
229 goto repeat; 232 goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
442{ 445{
443 transaction_t *transaction = handle->h_transaction; 446 transaction_t *transaction = handle->h_transaction;
444 journal_t *journal = transaction->t_journal; 447 journal_t *journal = transaction->t_journal;
445 int ret; 448 tid_t tid;
449 int need_to_start, ret;
446 450
447 /* If we've had an abort of any type, don't even think about 451 /* If we've had an abort of any type, don't even think about
448 * actually doing the restart! */ 452 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
465 spin_unlock(&transaction->t_handle_lock); 469 spin_unlock(&transaction->t_handle_lock);
466 470
467 jbd_debug(2, "restarting handle %p\n", handle); 471 jbd_debug(2, "restarting handle %p\n", handle);
468 __jbd2_log_start_commit(journal, transaction->t_tid); 472 tid = transaction->t_tid;
473 need_to_start = !tid_geq(journal->j_commit_request, tid);
469 read_unlock(&journal->j_state_lock); 474 read_unlock(&journal->j_state_lock);
475 if (need_to_start)
476 jbd2_log_start_commit(journal, tid);
470 477
471 lock_map_release(&handle->h_lockdep_map); 478 lock_map_release(&handle->h_lockdep_map);
472 handle->h_buffer_credits = nblocks; 479 handle->h_buffer_credits = nblocks;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 95b79672150a..828a0e1ea438 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
402 402
403 if (name[0] != '\0') 403 if (name[0] != '\0')
404 return -EINVAL; 404 return -EINVAL;
405 if (!is_owner_or_cap(dentry->d_inode)) 405 if (!inode_owner_or_capable(dentry->d_inode))
406 return -EPERM; 406 return -EPERM;
407 407
408 if (value) { 408 if (value) {
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index fd05a0b9431d..5a001020c542 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -40,12 +40,13 @@ static z_stream inf_strm, def_strm;
40 40
41static int __init alloc_workspaces(void) 41static int __init alloc_workspaces(void)
42{ 42{
43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
44 MAX_MEM_LEVEL));
44 if (!def_strm.workspace) { 45 if (!def_strm.workspace) {
45 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize()); 46 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
46 return -ENOMEM; 47 return -ENOMEM;
47 } 48 }
48 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize())); 49 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
49 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 50 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
50 if (!inf_strm.workspace) { 51 if (!inf_strm.workspace) {
51 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); 52 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed18..82faddd1f321 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
215 no chance of AB-BA deadlock involving its f->sem). */ 215 no chance of AB-BA deadlock involving its f->sem). */
216 mutex_unlock(&f->sem); 216 mutex_unlock(&f->sem);
217 217
218 ret = jffs2_do_create(c, dir_f, f, ri, 218 ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
219 dentry->d_name.name, dentry->d_name.len);
220 if (ret) 219 if (ret)
221 goto fail; 220 goto fail;
222 221
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 385
387 jffs2_complete_reservation(c); 386 jffs2_complete_reservation(c);
388 387
389 ret = jffs2_init_security(inode, dir_i); 388 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
390 if (ret) 389 if (ret)
391 goto fail; 390 goto fail;
392 391
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
530 529
531 jffs2_complete_reservation(c); 530 jffs2_complete_reservation(c);
532 531
533 ret = jffs2_init_security(inode, dir_i); 532 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
534 if (ret) 533 if (ret)
535 goto fail; 534 goto fail;
536 535
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
703 702
704 jffs2_complete_reservation(c); 703 jffs2_complete_reservation(c);
705 704
706 ret = jffs2_init_security(inode, dir_i); 705 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
707 if (ret) 706 if (ret)
708 goto fail; 707 goto fail;
709 708
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b5..e4619b00f7c5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
401 struct jffs2_raw_inode *ri, unsigned char *buf, 401 struct jffs2_raw_inode *ri, unsigned char *buf,
402 uint32_t offset, uint32_t writelen, uint32_t *retlen); 402 uint32_t offset, uint32_t writelen, uint32_t *retlen);
403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, 403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
404 struct jffs2_raw_inode *ri, const char *name, int namelen); 404 struct jffs2_raw_inode *ri, const struct qstr *qstr);
405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, 405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time); 406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, 407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a68..cfeb7164b085 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25/* ---- Initial Security Label Attachment -------------- */ 25/* ---- Initial Security Label Attachment -------------- */
26int jffs2_init_security(struct inode *inode, struct inode *dir) 26int jffs2_init_security(struct inode *inode, struct inode *dir,
27 const struct qstr *qstr)
27{ 28{
28 int rc; 29 int rc;
29 size_t len; 30 size_t len;
30 void *value; 31 void *value;
31 char *name; 32 char *name;
32 33
33 rc = security_inode_init_security(inode, dir, &name, &value, &len); 34 rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
34 if (rc) { 35 if (rc) {
35 if (rc == -EOPNOTSUPP) 36 if (rc == -EOPNOTSUPP)
36 return 0; 37 return 0;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982d..30d175b6d290 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
424 return ret; 424 return ret;
425} 425}
426 426
427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) 427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
428 struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
429 const struct qstr *qstr)
428{ 430{
429 struct jffs2_raw_dirent *rd; 431 struct jffs2_raw_dirent *rd;
430 struct jffs2_full_dnode *fn; 432 struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
466 mutex_unlock(&f->sem); 468 mutex_unlock(&f->sem);
467 jffs2_complete_reservation(c); 469 jffs2_complete_reservation(c);
468 470
469 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); 471 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
470 if (ret) 472 if (ret)
471 return ret; 473 return ret;
472 ret = jffs2_init_acl_post(&f->vfs_inode); 474 ret = jffs2_init_acl_post(&f->vfs_inode);
473 if (ret) 475 if (ret)
474 return ret; 476 return ret;
475 477
476 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 478 ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
477 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 479 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
478 480
479 if (ret) { 481 if (ret) {
480 /* Eep. */ 482 /* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
493 495
494 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); 496 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
495 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); 497 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
496 rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); 498 rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
497 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); 499 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
498 500
499 rd->pino = cpu_to_je32(dir_f->inocache->ino); 501 rd->pino = cpu_to_je32(dir_f->inocache->ino);
500 rd->version = cpu_to_je32(++dir_f->highest_version); 502 rd->version = cpu_to_je32(++dir_f->highest_version);
501 rd->ino = ri->ino; 503 rd->ino = ri->ino;
502 rd->mctime = ri->ctime; 504 rd->mctime = ri->ctime;
503 rd->nsize = namelen; 505 rd->nsize = qstr->len;
504 rd->type = DT_REG; 506 rd->type = DT_REG;
505 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); 507 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
506 rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); 508 rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
507 509
508 fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); 510 fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
509 511
510 jffs2_free_raw_dirent(rd); 512 jffs2_free_raw_dirent(rd);
511 513
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42b..7be4beb306f3 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
121#endif /* CONFIG_JFFS2_FS_XATTR */ 121#endif /* CONFIG_JFFS2_FS_XATTR */
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir,
125 const struct qstr *qstr);
125extern const struct xattr_handler jffs2_security_xattr_handler; 126extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 127#else
127#define jffs2_init_security(inode,dir) (0) 128#define jffs2_init_security(inode,dir,qstr) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 129#endif /* CONFIG_JFFS2_FS_SECURITY */
129 130
130#endif /* _JFFS2_FS_XATTR_H_ */ 131#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 3adb6395e42d..a58fa72d7e59 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,4 +13,4 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
13 13
14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o 14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
15 15
16EXTRA_CFLAGS += -D_JFS_4K 16ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
352 .readpages = jfs_readpages, 352 .readpages = jfs_readpages,
353 .writepage = jfs_writepage, 353 .writepage = jfs_writepage,
354 .writepages = jfs_writepages, 354 .writepages = jfs_writepages,
355 .sync_page = block_sync_page,
356 .write_begin = jfs_write_begin, 355 .write_begin = jfs_write_begin,
357 .write_end = nobh_write_end, 356 .write_end = nobh_write_end,
358 .bmap = jfs_bmap, 357 .bmap = jfs_bmap,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index afe222bf300f..6f98a1866776 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
72 if (err) 72 if (err)
73 return err; 73 return err;
74 74
75 if (!is_owner_or_cap(inode)) { 75 if (!inode_owner_or_capable(inode)) {
76 err = -EACCES; 76 err = -EACCES;
77 goto setflags_out; 77 goto setflags_out;
78 } 78 }
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
583const struct address_space_operations jfs_metapage_aops = { 583const struct address_space_operations jfs_metapage_aops = {
584 .readpage = metapage_readpage, 584 .readpage = metapage_readpage,
585 .writepage = metapage_writepage, 585 .writepage = metapage_writepage,
586 .sync_page = block_sync_page,
587 .releasepage = metapage_releasepage, 586 .releasepage = metapage_releasepage,
588 .invalidatepage = metapage_invalidatepage, 587 .invalidatepage = metapage_invalidatepage,
589 .set_page_dirty = __set_page_dirty_nobuffers, 588 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf2..e9e100fd7c09 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64#ifdef CONFIG_JFS_SECURITY 64#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *); 65extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *);
66#else 67#else
67static inline int jfs_init_security(tid_t tid, struct inode *inode, 68static inline int jfs_init_security(tid_t tid, struct inode *inode,
68 struct inode *dir) 69 struct inode *dir, const struct qstr *qstr)
69{ 70{
70 return 0; 71 return 0;
71} 72}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
115 if (rc) 115 if (rc)
116 goto out3; 116 goto out3;
117 117
118 rc = jfs_init_security(tid, ip, dip); 118 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
119 if (rc) { 119 if (rc) {
120 txAbort(tid, 0); 120 txAbort(tid, 0);
121 goto out3; 121 goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
253 if (rc) 253 if (rc)
254 goto out3; 254 goto out3;
255 255
256 rc = jfs_init_security(tid, ip, dip); 256 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
257 if (rc) { 257 if (rc) {
258 txAbort(tid, 0); 258 txAbort(tid, 0);
259 goto out3; 259 goto out3;
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
@@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
932 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); 929 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
933 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); 930 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
934 931
935 rc = jfs_init_security(tid, ip, dip); 932 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
936 if (rc) 933 if (rc)
937 goto out3; 934 goto out3;
938 935
@@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1395 if (rc) 1392 if (rc)
1396 goto out3; 1393 goto out3;
1397 1394
1398 rc = jfs_init_security(tid, ip, dir); 1395 rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
1399 if (rc) { 1396 if (rc) {
1400 txAbort(tid, 0); 1397 txAbort(tid, 0);
1401 goto out3; 1398 goto out3;
@@ -1600,7 +1597,7 @@ out:
1600 1597
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) 1598static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{ 1599{
1603 if (nd->flags & LOOKUP_RCU) 1600 if (nd && nd->flags & LOOKUP_RCU)
1604 return -ECHILD; 1601 return -ECHILD;
1605 /* 1602 /*
1606 * This is not negative dentry. Always valid. 1603 * This is not negative dentry. Always valid.
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1d..24838f1eeee5 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
678 struct posix_acl *acl; 678 struct posix_acl *acl;
679 int rc; 679 int rc;
680 680
681 if (!is_owner_or_cap(inode)) 681 if (!inode_owner_or_capable(inode))
682 return -EPERM; 682 return -EPERM;
683 683
684 /* 684 /*
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1091} 1091}
1092 1092
1093#ifdef CONFIG_JFS_SECURITY 1093#ifdef CONFIG_JFS_SECURITY
1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) 1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
1095 const struct qstr *qstr)
1095{ 1096{
1096 int rc; 1097 int rc;
1097 size_t len; 1098 size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
1099 char *suffix; 1100 char *suffix;
1100 char *name; 1101 char *name;
1101 1102
1102 rc = security_inode_init_security(inode, dir, &suffix, &value, &len); 1103 rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
1104 &len);
1103 if (rc) { 1105 if (rc) {
1104 if (rc == -EOPNOTSUPP) 1106 if (rc == -EOPNOTSUPP)
1105 return 0; 1107 return 0;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5f1bcb2f06f3..b7c99bfb3da6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
520 struct nsm_handle *nsm, 520 struct nsm_handle *nsm,
521 const struct nlm_reboot *info) 521 const struct nlm_reboot *info)
522{ 522{
523 struct nlm_host *host = NULL; 523 struct nlm_host *host;
524 struct hlist_head *chain; 524 struct hlist_head *chain;
525 struct hlist_node *pos; 525 struct hlist_node *pos;
526 526
@@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
532 host->h_state++; 532 host->h_state++;
533 533
534 nlm_get_host(host); 534 nlm_get_host(host);
535 goto out; 535 mutex_unlock(&nlm_host_mutex);
536 return host;
536 } 537 }
537 } 538 }
538out: 539
539 mutex_unlock(&nlm_host_mutex); 540 mutex_unlock(&nlm_host_mutex);
540 return host; 541 return NULL;
541} 542}
542 543
543/** 544/**
diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f78..0a4f50dfadfb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
145 145
146/* 146/*
147 * Protects the two list heads above, plus the inode->i_flock list 147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */ 148 */
150void lock_flocks(void) 149void lock_flocks(void)
151{ 150{
@@ -415,17 +414,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
415 fl->fl_ops = NULL; 414 fl->fl_ops = NULL;
416 fl->fl_lmops = NULL; 415 fl->fl_lmops = NULL;
417 416
418 switch (l->l_type) { 417 return assign_type(fl, l->l_type);
419 case F_RDLCK:
420 case F_WRLCK:
421 case F_UNLCK:
422 fl->fl_type = l->l_type;
423 break;
424 default:
425 return -EINVAL;
426 }
427
428 return (0);
429} 418}
430#endif 419#endif
431 420
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
index 44bbfd249abc..961f02b86d97 100644
--- a/fs/logfs/compr.c
+++ b/fs/logfs/compr.c
@@ -81,7 +81,7 @@ error:
81 81
82int __init logfs_compr_init(void) 82int __init logfs_compr_init(void)
83{ 83{
84 size_t size = max(zlib_deflate_workspacesize(), 84 size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
85 zlib_inflate_workspacesize()); 85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size); 86 stream.workspace = vmalloc(size);
87 if (!stream.workspace) 87 if (!stream.workspace)
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
39 bio.bi_end_io = request_complete; 39 bio.bi_end_io = request_complete;
40 40
41 submit_bio(rw, &bio); 41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete); 42 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; 43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 44}
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
168 } 167 }
169 len = PAGE_ALIGN(len); 168 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172} 170}
173 171
174 172
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index e86376b87af1..c2ad7028def4 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
196 if (IS_RDONLY(inode)) 196 if (IS_RDONLY(inode))
197 return -EROFS; 197 return -EROFS;
198 198
199 if (!is_owner_or_cap(inode)) 199 if (!inode_owner_or_capable(inode))
200 return -EACCES; 200 return -EACCES;
201 201
202 err = get_user(flags, (int __user *)arg); 202 err = get_user(flags, (int __user *)arg);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 0fd7ca994264..6624684dd5de 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -15,3 +15,11 @@ config MINIX_FS
15 module will be called minix. Note that the file system of your root 15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as 16 partition (the one containing the directory /) cannot be compiled as
17 a module. 17 a module.
18
19config MINIX_FS_NATIVE_ENDIAN
20 def_bool MINIX_FS
21 depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
22
23config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
24 def_bool MINIX_FS
25 depends on M68K && MMU
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
399static const struct address_space_operations minix_aops = { 399static const struct address_space_operations minix_aops = {
400 .readpage = minix_readpage, 400 .readpage = minix_readpage,
401 .writepage = minix_writepage, 401 .writepage = minix_writepage,
402 .sync_page = block_sync_page,
403 .write_begin = minix_write_begin, 402 .write_begin = minix_write_begin,
404 .write_end = generic_write_end, 403 .write_end = generic_write_end,
405 .bmap = minix_bmap 404 .bmap = minix_bmap
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 407b1c84911e..341e2122879a 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -88,4 +88,78 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93
94#error Minix file system byte order broken
95
96#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
97
98/*
99 * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
100 * little-endian bitmaps on little-endian system
101 */
102
103#define minix_test_and_set_bit(nr, addr) \
104 __test_and_set_bit((nr), (unsigned long *)(addr))
105#define minix_set_bit(nr, addr) \
106 __set_bit((nr), (unsigned long *)(addr))
107#define minix_test_and_clear_bit(nr, addr) \
108 __test_and_clear_bit((nr), (unsigned long *)(addr))
109#define minix_test_bit(nr, addr) \
110 test_bit((nr), (unsigned long *)(addr))
111#define minix_find_first_zero_bit(addr, size) \
112 find_first_zero_bit((unsigned long *)(addr), (size))
113
114#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
115
116/*
117 * big-endian 16bit indexed bitmaps
118 */
119
120static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
121{
122 const unsigned short *p = vaddr, *addr = vaddr;
123 unsigned short num;
124
125 if (!size)
126 return 0;
127
128 size = (size >> 4) + ((size & 15) > 0);
129 while (*p++ == 0xffff) {
130 if (--size == 0)
131 return (p - addr) << 4;
132 }
133
134 num = *--p;
135 return ((p - addr) << 4) + ffz(num);
136}
137
138#define minix_test_and_set_bit(nr, addr) \
139 __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
140#define minix_set_bit(nr, addr) \
141 __set_bit((nr) ^ 16, (unsigned long *)(addr))
142#define minix_test_and_clear_bit(nr, addr) \
143 __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
144
145static inline int minix_test_bit(int nr, const void *vaddr)
146{
147 const unsigned short *p = vaddr;
148 return (p[nr >> 4] & (1U << (nr & 15))) != 0;
149}
150
151#else
152
153/*
154 * little-endian bitmaps
155 */
156
157#define minix_test_and_set_bit __test_and_set_bit_le
158#define minix_set_bit __set_bit_le
159#define minix_test_and_clear_bit __test_and_clear_bit_le
160#define minix_test_bit test_bit_le
161#define minix_find_first_zero_bit find_first_zero_bit_le
162
163#endif
164
91#endif /* FS_MINIX_H */ 165#endif /* FS_MINIX_H */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
213 new_de = minix_find_entry(new_dentry, &new_page); 213 new_de = minix_find_entry(new_dentry, &new_page);
214 if (!new_de) 214 if (!new_de)
215 goto out_dir; 215 goto out_dir;
216 inode_inc_link_count(old_inode);
217 minix_set_link(new_de, new_page, old_inode); 216 minix_set_link(new_de, new_page, old_inode);
218 new_inode->i_ctime = CURRENT_TIME_SEC; 217 new_inode->i_ctime = CURRENT_TIME_SEC;
219 if (dir_de) 218 if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
225 if (new_dir->i_nlink >= info->s_link_max) 224 if (new_dir->i_nlink >= info->s_link_max)
226 goto out_dir; 225 goto out_dir;
227 } 226 }
228 inode_inc_link_count(old_inode);
229 err = minix_add_link(new_dentry, old_inode); 227 err = minix_add_link(new_dentry, old_inode);
230 if (err) { 228 if (err)
231 inode_dec_link_count(old_inode);
232 goto out_dir; 229 goto out_dir;
233 }
234 if (dir_de) 230 if (dir_de)
235 inode_inc_link_count(new_dir); 231 inode_inc_link_count(new_dir);
236 } 232 }
237 233
238 minix_delete_entry(old_de, old_page); 234 minix_delete_entry(old_de, old_page);
239 inode_dec_link_count(old_inode); 235 mark_inode_dirty(old_inode);
240 236
241 if (dir_de) { 237 if (dir_de) {
242 minix_set_link(dir_de, dir_page, new_dir); 238 minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
364 sector_t last_block_in_bio = 0; 364 sector_t last_block_in_bio = 0;
365 struct buffer_head map_bh; 365 struct buffer_head map_bh;
366 unsigned long first_logical_block = 0; 366 unsigned long first_logical_block = 0;
367 struct blk_plug plug;
368
369 blk_start_plug(&plug);
367 370
368 map_bh.b_state = 0; 371 map_bh.b_state = 0;
369 map_bh.b_size = 0; 372 map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
385 BUG_ON(!list_empty(pages)); 388 BUG_ON(!list_empty(pages));
386 if (bio) 389 if (bio)
387 mpage_bio_submit(READ, bio); 390 mpage_bio_submit(READ, bio);
391 blk_finish_plug(&plug);
388 return 0; 392 return 0;
389} 393}
390EXPORT_SYMBOL(mpage_readpages); 394EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
666mpage_writepages(struct address_space *mapping, 670mpage_writepages(struct address_space *mapping,
667 struct writeback_control *wbc, get_block_t get_block) 671 struct writeback_control *wbc, get_block_t get_block)
668{ 672{
673 struct blk_plug plug;
669 int ret; 674 int ret;
670 675
676 blk_start_plug(&plug);
677
671 if (!get_block) 678 if (!get_block)
672 ret = generic_writepages(mapping, wbc); 679 ret = generic_writepages(mapping, wbc);
673 else { 680 else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
682 if (mpd.bio) 689 if (mpd.bio)
683 mpage_bio_submit(WRITE, mpd.bio); 690 mpage_bio_submit(WRITE, mpd.bio);
684 } 691 }
692 blk_finish_plug(&plug);
685 return ret; 693 return ret;
686} 694}
687EXPORT_SYMBOL(mpage_writepages); 695EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..3cb616d38d9c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -176,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
176 183
177 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 184 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178 185
186 if (current_user_ns() != inode_userns(inode))
187 goto other_perms;
188
179 if (current_fsuid() == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
180 mode >>= 6; 190 mode >>= 6;
181 else { 191 else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
189 mode >>= 3; 199 mode >>= 3;
190 } 200 }
191 201
202other_perms:
192 /* 203 /*
193 * If the DACs are ok we don't need any capability check. 204 * If the DACs are ok we don't need any capability check.
194 */ 205 */
@@ -230,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
230 * Executable DACs are overridable if at least one exec bit is set. 241 * Executable DACs are overridable if at least one exec bit is set.
231 */ 242 */
232 if (!(mask & MAY_EXEC) || execute_ok(inode)) 243 if (!(mask & MAY_EXEC) || execute_ok(inode))
233 if (capable(CAP_DAC_OVERRIDE)) 244 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
234 return 0; 245 return 0;
235 246
236 /* 247 /*
@@ -238,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
238 */ 249 */
239 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 250 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
240 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 251 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
241 if (capable(CAP_DAC_READ_SEARCH)) 252 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
242 return 0; 253 return 0;
243 254
244 return -EACCES; 255 return -EACCES;
@@ -401,9 +412,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 412{
402 struct fs_struct *fs = current->fs; 413 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
404 416
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 417 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
407 spin_lock(&fs->lock); 420 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 421 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 422 nd->root.dentry != fs->root.dentry)
@@ -414,7 +427,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 427 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 428 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 429 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 430 if (want_root) {
418 path_get(&nd->root); 431 path_get(&nd->root);
419 spin_unlock(&fs->lock); 432 spin_unlock(&fs->lock);
420 } 433 }
@@ -427,7 +440,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 440err:
428 spin_unlock(&dentry->d_lock); 441 spin_unlock(&dentry->d_lock);
429err_root: 442err_root:
430 if (nd->root.mnt) 443 if (want_root)
431 spin_unlock(&fs->lock); 444 spin_unlock(&fs->lock);
432 return -ECHILD; 445 return -ECHILD;
433} 446}
@@ -454,17 +467,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 467{
455 struct fs_struct *fs = current->fs; 468 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 469 struct dentry *parent = nd->path.dentry;
457 470 int want_root = 0;
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465 471
466 BUG_ON(!(nd->flags & LOOKUP_RCU)); 472 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) { 473 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
474 want_root = 1;
468 spin_lock(&fs->lock); 475 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt || 476 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry) 477 nd->root.dentry != fs->root.dentry)
@@ -484,7 +491,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
484 parent->d_count++; 491 parent->d_count++;
485 spin_unlock(&dentry->d_lock); 492 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock); 493 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) { 494 if (want_root) {
488 path_get(&nd->root); 495 path_get(&nd->root);
489 spin_unlock(&fs->lock); 496 spin_unlock(&fs->lock);
490 } 497 }
@@ -498,7 +505,7 @@ err:
498 spin_unlock(&dentry->d_lock); 505 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock); 506 spin_unlock(&parent->d_lock);
500err_root: 507err_root:
501 if (nd->root.mnt) 508 if (want_root)
502 spin_unlock(&fs->lock); 509 spin_unlock(&fs->lock);
503 return -ECHILD; 510 return -ECHILD;
504} 511}
@@ -506,8 +513,16 @@ err_root:
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{ 515{
509 if (nd->flags & LOOKUP_RCU) 516 if (nd->flags & LOOKUP_RCU) {
510 return nameidata_dentry_drop_rcu(nd, dentry); 517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
511 return 0; 526 return 0;
512} 527}
513 528
@@ -526,7 +541,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
526 541
527 BUG_ON(!(nd->flags & LOOKUP_RCU)); 542 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU; 543 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL; 544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock); 546 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq)) 547 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock; 548 goto err_unlock;
@@ -547,53 +563,31 @@ err_unlock:
547 return -ECHILD; 563 return -ECHILD;
548} 564}
549 565
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/** 566/**
559 * release_open_intent - free up open intent resources 567 * release_open_intent - free up open intent resources
560 * @nd: pointer to nameidata 568 * @nd: pointer to nameidata
561 */ 569 */
562void release_open_intent(struct nameidata *nd) 570void release_open_intent(struct nameidata *nd)
563{ 571{
564 if (nd->intent.open.file->f_path.dentry == NULL) 572 struct file *file = nd->intent.open.file;
565 put_filp(nd->intent.open.file);
566 else
567 fput(nd->intent.open.file);
568}
569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580 573
581 status = dentry->d_op->d_revalidate(dentry, nd); 574 if (file && !IS_ERR(file)) {
582 if (status == -ECHILD) { 575 if (file->f_path.dentry == NULL)
583 if (nameidata_dentry_drop_rcu(nd, dentry)) 576 put_filp(file);
584 return status; 577 else
585 status = dentry->d_op->d_revalidate(dentry, nd); 578 fput(file);
586 } 579 }
580}
587 581
588 return status; 582static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
583{
584 return dentry->d_op->d_revalidate(dentry, nd);
589} 585}
590 586
591static inline struct dentry * 587static struct dentry *
592do_revalidate(struct dentry *dentry, struct nameidata *nd) 588do_revalidate(struct dentry *dentry, struct nameidata *nd)
593{ 589{
594 int status; 590 int status = d_revalidate(dentry, nd);
595
596 status = d_revalidate(dentry, nd);
597 if (unlikely(status <= 0)) { 591 if (unlikely(status <= 0)) {
598 /* 592 /*
599 * The dentry failed validation. 593 * The dentry failed validation.
@@ -602,37 +596,18 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
602 * to return a fail status. 596 * to return a fail status.
603 */ 597 */
604 if (status < 0) { 598 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */ 599 dput(dentry);
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status); 600 dentry = ERR_PTR(status);
609 601 } else if (!d_invalidate(dentry)) {
610 } else { 602 dput(dentry);
611 /* Don't d_invalidate in rcu-walk mode */ 603 dentry = NULL;
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
614 if (!d_invalidate(dentry)) {
615 dput(dentry);
616 dentry = NULL;
617 }
618 } 604 }
619 } 605 }
620 return dentry; 606 return dentry;
621} 607}
622 608
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
634/* 609/*
635 * force_reval_path - force revalidation of a dentry 610 * handle_reval_path - force revalidation of a dentry
636 * 611 *
637 * In some situations the path walking code will trust dentries without 612 * In some situations the path walking code will trust dentries without
638 * revalidating them. This causes problems for filesystems that depend on 613 * revalidating them. This causes problems for filesystems that depend on
@@ -646,30 +621,28 @@ static inline int need_reval_dot(struct dentry *dentry)
646 * invalidate the dentry. It's up to the caller to handle putting references 621 * invalidate the dentry. It's up to the caller to handle putting references
647 * to the path if necessary. 622 * to the path if necessary.
648 */ 623 */
649static int 624static inline int handle_reval_path(struct nameidata *nd)
650force_reval_path(struct path *path, struct nameidata *nd)
651{ 625{
626 struct dentry *dentry = nd->path.dentry;
652 int status; 627 int status;
653 struct dentry *dentry = path->dentry;
654 628
655 /* 629 if (likely(!(nd->flags & LOOKUP_JUMPED)))
656 * only check on filesystems where it's possible for the dentry to 630 return 0;
657 * become stale. 631
658 */ 632 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
659 if (!need_reval_dot(dentry)) 633 return 0;
634
635 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
660 return 0; 636 return 0;
661 637
638 /* Note: we do not d_invalidate() */
662 status = d_revalidate(dentry, nd); 639 status = d_revalidate(dentry, nd);
663 if (status > 0) 640 if (status > 0)
664 return 0; 641 return 0;
665 642
666 if (!status) { 643 if (!status)
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
670 d_invalidate(dentry);
671 status = -ESTALE; 644 status = -ESTALE;
672 } 645
673 return status; 646 return status;
674} 647}
675 648
@@ -685,6 +658,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
685static inline int exec_permission(struct inode *inode, unsigned int flags) 658static inline int exec_permission(struct inode *inode, unsigned int flags)
686{ 659{
687 int ret; 660 int ret;
661 struct user_namespace *ns = inode_userns(inode);
688 662
689 if (inode->i_op->permission) { 663 if (inode->i_op->permission) {
690 ret = inode->i_op->permission(inode, MAY_EXEC, flags); 664 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -697,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
697 if (ret == -ECHILD) 671 if (ret == -ECHILD)
698 return ret; 672 return ret;
699 673
700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 674 if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
675 ns_capable(ns, CAP_DAC_READ_SEARCH))
701 goto ok; 676 goto ok;
702 677
703 return ret; 678 return ret;
@@ -738,6 +713,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
738 path_put(&nd->path); 713 path_put(&nd->path);
739 nd->path = nd->root; 714 nd->path = nd->root;
740 path_get(&nd->root); 715 path_get(&nd->root);
716 nd->flags |= LOOKUP_JUMPED;
741 } 717 }
742 nd->inode = nd->path.dentry->d_inode; 718 nd->inode = nd->path.dentry->d_inode;
743 719
@@ -767,18 +743,43 @@ static inline void path_to_nameidata(const struct path *path,
767 nd->path.dentry = path->dentry; 743 nd->path.dentry = path->dentry;
768} 744}
769 745
746static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
747{
748 struct inode *inode = link->dentry->d_inode;
749 if (!IS_ERR(cookie) && inode->i_op->put_link)
750 inode->i_op->put_link(link->dentry, nd, cookie);
751 path_put(link);
752}
753
770static __always_inline int 754static __always_inline int
771__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 755follow_link(struct path *link, struct nameidata *nd, void **p)
772{ 756{
773 int error; 757 int error;
774 struct dentry *dentry = link->dentry; 758 struct dentry *dentry = link->dentry;
775 759
776 touch_atime(link->mnt, dentry); 760 BUG_ON(nd->flags & LOOKUP_RCU);
777 nd_set_link(nd, NULL);
778 761
779 if (link->mnt == nd->path.mnt) 762 if (link->mnt == nd->path.mnt)
780 mntget(link->mnt); 763 mntget(link->mnt);
781 764
765 if (unlikely(current->total_link_count >= 40)) {
766 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
767 path_put(&nd->path);
768 return -ELOOP;
769 }
770 cond_resched();
771 current->total_link_count++;
772
773 touch_atime(link->mnt, dentry);
774 nd_set_link(nd, NULL);
775
776 error = security_inode_follow_link(link->dentry, nd);
777 if (error) {
778 *p = ERR_PTR(error); /* no ->put_link(), please */
779 path_put(&nd->path);
780 return error;
781 }
782
782 nd->last_type = LAST_BIND; 783 nd->last_type = LAST_BIND;
783 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 784 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
784 error = PTR_ERR(*p); 785 error = PTR_ERR(*p);
@@ -788,50 +789,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
788 if (s) 789 if (s)
789 error = __vfs_follow_link(nd, s); 790 error = __vfs_follow_link(nd, s);
790 else if (nd->last_type == LAST_BIND) { 791 else if (nd->last_type == LAST_BIND) {
791 error = force_reval_path(&nd->path, nd); 792 nd->flags |= LOOKUP_JUMPED;
792 if (error) 793 nd->inode = nd->path.dentry->d_inode;
794 if (nd->inode->i_op->follow_link) {
795 /* stepped on a _really_ weird one */
793 path_put(&nd->path); 796 path_put(&nd->path);
797 error = -ELOOP;
798 }
794 } 799 }
795 } 800 }
796 return error; 801 return error;
797} 802}
798 803
799/*
800 * This limits recursive symlink follows to 8, while
801 * limiting consecutive symlinks to 40.
802 *
803 * Without that kind of total limit, nasty chains of consecutive
804 * symlinks can cause almost arbitrarily long lookups.
805 */
806static inline int do_follow_link(struct path *path, struct nameidata *nd)
807{
808 void *cookie;
809 int err = -ELOOP;
810 if (current->link_count >= MAX_NESTED_LINKS)
811 goto loop;
812 if (current->total_link_count >= 40)
813 goto loop;
814 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
815 cond_resched();
816 err = security_inode_follow_link(path->dentry, nd);
817 if (err)
818 goto loop;
819 current->link_count++;
820 current->total_link_count++;
821 nd->depth++;
822 err = __do_follow_link(path, nd, &cookie);
823 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
824 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
825 path_put(path);
826 current->link_count--;
827 nd->depth--;
828 return err;
829loop:
830 path_put_conditional(path, nd);
831 path_put(&nd->path);
832 return err;
833}
834
835static int follow_up_rcu(struct path *path) 804static int follow_up_rcu(struct path *path)
836{ 805{
837 struct vfsmount *parent; 806 struct vfsmount *parent;
@@ -970,8 +939,7 @@ static int follow_managed(struct path *path, unsigned flags)
970 if (managed & DCACHE_MANAGE_TRANSIT) { 939 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op); 940 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage); 941 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry, 942 ret = path->dentry->d_op->d_manage(path->dentry, false);
974 false, false);
975 if (ret < 0) 943 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret; 944 return ret == -EISDIR ? 0 : ret;
977 } 945 }
@@ -1024,6 +992,12 @@ int follow_down_one(struct path *path)
1024 return 0; 992 return 0;
1025} 993}
1026 994
995static inline bool managed_dentry_might_block(struct dentry *dentry)
996{
997 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
998 dentry->d_op->d_manage(dentry, true) < 0);
999}
1000
1027/* 1001/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1002 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to 1003 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1032,19 +1006,26 @@ int follow_down_one(struct path *path)
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1006static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit) 1007 struct inode **inode, bool reverse_transit)
1034{ 1008{
1035 while (d_mountpoint(path->dentry)) { 1009 for (;;) {
1036 struct vfsmount *mounted; 1010 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1011 /*
1038 !reverse_transit && 1012 * Don't forget we might have a non-mountpoint managed dentry
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0) 1013 * that wants to block transit.
1014 */
1015 *inode = path->dentry->d_inode;
1016 if (!reverse_transit &&
1017 unlikely(managed_dentry_might_block(path->dentry)))
1040 return false; 1018 return false;
1019
1020 if (!d_mountpoint(path->dentry))
1021 break;
1022
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1023 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted) 1024 if (!mounted)
1043 break; 1025 break;
1044 path->mnt = mounted; 1026 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root; 1027 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1028 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 } 1029 }
1049 1030
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1031 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
@@ -1070,7 +1051,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1070 1051
1071 seq = read_seqcount_begin(&parent->d_seq); 1052 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1053 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD; 1054 goto failed;
1074 inode = parent->d_inode; 1055 inode = parent->d_inode;
1075 nd->path.dentry = parent; 1056 nd->path.dentry = parent;
1076 nd->seq = seq; 1057 nd->seq = seq;
@@ -1083,8 +1064,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1083 } 1064 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true); 1065 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode; 1066 nd->inode = inode;
1086
1087 return 0; 1067 return 0;
1068
1069failed:
1070 nd->flags &= ~LOOKUP_RCU;
1071 if (!(nd->flags & LOOKUP_ROOT))
1072 nd->root.mnt = NULL;
1073 rcu_read_unlock();
1074 br_read_unlock(vfsmount_lock);
1075 return -ECHILD;
1088} 1076}
1089 1077
1090/* 1078/*
@@ -1095,7 +1083,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here 1083 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true). 1084 * being true).
1097 */ 1085 */
1098int follow_down(struct path *path, bool mounting_here) 1086int follow_down(struct path *path)
1099{ 1087{
1100 unsigned managed; 1088 unsigned managed;
1101 int ret; 1089 int ret;
@@ -1116,7 +1104,7 @@ int follow_down(struct path *path, bool mounting_here)
1116 BUG_ON(!path->dentry->d_op); 1104 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage); 1105 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage( 1106 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false); 1107 path->dentry, false);
1120 if (ret < 0) 1108 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret; 1109 return ret == -EISDIR ? 0 : ret;
1122 } 1110 }
@@ -1218,57 +1206,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1218{ 1206{
1219 struct vfsmount *mnt = nd->path.mnt; 1207 struct vfsmount *mnt = nd->path.mnt;
1220 struct dentry *dentry, *parent = nd->path.dentry; 1208 struct dentry *dentry, *parent = nd->path.dentry;
1221 struct inode *dir; 1209 int need_reval = 1;
1210 int status = 1;
1222 int err; 1211 int err;
1223 1212
1224 /* 1213 /*
1225 * See if the low-level filesystem might want
1226 * to use its own hash..
1227 */
1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1230 if (err < 0)
1231 return err;
1232 }
1233
1234 /*
1235 * Rename seqlock is not required here because in the off chance 1214 * Rename seqlock is not required here because in the off chance
1236 * of a false negative due to a concurrent rename, we're going to 1215 * of a false negative due to a concurrent rename, we're going to
1237 * do the non-racy lookup, below. 1216 * do the non-racy lookup, below.
1238 */ 1217 */
1239 if (nd->flags & LOOKUP_RCU) { 1218 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq; 1219 unsigned seq;
1241
1242 *inode = nd->inode; 1220 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1221 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) { 1222 if (!dentry)
1245 if (nameidata_drop_rcu(nd)) 1223 goto unlazy;
1246 return -ECHILD; 1224
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */ 1225 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1226 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD; 1227 return -ECHILD;
1252
1253 nd->seq = seq; 1228 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1229
1255 goto need_revalidate; 1230 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1256done2: 1231 status = d_revalidate(dentry, nd);
1232 if (unlikely(status <= 0)) {
1233 if (status != -ECHILD)
1234 need_reval = 0;
1235 goto unlazy;
1236 }
1237 }
1257 path->mnt = mnt; 1238 path->mnt = mnt;
1258 path->dentry = dentry; 1239 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1240 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0; 1241 return 0;
1261 if (nameidata_drop_rcu(nd)) 1242unlazy:
1262 return -ECHILD; 1243 if (dentry) {
1263 /* fallthru */ 1244 if (nameidata_dentry_drop_rcu(nd, dentry))
1245 return -ECHILD;
1246 } else {
1247 if (nameidata_drop_rcu(nd))
1248 return -ECHILD;
1249 }
1250 } else {
1251 dentry = __d_lookup(parent, name);
1264 } 1252 }
1265 dentry = __d_lookup(parent, name); 1253
1266 if (!dentry) 1254retry:
1267 goto need_lookup; 1255 if (unlikely(!dentry)) {
1268found: 1256 struct inode *dir = parent->d_inode;
1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1257 BUG_ON(nd->inode != dir);
1270 goto need_revalidate; 1258
1271done: 1259 mutex_lock(&dir->i_mutex);
1260 dentry = d_lookup(parent, name);
1261 if (likely(!dentry)) {
1262 dentry = d_alloc_and_lookup(parent, name, nd);
1263 if (IS_ERR(dentry)) {
1264 mutex_unlock(&dir->i_mutex);
1265 return PTR_ERR(dentry);
1266 }
1267 /* known good */
1268 need_reval = 0;
1269 status = 1;
1270 }
1271 mutex_unlock(&dir->i_mutex);
1272 }
1273 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1274 status = d_revalidate(dentry, nd);
1275 if (unlikely(status <= 0)) {
1276 if (status < 0) {
1277 dput(dentry);
1278 return status;
1279 }
1280 if (!d_invalidate(dentry)) {
1281 dput(dentry);
1282 dentry = NULL;
1283 need_reval = 1;
1284 goto retry;
1285 }
1286 }
1287
1272 path->mnt = mnt; 1288 path->mnt = mnt;
1273 path->dentry = dentry; 1289 path->dentry = dentry;
1274 err = follow_managed(path, nd->flags); 1290 err = follow_managed(path, nd->flags);
@@ -1278,49 +1294,113 @@ done:
1278 } 1294 }
1279 *inode = path->dentry->d_inode; 1295 *inode = path->dentry->d_inode;
1280 return 0; 1296 return 0;
1297}
1281 1298
1282need_lookup: 1299static inline int may_lookup(struct nameidata *nd)
1283 dir = parent->d_inode; 1300{
1284 BUG_ON(nd->inode != dir); 1301 if (nd->flags & LOOKUP_RCU) {
1302 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1303 if (err != -ECHILD)
1304 return err;
1305 if (nameidata_drop_rcu(nd))
1306 return -ECHILD;
1307 }
1308 return exec_permission(nd->inode, 0);
1309}
1285 1310
1286 mutex_lock(&dir->i_mutex); 1311static inline int handle_dots(struct nameidata *nd, int type)
1287 /* 1312{
1288 * First re-do the cached lookup just in case it was created 1313 if (type == LAST_DOTDOT) {
1289 * while we waited for the directory semaphore, or the first 1314 if (nd->flags & LOOKUP_RCU) {
1290 * lookup failed due to an unrelated rename. 1315 if (follow_dotdot_rcu(nd))
1291 * 1316 return -ECHILD;
1292 * This could use version numbering or similar to avoid unnecessary 1317 } else
1293 * cache lookups, but then we'd have to do the first lookup in the 1318 follow_dotdot(nd);
1294 * non-racy way. However in the common case here, everything should 1319 }
1295 * be hot in cache, so would it be a big win? 1320 return 0;
1296 */ 1321}
1297 dentry = d_lookup(parent, name); 1322
1298 if (likely(!dentry)) { 1323static void terminate_walk(struct nameidata *nd)
1299 dentry = d_alloc_and_lookup(parent, name, nd); 1324{
1300 mutex_unlock(&dir->i_mutex); 1325 if (!(nd->flags & LOOKUP_RCU)) {
1301 if (IS_ERR(dentry)) 1326 path_put(&nd->path);
1302 goto fail; 1327 } else {
1303 goto done; 1328 nd->flags &= ~LOOKUP_RCU;
1329 if (!(nd->flags & LOOKUP_ROOT))
1330 nd->root.mnt = NULL;
1331 rcu_read_unlock();
1332 br_read_unlock(vfsmount_lock);
1304 } 1333 }
1334}
1335
1336static inline int walk_component(struct nameidata *nd, struct path *path,
1337 struct qstr *name, int type, int follow)
1338{
1339 struct inode *inode;
1340 int err;
1305 /* 1341 /*
1306 * Uhhuh! Nasty case: the cache was re-populated while 1342 * "." and ".." are special - ".." especially so because it has
1307 * we waited on the semaphore. Need to revalidate. 1343 * to be able to know about the current root directory and
1344 * parent relationships.
1308 */ 1345 */
1309 mutex_unlock(&dir->i_mutex); 1346 if (unlikely(type != LAST_NORM))
1310 goto found; 1347 return handle_dots(nd, type);
1348 err = do_lookup(nd, name, path, &inode);
1349 if (unlikely(err)) {
1350 terminate_walk(nd);
1351 return err;
1352 }
1353 if (!inode) {
1354 path_to_nameidata(path, nd);
1355 terminate_walk(nd);
1356 return -ENOENT;
1357 }
1358 if (unlikely(inode->i_op->follow_link) && follow) {
1359 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1360 return -ECHILD;
1361 BUG_ON(inode != path->dentry->d_inode);
1362 return 1;
1363 }
1364 path_to_nameidata(path, nd);
1365 nd->inode = inode;
1366 return 0;
1367}
1311 1368
1312need_revalidate: 1369/*
1313 dentry = do_revalidate(dentry, nd); 1370 * This limits recursive symlink follows to 8, while
1314 if (!dentry) 1371 * limiting consecutive symlinks to 40.
1315 goto need_lookup; 1372 *
1316 if (IS_ERR(dentry)) 1373 * Without that kind of total limit, nasty chains of consecutive
1317 goto fail; 1374 * symlinks can cause almost arbitrarily long lookups.
1318 if (nd->flags & LOOKUP_RCU) 1375 */
1319 goto done2; 1376static inline int nested_symlink(struct path *path, struct nameidata *nd)
1320 goto done; 1377{
1378 int res;
1321 1379
1322fail: 1380 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1323 return PTR_ERR(dentry); 1381 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1382 path_put_conditional(path, nd);
1383 path_put(&nd->path);
1384 return -ELOOP;
1385 }
1386
1387 nd->depth++;
1388 current->link_count++;
1389
1390 do {
1391 struct path link = *path;
1392 void *cookie;
1393
1394 res = follow_link(&link, nd, &cookie);
1395 if (!res)
1396 res = walk_component(nd, path, &nd->last,
1397 nd->last_type, LOOKUP_FOLLOW);
1398 put_link(nd, &link, cookie);
1399 } while (res > 0);
1400
1401 current->link_count--;
1402 nd->depth--;
1403 return res;
1324} 1404}
1325 1405
1326/* 1406/*
@@ -1340,30 +1420,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1340 while (*name=='/') 1420 while (*name=='/')
1341 name++; 1421 name++;
1342 if (!*name) 1422 if (!*name)
1343 goto return_reval; 1423 return 0;
1344
1345 if (nd->depth)
1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1347 1424
1348 /* At this point we know we have a real path component. */ 1425 /* At this point we know we have a real path component. */
1349 for(;;) { 1426 for(;;) {
1350 struct inode *inode;
1351 unsigned long hash; 1427 unsigned long hash;
1352 struct qstr this; 1428 struct qstr this;
1353 unsigned int c; 1429 unsigned int c;
1430 int type;
1354 1431
1355 nd->flags |= LOOKUP_CONTINUE; 1432 nd->flags |= LOOKUP_CONTINUE;
1356 if (nd->flags & LOOKUP_RCU) { 1433
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1434 err = may_lookup(nd);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
1367 if (err) 1435 if (err)
1368 break; 1436 break;
1369 1437
@@ -1379,56 +1447,43 @@ exec_again:
1379 this.len = name - (const char *) this.name; 1447 this.len = name - (const char *) this.name;
1380 this.hash = end_name_hash(hash); 1448 this.hash = end_name_hash(hash);
1381 1449
1450 type = LAST_NORM;
1451 if (this.name[0] == '.') switch (this.len) {
1452 case 2:
1453 if (this.name[1] == '.') {
1454 type = LAST_DOTDOT;
1455 nd->flags |= LOOKUP_JUMPED;
1456 }
1457 break;
1458 case 1:
1459 type = LAST_DOT;
1460 }
1461 if (likely(type == LAST_NORM)) {
1462 struct dentry *parent = nd->path.dentry;
1463 nd->flags &= ~LOOKUP_JUMPED;
1464 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1465 err = parent->d_op->d_hash(parent, nd->inode,
1466 &this);
1467 if (err < 0)
1468 break;
1469 }
1470 }
1471
1382 /* remove trailing slashes? */ 1472 /* remove trailing slashes? */
1383 if (!c) 1473 if (!c)
1384 goto last_component; 1474 goto last_component;
1385 while (*++name == '/'); 1475 while (*++name == '/');
1386 if (!*name) 1476 if (!*name)
1387 goto last_with_slashes; 1477 goto last_component;
1388 1478
1389 /* 1479 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1390 * "." and ".." are special - ".." especially so because it has 1480 if (err < 0)
1391 * to be able to know about the current root directory and 1481 return err;
1392 * parent relationships.
1393 */
1394 if (this.name[0] == '.') switch (this.len) {
1395 default:
1396 break;
1397 case 2:
1398 if (this.name[1] != '.')
1399 break;
1400 if (nd->flags & LOOKUP_RCU) {
1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
1405 /* fallthrough */
1406 case 1:
1407 continue;
1408 }
1409 /* This does the actual lookups.. */
1410 err = do_lookup(nd, &this, &next, &inode);
1411 if (err)
1412 break;
1413 err = -ENOENT;
1414 if (!inode)
1415 goto out_dput;
1416 1482
1417 if (inode->i_op->follow_link) { 1483 if (err) {
1418 /* We commonly drop rcu-walk here */ 1484 err = nested_symlink(&next, nd);
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
1422 err = do_follow_link(&next, nd);
1423 if (err) 1485 if (err)
1424 goto return_err; 1486 return err;
1425 nd->inode = nd->path.dentry->d_inode;
1426 err = -ENOENT;
1427 if (!nd->inode)
1428 break;
1429 } else {
1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 } 1487 }
1433 err = -ENOTDIR; 1488 err = -ENOTDIR;
1434 if (!nd->inode->i_op->lookup) 1489 if (!nd->inode->i_op->lookup)
@@ -1436,209 +1491,109 @@ exec_again:
1436 continue; 1491 continue;
1437 /* here ends the main loop */ 1492 /* here ends the main loop */
1438 1493
1439last_with_slashes:
1440 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1441last_component: 1494last_component:
1442 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1495 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1443 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1496 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1444 if (lookup_flags & LOOKUP_PARENT)
1445 goto lookup_parent;
1446 if (this.name[0] == '.') switch (this.len) {
1447 default:
1448 break;
1449 case 2:
1450 if (this.name[1] != '.')
1451 break;
1452 if (nd->flags & LOOKUP_RCU) {
1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
1457 /* fallthrough */
1458 case 1:
1459 goto return_reval;
1460 }
1461 err = do_lookup(nd, &this, &next, &inode);
1462 if (err)
1463 break;
1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
1469 err = do_follow_link(&next, nd);
1470 if (err)
1471 goto return_err;
1472 nd->inode = nd->path.dentry->d_inode;
1473 } else {
1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
1477 err = -ENOENT;
1478 if (!nd->inode)
1479 break;
1480 if (lookup_flags & LOOKUP_DIRECTORY) {
1481 err = -ENOTDIR;
1482 if (!nd->inode->i_op->lookup)
1483 break;
1484 }
1485 goto return_base;
1486lookup_parent:
1487 nd->last = this; 1497 nd->last = this;
1488 nd->last_type = LAST_NORM; 1498 nd->last_type = type;
1489 if (this.name[0] != '.')
1490 goto return_base;
1491 if (this.len == 1)
1492 nd->last_type = LAST_DOT;
1493 else if (this.len == 2 && this.name[1] == '.')
1494 nd->last_type = LAST_DOTDOT;
1495 else
1496 goto return_base;
1497return_reval:
1498 /*
1499 * We bypassed the ordinary revalidation routines.
1500 * We may need to check the cached dentry for staleness.
1501 */
1502 if (need_reval_dot(nd->path.dentry)) {
1503 /* Note: we do not d_invalidate() */
1504 err = d_revalidate(nd->path.dentry, nd);
1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
1508 break;
1509 }
1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
1513 return 0; 1499 return 0;
1514out_dput:
1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
1517 break;
1518 } 1500 }
1519 if (!(nd->flags & LOOKUP_RCU)) 1501 terminate_walk(nd);
1520 path_put(&nd->path);
1521return_err:
1522 return err; 1502 return err;
1523} 1503}
1524 1504
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1505static int path_init(int dfd, const char *name, unsigned int flags,
1526{ 1506 struct nameidata *nd, struct file **fp)
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
1539static int path_walk(const char *name, struct nameidata *nd)
1540{
1541 struct path save = nd->path;
1542 int result;
1543
1544 current->total_link_count = 0;
1545
1546 /* make sure the stuff we saved doesn't go away */
1547 path_get(&save);
1548
1549 result = link_path_walk(name, nd);
1550 if (result == -ESTALE) {
1551 /* nd->path had been dropped */
1552 current->total_link_count = 0;
1553 nd->path = save;
1554 path_get(&nd->path);
1555 nd->flags |= LOOKUP_REVAL;
1556 result = link_path_walk(name, nd);
1557 }
1558
1559 path_put(&save);
1560
1561 return result;
1562}
1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{ 1507{
1579 int retval = 0; 1508 int retval = 0;
1580 int fput_needed; 1509 int fput_needed;
1581 struct file *file; 1510 struct file *file;
1582 1511
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1512 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU; 1513 nd->flags = flags | LOOKUP_JUMPED;
1585 nd->depth = 0; 1514 nd->depth = 0;
1515 if (flags & LOOKUP_ROOT) {
1516 struct inode *inode = nd->root.dentry->d_inode;
1517 if (*name) {
1518 if (!inode->i_op->lookup)
1519 return -ENOTDIR;
1520 retval = inode_permission(inode, MAY_EXEC);
1521 if (retval)
1522 return retval;
1523 }
1524 nd->path = nd->root;
1525 nd->inode = inode;
1526 if (flags & LOOKUP_RCU) {
1527 br_read_lock(vfsmount_lock);
1528 rcu_read_lock();
1529 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1530 } else {
1531 path_get(&nd->path);
1532 }
1533 return 0;
1534 }
1535
1586 nd->root.mnt = NULL; 1536 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588 1537
1589 if (*name=='/') { 1538 if (*name=='/') {
1590 struct fs_struct *fs = current->fs; 1539 if (flags & LOOKUP_RCU) {
1591 unsigned seq; 1540 br_read_lock(vfsmount_lock);
1592 1541 rcu_read_lock();
1593 br_read_lock(vfsmount_lock); 1542 set_root_rcu(nd);
1594 rcu_read_lock(); 1543 } else {
1595 1544 set_root(nd);
1596 do { 1545 path_get(&nd->root);
1597 seq = read_seqcount_begin(&fs->seq); 1546 }
1598 nd->root = fs->root; 1547 nd->path = nd->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) { 1548 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs; 1549 if (flags & LOOKUP_RCU) {
1605 unsigned seq; 1550 struct fs_struct *fs = current->fs;
1551 unsigned seq;
1606 1552
1607 br_read_lock(vfsmount_lock); 1553 br_read_lock(vfsmount_lock);
1608 rcu_read_lock(); 1554 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615 1555
1556 do {
1557 seq = read_seqcount_begin(&fs->seq);
1558 nd->path = fs->pwd;
1559 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1560 } while (read_seqcount_retry(&fs->seq, seq));
1561 } else {
1562 get_fs_pwd(current->fs, &nd->path);
1563 }
1616 } else { 1564 } else {
1617 struct dentry *dentry; 1565 struct dentry *dentry;
1618 1566
1619 file = fget_light(dfd, &fput_needed); 1567 file = fget_raw_light(dfd, &fput_needed);
1620 retval = -EBADF; 1568 retval = -EBADF;
1621 if (!file) 1569 if (!file)
1622 goto out_fail; 1570 goto out_fail;
1623 1571
1624 dentry = file->f_path.dentry; 1572 dentry = file->f_path.dentry;
1625 1573
1626 retval = -ENOTDIR; 1574 if (*name) {
1627 if (!S_ISDIR(dentry->d_inode->i_mode)) 1575 retval = -ENOTDIR;
1628 goto fput_fail; 1576 if (!S_ISDIR(dentry->d_inode->i_mode))
1577 goto fput_fail;
1629 1578
1630 retval = file_permission(file, MAY_EXEC); 1579 retval = file_permission(file, MAY_EXEC);
1631 if (retval) 1580 if (retval)
1632 goto fput_fail; 1581 goto fput_fail;
1582 }
1633 1583
1634 nd->path = file->f_path; 1584 nd->path = file->f_path;
1635 if (fput_needed) 1585 if (flags & LOOKUP_RCU) {
1636 nd->file = file; 1586 if (fput_needed)
1637 1587 *fp = file;
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1588 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock); 1589 br_read_lock(vfsmount_lock);
1640 rcu_read_lock(); 1590 rcu_read_lock();
1591 } else {
1592 path_get(&file->f_path);
1593 fput_light(file, fput_needed);
1594 }
1641 } 1595 }
1596
1642 nd->inode = nd->path.dentry->d_inode; 1597 nd->inode = nd->path.dentry->d_inode;
1643 return 0; 1598 return 0;
1644 1599
@@ -1648,60 +1603,23 @@ out_fail:
1648 return retval; 1603 return retval;
1649} 1604}
1650 1605
1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1606static inline int lookup_last(struct nameidata *nd, struct path *path)
1652{ 1607{
1653 int retval = 0; 1608 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1654 int fput_needed; 1609 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1655 struct file *file;
1656
1657 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1658 nd->flags = flags;
1659 nd->depth = 0;
1660 nd->root.mnt = NULL;
1661
1662 if (*name=='/') {
1663 set_root(nd);
1664 nd->path = nd->root;
1665 path_get(&nd->root);
1666 } else if (dfd == AT_FDCWD) {
1667 get_fs_pwd(current->fs, &nd->path);
1668 } else {
1669 struct dentry *dentry;
1670
1671 file = fget_light(dfd, &fput_needed);
1672 retval = -EBADF;
1673 if (!file)
1674 goto out_fail;
1675
1676 dentry = file->f_path.dentry;
1677 1610
1678 retval = -ENOTDIR; 1611 nd->flags &= ~LOOKUP_PARENT;
1679 if (!S_ISDIR(dentry->d_inode->i_mode)) 1612 return walk_component(nd, path, &nd->last, nd->last_type,
1680 goto fput_fail; 1613 nd->flags & LOOKUP_FOLLOW);
1681
1682 retval = file_permission(file, MAY_EXEC);
1683 if (retval)
1684 goto fput_fail;
1685
1686 nd->path = file->f_path;
1687 path_get(&file->f_path);
1688
1689 fput_light(file, fput_needed);
1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1692 return 0;
1693
1694fput_fail:
1695 fput_light(file, fput_needed);
1696out_fail:
1697 return retval;
1698} 1614}
1699 1615
1700/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1616/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1701static int do_path_lookup(int dfd, const char *name, 1617static int path_lookupat(int dfd, const char *name,
1702 unsigned int flags, struct nameidata *nd) 1618 unsigned int flags, struct nameidata *nd)
1703{ 1619{
1704 int retval; 1620 struct file *base = NULL;
1621 struct path path;
1622 int err;
1705 1623
1706 /* 1624 /*
1707 * Path walking is largely split up into 2 different synchronisation 1625 * Path walking is largely split up into 2 different synchronisation
@@ -1717,44 +1635,78 @@ static int do_path_lookup(int dfd, const char *name,
1717 * be handled by restarting a traditional ref-walk (which will always 1635 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete). 1636 * be able to complete).
1719 */ 1637 */
1720 retval = path_init_rcu(dfd, name, flags, nd); 1638 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1721 if (unlikely(retval)) 1639
1722 return retval; 1640 if (unlikely(err))
1723 retval = path_walk_rcu(name, nd); 1641 return err;
1724 path_finish_rcu(nd); 1642
1725 if (nd->root.mnt) { 1643 current->total_link_count = 0;
1726 path_put(&nd->root); 1644 err = link_path_walk(name, nd);
1727 nd->root.mnt = NULL; 1645
1646 if (!err && !(flags & LOOKUP_PARENT)) {
1647 err = lookup_last(nd, &path);
1648 while (err > 0) {
1649 void *cookie;
1650 struct path link = path;
1651 nd->flags |= LOOKUP_PARENT;
1652 err = follow_link(&link, nd, &cookie);
1653 if (!err)
1654 err = lookup_last(nd, &path);
1655 put_link(nd, &link, cookie);
1656 }
1728 } 1657 }
1729 1658
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1659 if (nd->flags & LOOKUP_RCU) {
1731 /* slower, locked walk */ 1660 /* went all way through without dropping RCU */
1732 if (retval == -ESTALE) 1661 BUG_ON(err);
1733 flags |= LOOKUP_REVAL; 1662 if (nameidata_drop_rcu_last(nd))
1734 retval = path_init(dfd, name, flags, nd); 1663 err = -ECHILD;
1735 if (unlikely(retval)) 1664 }
1736 return retval; 1665
1737 retval = path_walk(name, nd); 1666 if (!err) {
1738 if (nd->root.mnt) { 1667 err = handle_reval_path(nd);
1739 path_put(&nd->root); 1668 if (err)
1740 nd->root.mnt = NULL; 1669 path_put(&nd->path);
1670 }
1671
1672 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1673 if (!nd->inode->i_op->lookup) {
1674 path_put(&nd->path);
1675 err = -ENOTDIR;
1741 } 1676 }
1742 } 1677 }
1743 1678
1679 if (base)
1680 fput(base);
1681
1682 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1683 path_put(&nd->root);
1684 nd->root.mnt = NULL;
1685 }
1686 return err;
1687}
1688
1689static int do_path_lookup(int dfd, const char *name,
1690 unsigned int flags, struct nameidata *nd)
1691{
1692 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1693 if (unlikely(retval == -ECHILD))
1694 retval = path_lookupat(dfd, name, flags, nd);
1695 if (unlikely(retval == -ESTALE))
1696 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1697
1744 if (likely(!retval)) { 1698 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) { 1699 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode) 1700 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry); 1701 audit_inode(name, nd->path.dentry);
1748 } 1702 }
1749 } 1703 }
1750
1751 return retval; 1704 return retval;
1752} 1705}
1753 1706
1754int path_lookup(const char *name, unsigned int flags, 1707int kern_path_parent(const char *name, struct nameidata *nd)
1755 struct nameidata *nd)
1756{ 1708{
1757 return do_path_lookup(AT_FDCWD, name, flags, nd); 1709 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1758} 1710}
1759 1711
1760int kern_path(const char *name, unsigned int flags, struct path *path) 1712int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1778,29 +1730,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1778 const char *name, unsigned int flags, 1730 const char *name, unsigned int flags,
1779 struct nameidata *nd) 1731 struct nameidata *nd)
1780{ 1732{
1781 int retval; 1733 nd->root.dentry = dentry;
1782 1734 nd->root.mnt = mnt;
1783 /* same as do_path_lookup */ 1735 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1784 nd->last_type = LAST_ROOT; 1736 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1785 nd->flags = flags;
1786 nd->depth = 0;
1787
1788 nd->path.dentry = dentry;
1789 nd->path.mnt = mnt;
1790 path_get(&nd->path);
1791 nd->root = nd->path;
1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1794
1795 retval = path_walk(name, nd);
1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1797 nd->inode))
1798 audit_inode(name, nd->path.dentry);
1799
1800 path_put(&nd->root);
1801 nd->root.mnt = NULL;
1802
1803 return retval;
1804} 1737}
1805 1738
1806static struct dentry *__lookup_hash(struct qstr *name, 1739static struct dentry *__lookup_hash(struct qstr *name,
@@ -1815,17 +1748,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1815 return ERR_PTR(err); 1748 return ERR_PTR(err);
1816 1749
1817 /* 1750 /*
1818 * See if the low-level filesystem might want
1819 * to use its own hash..
1820 */
1821 if (base->d_flags & DCACHE_OP_HASH) {
1822 err = base->d_op->d_hash(base, inode, name);
1823 dentry = ERR_PTR(err);
1824 if (err < 0)
1825 goto out;
1826 }
1827
1828 /*
1829 * Don't bother with __d_lookup: callers are for creat as 1751 * Don't bother with __d_lookup: callers are for creat as
1830 * well as unlink, so a lot of the time it would cost 1752 * well as unlink, so a lot of the time it would cost
1831 * a double lookup. 1753 * a double lookup.
@@ -1837,7 +1759,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1837 1759
1838 if (!dentry) 1760 if (!dentry)
1839 dentry = d_alloc_and_lookup(base, name, nd); 1761 dentry = d_alloc_and_lookup(base, name, nd);
1840out: 1762
1841 return dentry; 1763 return dentry;
1842} 1764}
1843 1765
@@ -1851,28 +1773,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1851 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1773 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1852} 1774}
1853 1775
1854static int __lookup_one_len(const char *name, struct qstr *this,
1855 struct dentry *base, int len)
1856{
1857 unsigned long hash;
1858 unsigned int c;
1859
1860 this->name = name;
1861 this->len = len;
1862 if (!len)
1863 return -EACCES;
1864
1865 hash = init_name_hash();
1866 while (len--) {
1867 c = *(const unsigned char *)name++;
1868 if (c == '/' || c == '\0')
1869 return -EACCES;
1870 hash = partial_name_hash(c, hash);
1871 }
1872 this->hash = end_name_hash(hash);
1873 return 0;
1874}
1875
1876/** 1776/**
1877 * lookup_one_len - filesystem helper to lookup single pathname component 1777 * lookup_one_len - filesystem helper to lookup single pathname component
1878 * @name: pathname component to lookup 1778 * @name: pathname component to lookup
@@ -1886,14 +1786,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1886 */ 1786 */
1887struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1787struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1888{ 1788{
1889 int err;
1890 struct qstr this; 1789 struct qstr this;
1790 unsigned long hash;
1791 unsigned int c;
1891 1792
1892 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1793 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1893 1794
1894 err = __lookup_one_len(name, &this, base, len); 1795 this.name = name;
1895 if (err) 1796 this.len = len;
1896 return ERR_PTR(err); 1797 if (!len)
1798 return ERR_PTR(-EACCES);
1799
1800 hash = init_name_hash();
1801 while (len--) {
1802 c = *(const unsigned char *)name++;
1803 if (c == '/' || c == '\0')
1804 return ERR_PTR(-EACCES);
1805 hash = partial_name_hash(c, hash);
1806 }
1807 this.hash = end_name_hash(hash);
1808 /*
1809 * See if the low-level filesystem might want
1810 * to use its own hash..
1811 */
1812 if (base->d_flags & DCACHE_OP_HASH) {
1813 int err = base->d_op->d_hash(base, base->d_inode, &this);
1814 if (err < 0)
1815 return ERR_PTR(err);
1816 }
1897 1817
1898 return __lookup_hash(&this, base, NULL); 1818 return __lookup_hash(&this, base, NULL);
1899} 1819}
@@ -1902,7 +1822,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1902 struct path *path) 1822 struct path *path)
1903{ 1823{
1904 struct nameidata nd; 1824 struct nameidata nd;
1905 char *tmp = getname(name); 1825 char *tmp = getname_flags(name, flags);
1906 int err = PTR_ERR(tmp); 1826 int err = PTR_ERR(tmp);
1907 if (!IS_ERR(tmp)) { 1827 if (!IS_ERR(tmp)) {
1908 1828
@@ -1944,11 +1864,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
1944 1864
1945 if (!(dir->i_mode & S_ISVTX)) 1865 if (!(dir->i_mode & S_ISVTX))
1946 return 0; 1866 return 0;
1867 if (current_user_ns() != inode_userns(inode))
1868 goto other_userns;
1947 if (inode->i_uid == fsuid) 1869 if (inode->i_uid == fsuid)
1948 return 0; 1870 return 0;
1949 if (dir->i_uid == fsuid) 1871 if (dir->i_uid == fsuid)
1950 return 0; 1872 return 0;
1951 return !capable(CAP_FOWNER); 1873
1874other_userns:
1875 return !ns_capable(inode_userns(inode), CAP_FOWNER);
1952} 1876}
1953 1877
1954/* 1878/*
@@ -2082,12 +2006,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2082 return error; 2006 return error;
2083} 2007}
2084 2008
2085int may_open(struct path *path, int acc_mode, int flag) 2009static int may_open(struct path *path, int acc_mode, int flag)
2086{ 2010{
2087 struct dentry *dentry = path->dentry; 2011 struct dentry *dentry = path->dentry;
2088 struct inode *inode = dentry->d_inode; 2012 struct inode *inode = dentry->d_inode;
2089 int error; 2013 int error;
2090 2014
2015 /* O_PATH? */
2016 if (!acc_mode)
2017 return 0;
2018
2091 if (!inode) 2019 if (!inode)
2092 return -ENOENT; 2020 return -ENOENT;
2093 2021
@@ -2124,7 +2052,7 @@ int may_open(struct path *path, int acc_mode, int flag)
2124 } 2052 }
2125 2053
2126 /* O_NOATIME can only be set by the owner or superuser */ 2054 /* O_NOATIME can only be set by the owner or superuser */
2127 if (flag & O_NOATIME && !is_owner_or_cap(inode)) 2055 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2128 return -EPERM; 2056 return -EPERM;
2129 2057
2130 /* 2058 /*
@@ -2156,34 +2084,6 @@ static int handle_truncate(struct file *filp)
2156} 2084}
2157 2085
2158/* 2086/*
2159 * Be careful about ever adding any more callers of this
2160 * function. Its flags must be in the namei format, not
2161 * what get passed to sys_open().
2162 */
2163static int __open_namei_create(struct nameidata *nd, struct path *path,
2164 int open_flag, int mode)
2165{
2166 int error;
2167 struct dentry *dir = nd->path.dentry;
2168
2169 if (!IS_POSIXACL(dir->d_inode))
2170 mode &= ~current_umask();
2171 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2172 if (error)
2173 goto out_unlock;
2174 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2175out_unlock:
2176 mutex_unlock(&dir->d_inode->i_mutex);
2177 dput(nd->path.dentry);
2178 nd->path.dentry = path->dentry;
2179
2180 if (error)
2181 return error;
2182 /* Don't check for write permission, don't truncate */
2183 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2184}
2185
2186/*
2187 * Note that while the flag value (low two bits) for sys_open means: 2087 * Note that while the flag value (low two bits) for sys_open means:
2188 * 00 - read-only 2088 * 00 - read-only
2189 * 01 - write-only 2089 * 01 - write-only
@@ -2207,128 +2107,115 @@ static inline int open_to_namei_flags(int flag)
2207 return flag; 2107 return flag;
2208} 2108}
2209 2109
2210static int open_will_truncate(int flag, struct inode *inode)
2211{
2212 /*
2213 * We'll never write to the fs underlying
2214 * a device file.
2215 */
2216 if (special_file(inode->i_mode))
2217 return 0;
2218 return (flag & O_TRUNC);
2219}
2220
2221static struct file *finish_open(struct nameidata *nd,
2222 int open_flag, int acc_mode)
2223{
2224 struct file *filp;
2225 int will_truncate;
2226 int error;
2227
2228 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2229 if (will_truncate) {
2230 error = mnt_want_write(nd->path.mnt);
2231 if (error)
2232 goto exit;
2233 }
2234 error = may_open(&nd->path, acc_mode, open_flag);
2235 if (error) {
2236 if (will_truncate)
2237 mnt_drop_write(nd->path.mnt);
2238 goto exit;
2239 }
2240 filp = nameidata_to_filp(nd);
2241 if (!IS_ERR(filp)) {
2242 error = ima_file_check(filp, acc_mode);
2243 if (error) {
2244 fput(filp);
2245 filp = ERR_PTR(error);
2246 }
2247 }
2248 if (!IS_ERR(filp)) {
2249 if (will_truncate) {
2250 error = handle_truncate(filp);
2251 if (error) {
2252 fput(filp);
2253 filp = ERR_PTR(error);
2254 }
2255 }
2256 }
2257 /*
2258 * It is now safe to drop the mnt write
2259 * because the filp has had a write taken
2260 * on its behalf.
2261 */
2262 if (will_truncate)
2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
2265 return filp;
2266
2267exit:
2268 if (!IS_ERR(nd->intent.open.file))
2269 release_open_intent(nd);
2270 path_put(&nd->path);
2271 return ERR_PTR(error);
2272}
2273
2274/* 2110/*
2275 * Handle O_CREAT case for do_filp_open 2111 * Handle the last step of open()
2276 */ 2112 */
2277static struct file *do_last(struct nameidata *nd, struct path *path, 2113static struct file *do_last(struct nameidata *nd, struct path *path,
2278 int open_flag, int acc_mode, 2114 const struct open_flags *op, const char *pathname)
2279 int mode, const char *pathname)
2280{ 2115{
2281 struct dentry *dir = nd->path.dentry; 2116 struct dentry *dir = nd->path.dentry;
2117 struct dentry *dentry;
2118 int open_flag = op->open_flag;
2119 int will_truncate = open_flag & O_TRUNC;
2120 int want_write = 0;
2121 int acc_mode = op->acc_mode;
2282 struct file *filp; 2122 struct file *filp;
2283 int error = -EISDIR; 2123 int error;
2124
2125 nd->flags &= ~LOOKUP_PARENT;
2126 nd->flags |= op->intent;
2284 2127
2285 switch (nd->last_type) { 2128 switch (nd->last_type) {
2286 case LAST_DOTDOT: 2129 case LAST_DOTDOT:
2287 follow_dotdot(nd);
2288 dir = nd->path.dentry;
2289 case LAST_DOT: 2130 case LAST_DOT:
2290 if (need_reval_dot(dir)) { 2131 error = handle_dots(nd, nd->last_type);
2291 int status = d_revalidate(nd->path.dentry, nd); 2132 if (error)
2292 if (!status) 2133 return ERR_PTR(error);
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
2296 goto exit;
2297 }
2298 }
2299 /* fallthrough */ 2134 /* fallthrough */
2300 case LAST_ROOT: 2135 case LAST_ROOT:
2301 goto exit; 2136 if (nd->flags & LOOKUP_RCU) {
2137 if (nameidata_drop_rcu_last(nd))
2138 return ERR_PTR(-ECHILD);
2139 }
2140 error = handle_reval_path(nd);
2141 if (error)
2142 goto exit;
2143 audit_inode(pathname, nd->path.dentry);
2144 if (open_flag & O_CREAT) {
2145 error = -EISDIR;
2146 goto exit;
2147 }
2148 goto ok;
2302 case LAST_BIND: 2149 case LAST_BIND:
2150 /* can't be RCU mode here */
2151 error = handle_reval_path(nd);
2152 if (error)
2153 goto exit;
2303 audit_inode(pathname, dir); 2154 audit_inode(pathname, dir);
2304 goto ok; 2155 goto ok;
2305 } 2156 }
2306 2157
2158 if (!(open_flag & O_CREAT)) {
2159 int symlink_ok = 0;
2160 if (nd->last.name[nd->last.len])
2161 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2162 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2163 symlink_ok = 1;
2164 /* we _can_ be in RCU mode here */
2165 error = walk_component(nd, path, &nd->last, LAST_NORM,
2166 !symlink_ok);
2167 if (error < 0)
2168 return ERR_PTR(error);
2169 if (error) /* symlink */
2170 return NULL;
2171 /* sayonara */
2172 if (nd->flags & LOOKUP_RCU) {
2173 if (nameidata_drop_rcu_last(nd))
2174 return ERR_PTR(-ECHILD);
2175 }
2176
2177 error = -ENOTDIR;
2178 if (nd->flags & LOOKUP_DIRECTORY) {
2179 if (!nd->inode->i_op->lookup)
2180 goto exit;
2181 }
2182 audit_inode(pathname, nd->path.dentry);
2183 goto ok;
2184 }
2185
2186 /* create side of things */
2187
2188 if (nd->flags & LOOKUP_RCU) {
2189 if (nameidata_drop_rcu_last(nd))
2190 return ERR_PTR(-ECHILD);
2191 }
2192
2193 audit_inode(pathname, dir);
2194 error = -EISDIR;
2307 /* trailing slashes? */ 2195 /* trailing slashes? */
2308 if (nd->last.name[nd->last.len]) 2196 if (nd->last.name[nd->last.len])
2309 goto exit; 2197 goto exit;
2310 2198
2311 mutex_lock(&dir->d_inode->i_mutex); 2199 mutex_lock(&dir->d_inode->i_mutex);
2312 2200
2313 path->dentry = lookup_hash(nd); 2201 dentry = lookup_hash(nd);
2314 path->mnt = nd->path.mnt; 2202 error = PTR_ERR(dentry);
2315 2203 if (IS_ERR(dentry)) {
2316 error = PTR_ERR(path->dentry);
2317 if (IS_ERR(path->dentry)) {
2318 mutex_unlock(&dir->d_inode->i_mutex); 2204 mutex_unlock(&dir->d_inode->i_mutex);
2319 goto exit; 2205 goto exit;
2320 } 2206 }
2321 2207
2322 if (IS_ERR(nd->intent.open.file)) { 2208 path->dentry = dentry;
2323 error = PTR_ERR(nd->intent.open.file); 2209 path->mnt = nd->path.mnt;
2324 goto exit_mutex_unlock;
2325 }
2326 2210
2327 /* Negative dentry, just create the file */ 2211 /* Negative dentry, just create the file */
2328 if (!path->dentry->d_inode) { 2212 if (!dentry->d_inode) {
2213 int mode = op->mode;
2214 if (!IS_POSIXACL(dir->d_inode))
2215 mode &= ~current_umask();
2329 /* 2216 /*
2330 * This write is needed to ensure that a 2217 * This write is needed to ensure that a
2331 * ro->rw transition does not occur between 2218 * rw->ro transition does not occur between
2332 * the time when the file is created and when 2219 * the time when the file is created and when
2333 * a permanent write count is taken through 2220 * a permanent write count is taken through
2334 * the 'struct file' in nameidata_to_filp(). 2221 * the 'struct file' in nameidata_to_filp().
@@ -2336,22 +2223,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2336 error = mnt_want_write(nd->path.mnt); 2223 error = mnt_want_write(nd->path.mnt);
2337 if (error) 2224 if (error)
2338 goto exit_mutex_unlock; 2225 goto exit_mutex_unlock;
2339 error = __open_namei_create(nd, path, open_flag, mode); 2226 want_write = 1;
2340 if (error) { 2227 /* Don't check for write permission, don't truncate */
2341 mnt_drop_write(nd->path.mnt); 2228 open_flag &= ~O_TRUNC;
2342 goto exit; 2229 will_truncate = 0;
2343 } 2230 acc_mode = MAY_OPEN;
2344 filp = nameidata_to_filp(nd); 2231 error = security_path_mknod(&nd->path, dentry, mode, 0);
2345 mnt_drop_write(nd->path.mnt); 2232 if (error)
2346 path_put(&nd->path); 2233 goto exit_mutex_unlock;
2347 if (!IS_ERR(filp)) { 2234 error = vfs_create(dir->d_inode, dentry, mode, nd);
2348 error = ima_file_check(filp, acc_mode); 2235 if (error)
2349 if (error) { 2236 goto exit_mutex_unlock;
2350 fput(filp); 2237 mutex_unlock(&dir->d_inode->i_mutex);
2351 filp = ERR_PTR(error); 2238 dput(nd->path.dentry);
2352 } 2239 nd->path.dentry = dentry;
2353 } 2240 goto common;
2354 return filp;
2355 } 2241 }
2356 2242
2357 /* 2243 /*
@@ -2381,7 +2267,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2381 if (S_ISDIR(nd->inode->i_mode)) 2267 if (S_ISDIR(nd->inode->i_mode))
2382 goto exit; 2268 goto exit;
2383ok: 2269ok:
2384 filp = finish_open(nd, open_flag, acc_mode); 2270 if (!S_ISREG(nd->inode->i_mode))
2271 will_truncate = 0;
2272
2273 if (will_truncate) {
2274 error = mnt_want_write(nd->path.mnt);
2275 if (error)
2276 goto exit;
2277 want_write = 1;
2278 }
2279common:
2280 error = may_open(&nd->path, acc_mode, open_flag);
2281 if (error)
2282 goto exit;
2283 filp = nameidata_to_filp(nd);
2284 if (!IS_ERR(filp)) {
2285 error = ima_file_check(filp, op->acc_mode);
2286 if (error) {
2287 fput(filp);
2288 filp = ERR_PTR(error);
2289 }
2290 }
2291 if (!IS_ERR(filp)) {
2292 if (will_truncate) {
2293 error = handle_truncate(filp);
2294 if (error) {
2295 fput(filp);
2296 filp = ERR_PTR(error);
2297 }
2298 }
2299 }
2300out:
2301 if (want_write)
2302 mnt_drop_write(nd->path.mnt);
2303 path_put(&nd->path);
2385 return filp; 2304 return filp;
2386 2305
2387exit_mutex_unlock: 2306exit_mutex_unlock:
@@ -2389,199 +2308,103 @@ exit_mutex_unlock:
2389exit_dput: 2308exit_dput:
2390 path_put_conditional(path, nd); 2309 path_put_conditional(path, nd);
2391exit: 2310exit:
2392 if (!IS_ERR(nd->intent.open.file)) 2311 filp = ERR_PTR(error);
2393 release_open_intent(nd); 2312 goto out;
2394 path_put(&nd->path);
2395 return ERR_PTR(error);
2396} 2313}
2397 2314
2398/* 2315static struct file *path_openat(int dfd, const char *pathname,
2399 * Note that the low bits of the passed in "open_flag" 2316 struct nameidata *nd, const struct open_flags *op, int flags)
2400 * are not the same as in the local variable "flag". See
2401 * open_to_namei_flags() for more details.
2402 */
2403struct file *do_filp_open(int dfd, const char *pathname,
2404 int open_flag, int mode, int acc_mode)
2405{ 2317{
2318 struct file *base = NULL;
2406 struct file *filp; 2319 struct file *filp;
2407 struct nameidata nd;
2408 int error;
2409 struct path path; 2320 struct path path;
2410 int count = 0; 2321 int error;
2411 int flag = open_to_namei_flags(open_flag);
2412 int flags;
2413
2414 if (!(open_flag & O_CREAT))
2415 mode = 0;
2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
2420 /*
2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2422 * check for O_DSYNC if the need any syncing at all we enforce it's
2423 * always set instead of having to deal with possibly weird behaviour
2424 * for malicious applications setting only __O_SYNC.
2425 */
2426 if (open_flag & __O_SYNC)
2427 open_flag |= O_DSYNC;
2428
2429 if (!acc_mode)
2430 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2431
2432 /* O_TRUNC implies we need access checks for write permissions */
2433 if (open_flag & O_TRUNC)
2434 acc_mode |= MAY_WRITE;
2435
2436 /* Allow the LSM permission hook to distinguish append
2437 access from general write access. */
2438 if (open_flag & O_APPEND)
2439 acc_mode |= MAY_APPEND;
2440
2441 flags = LOOKUP_OPEN;
2442 if (open_flag & O_CREAT) {
2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451 2322
2452 filp = get_empty_filp(); 2323 filp = get_empty_filp();
2453 if (!filp) 2324 if (!filp)
2454 return ERR_PTR(-ENFILE); 2325 return ERR_PTR(-ENFILE);
2455 2326
2456 filp->f_flags = open_flag; 2327 filp->f_flags = op->open_flag;
2457 nd.intent.open.file = filp; 2328 nd->intent.open.file = filp;
2458 nd.intent.open.flags = flag; 2329 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2459 nd.intent.open.create_mode = mode; 2330 nd->intent.open.create_mode = op->mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463 2331
2464 /* !O_CREAT, simple open */ 2332 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error)) 2333 if (unlikely(error))
2467 goto out_filp; 2334 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2486 if (error)
2487 goto out_filp;
2488 error = path_walk_rcu(pathname, &nd);
2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
2500 2335
2501 error = path_walk_simple(pathname, &nd); 2336 current->total_link_count = 0;
2502 } 2337 error = link_path_walk(pathname, nd);
2503 if (unlikely(error)) 2338 if (unlikely(error))
2504 goto out_filp; 2339 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
2506 audit_inode(pathname, nd.path.dentry);
2507 2340
2508 /* 2341 filp = do_last(nd, &path, op, pathname);
2509 * We have the parent and last component.
2510 */
2511 nd.flags = flags;
2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2513 while (unlikely(!filp)) { /* trailing symlink */ 2342 while (unlikely(!filp)) { /* trailing symlink */
2514 struct path link = path; 2343 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2516 void *cookie; 2344 void *cookie;
2517 error = -ELOOP; 2345 if (!(nd->flags & LOOKUP_FOLLOW)) {
2518 if (!(nd.flags & LOOKUP_FOLLOW)) 2346 path_put_conditional(&path, nd);
2519 goto exit_dput; 2347 path_put(&nd->path);
2520 if (count++ == 32) 2348 filp = ERR_PTR(-ELOOP);
2521 goto exit_dput; 2349 break;
2522 /*
2523 * This is subtle. Instead of calling do_follow_link() we do
2524 * the thing by hands. The reason is that this way we have zero
2525 * link_count and path_walk() (called from ->follow_link)
2526 * honoring LOOKUP_PARENT. After that we have the parent and
2527 * last component, i.e. we are in the same situation as after
2528 * the first path_walk(). Well, almost - if the last component
2529 * is normal we get its copy stored in nd->last.name and we will
2530 * have to putname() it when we are done. Procfs-like symlinks
2531 * just set LAST_BIND.
2532 */
2533 nd.flags |= LOOKUP_PARENT;
2534 error = security_inode_follow_link(link.dentry, &nd);
2535 if (error)
2536 goto exit_dput;
2537 error = __do_follow_link(&link, &nd, &cookie);
2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2541 /* nd.path had been dropped */
2542 nd.path = link;
2543 goto out_path;
2544 } 2350 }
2545 nd.flags &= ~LOOKUP_PARENT; 2351 nd->flags |= LOOKUP_PARENT;
2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2352 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2547 if (linki->i_op->put_link) 2353 error = follow_link(&link, nd, &cookie);
2548 linki->i_op->put_link(link.dentry, &nd, cookie); 2354 if (unlikely(error))
2549 path_put(&link); 2355 filp = ERR_PTR(error);
2356 else
2357 filp = do_last(nd, &path, op, pathname);
2358 put_link(nd, &link, cookie);
2550 } 2359 }
2551out: 2360out:
2552 if (nd.root.mnt) 2361 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2553 path_put(&nd.root); 2362 path_put(&nd->root);
2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2363 if (base)
2555 goto reval; 2364 fput(base);
2365 release_open_intent(nd);
2556 return filp; 2366 return filp;
2557 2367
2558exit_dput:
2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp: 2368out_filp:
2563 if (!IS_ERR(nd.intent.open.file))
2564 release_open_intent(&nd);
2565 filp = ERR_PTR(error); 2369 filp = ERR_PTR(error);
2566 goto out; 2370 goto out;
2567} 2371}
2568 2372
2569/** 2373struct file *do_filp_open(int dfd, const char *pathname,
2570 * filp_open - open file and return file pointer 2374 const struct open_flags *op, int flags)
2571 *
2572 * @filename: path to open
2573 * @flags: open flags as per the open(2) second argument
2574 * @mode: mode for the new file if O_CREAT is set, else ignored
2575 *
2576 * This is the helper to open a file from kernelspace if you really
2577 * have to. But in generally you should not do this, so please move
2578 * along, nothing to see here..
2579 */
2580struct file *filp_open(const char *filename, int flags, int mode)
2581{ 2375{
2582 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2376 struct nameidata nd;
2377 struct file *filp;
2378
2379 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2380 if (unlikely(filp == ERR_PTR(-ECHILD)))
2381 filp = path_openat(dfd, pathname, &nd, op, flags);
2382 if (unlikely(filp == ERR_PTR(-ESTALE)))
2383 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2384 return filp;
2385}
2386
2387struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2388 const char *name, const struct open_flags *op, int flags)
2389{
2390 struct nameidata nd;
2391 struct file *file;
2392
2393 nd.root.mnt = mnt;
2394 nd.root.dentry = dentry;
2395
2396 flags |= LOOKUP_ROOT;
2397
2398 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2399 return ERR_PTR(-ELOOP);
2400
2401 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2402 if (unlikely(file == ERR_PTR(-ECHILD)))
2403 file = path_openat(-1, name, &nd, op, flags);
2404 if (unlikely(file == ERR_PTR(-ESTALE)))
2405 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2406 return file;
2583} 2407}
2584EXPORT_SYMBOL(filp_open);
2585 2408
2586/** 2409/**
2587 * lookup_create - lookup a dentry, creating it if it doesn't exist 2410 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -2643,7 +2466,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2643 if (error) 2466 if (error)
2644 return error; 2467 return error;
2645 2468
2646 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2469 if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2470 !ns_capable(inode_userns(dir), CAP_MKNOD))
2647 return -EPERM; 2471 return -EPERM;
2648 2472
2649 if (!dir->i_op->mknod) 2473 if (!dir->i_op->mknod)
@@ -3120,7 +2944,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3120 return error; 2944 return error;
3121 2945
3122 mutex_lock(&inode->i_mutex); 2946 mutex_lock(&inode->i_mutex);
3123 error = dir->i_op->link(old_dentry, dir, new_dentry); 2947 /* Make sure we don't allow creating hardlink to an unlinked file */
2948 if (inode->i_nlink == 0)
2949 error = -ENOENT;
2950 else
2951 error = dir->i_op->link(old_dentry, dir, new_dentry);
3124 mutex_unlock(&inode->i_mutex); 2952 mutex_unlock(&inode->i_mutex);
3125 if (!error) 2953 if (!error)
3126 fsnotify_link(dir, inode, new_dentry); 2954 fsnotify_link(dir, inode, new_dentry);
@@ -3142,15 +2970,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3142 struct dentry *new_dentry; 2970 struct dentry *new_dentry;
3143 struct nameidata nd; 2971 struct nameidata nd;
3144 struct path old_path; 2972 struct path old_path;
2973 int how = 0;
3145 int error; 2974 int error;
3146 char *to; 2975 char *to;
3147 2976
3148 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2977 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3149 return -EINVAL; 2978 return -EINVAL;
2979 /*
2980 * To use null names we require CAP_DAC_READ_SEARCH
2981 * This ensures that not everyone will be able to create
2982 * handlink using the passed filedescriptor.
2983 */
2984 if (flags & AT_EMPTY_PATH) {
2985 if (!capable(CAP_DAC_READ_SEARCH))
2986 return -ENOENT;
2987 how = LOOKUP_EMPTY;
2988 }
2989
2990 if (flags & AT_SYMLINK_FOLLOW)
2991 how |= LOOKUP_FOLLOW;
3150 2992
3151 error = user_path_at(olddfd, oldname, 2993 error = user_path_at(olddfd, oldname, how, &old_path);
3152 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3153 &old_path);
3154 if (error) 2994 if (error)
3155 return error; 2995 return error;
3156 2996
@@ -3587,7 +3427,7 @@ EXPORT_SYMBOL(page_readlink);
3587EXPORT_SYMBOL(__page_symlink); 3427EXPORT_SYMBOL(__page_symlink);
3588EXPORT_SYMBOL(page_symlink); 3428EXPORT_SYMBOL(page_symlink);
3589EXPORT_SYMBOL(page_symlink_inode_operations); 3429EXPORT_SYMBOL(page_symlink_inode_operations);
3590EXPORT_SYMBOL(path_lookup); 3430EXPORT_SYMBOL(kern_path_parent);
3591EXPORT_SYMBOL(kern_path); 3431EXPORT_SYMBOL(kern_path);
3592EXPORT_SYMBOL(vfs_path_lookup); 3432EXPORT_SYMBOL(vfs_path_lookup);
3593EXPORT_SYMBOL(inode_permission); 3433EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..7dba2ed03429 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
196#endif 196#endif
197} 197}
198 198
199struct vfsmount *alloc_vfsmnt(const char *name) 199static struct vfsmount *alloc_vfsmnt(const char *name)
200{ 200{
201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
202 if (mnt) { 202 if (mnt) {
@@ -466,15 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
466 br_write_unlock(vfsmount_lock); 466 br_write_unlock(vfsmount_lock);
467} 467}
468 468
469void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 469static void free_vfsmnt(struct vfsmount *mnt)
470{
471 mnt->mnt_sb = sb;
472 mnt->mnt_root = dget(sb->s_root);
473}
474
475EXPORT_SYMBOL(simple_set_mnt);
476
477void free_vfsmnt(struct vfsmount *mnt)
478{ 470{
479 kfree(mnt->mnt_devname); 471 kfree(mnt->mnt_devname);
480 mnt_free_id(mnt); 472 mnt_free_id(mnt);
@@ -678,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
678 return p; 670 return p;
679} 671}
680 672
673struct vfsmount *
674vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
675{
676 struct vfsmount *mnt;
677 struct dentry *root;
678
679 if (!type)
680 return ERR_PTR(-ENODEV);
681
682 mnt = alloc_vfsmnt(name);
683 if (!mnt)
684 return ERR_PTR(-ENOMEM);
685
686 if (flags & MS_KERNMOUNT)
687 mnt->mnt_flags = MNT_INTERNAL;
688
689 root = mount_fs(type, flags, name, data);
690 if (IS_ERR(root)) {
691 free_vfsmnt(mnt);
692 return ERR_CAST(root);
693 }
694
695 mnt->mnt_root = root;
696 mnt->mnt_sb = root->d_sb;
697 mnt->mnt_mountpoint = mnt->mnt_root;
698 mnt->mnt_parent = mnt;
699 return mnt;
700}
701EXPORT_SYMBOL_GPL(vfs_kern_mount);
702
681static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 703static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
682 int flag) 704 int flag)
683{ 705{
@@ -978,7 +1000,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
978 int err = 0; 1000 int err = 0;
979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1001 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
980 1002
981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1003 if (mnt->mnt_sb->s_op->show_devname) {
1004 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1005 if (err)
1006 goto out;
1007 } else {
1008 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1009 }
982 seq_putc(m, ' '); 1010 seq_putc(m, ' ');
983 seq_path(m, &mnt_path, " \t\n\\"); 1011 seq_path(m, &mnt_path, " \t\n\\");
984 seq_putc(m, ' '); 1012 seq_putc(m, ' ');
@@ -1002,6 +1030,18 @@ const struct seq_operations mounts_op = {
1002 .show = show_vfsmnt 1030 .show = show_vfsmnt
1003}; 1031};
1004 1032
1033static int uuid_is_nil(u8 *uuid)
1034{
1035 int i;
1036 u8 *cp = (u8 *)uuid;
1037
1038 for (i = 0; i < 16; i++) {
1039 if (*cp++)
1040 return 0;
1041 }
1042 return 1;
1043}
1044
1005static int show_mountinfo(struct seq_file *m, void *v) 1045static int show_mountinfo(struct seq_file *m, void *v)
1006{ 1046{
1007 struct proc_mounts *p = m->private; 1047 struct proc_mounts *p = m->private;
@@ -1013,7 +1053,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1013 1053
1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1054 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1055 MAJOR(sb->s_dev), MINOR(sb->s_dev));
1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1056 if (sb->s_op->show_path)
1057 err = sb->s_op->show_path(m, mnt);
1058 else
1059 seq_dentry(m, mnt->mnt_root, " \t\n\\");
1060 if (err)
1061 goto out;
1017 seq_putc(m, ' '); 1062 seq_putc(m, ' ');
1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1063 seq_path_root(m, &mnt_path, &root, " \t\n\\");
1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1064 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1040,11 +1085,20 @@ static int show_mountinfo(struct seq_file *m, void *v)
1040 if (IS_MNT_UNBINDABLE(mnt)) 1085 if (IS_MNT_UNBINDABLE(mnt))
1041 seq_puts(m, " unbindable"); 1086 seq_puts(m, " unbindable");
1042 1087
1088 if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
1089 /* print the uuid */
1090 seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
1091
1043 /* Filesystem specific data */ 1092 /* Filesystem specific data */
1044 seq_puts(m, " - "); 1093 seq_puts(m, " - ");
1045 show_type(m, sb); 1094 show_type(m, sb);
1046 seq_putc(m, ' '); 1095 seq_putc(m, ' ');
1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1096 if (sb->s_op->show_devname)
1097 err = sb->s_op->show_devname(m, mnt);
1098 else
1099 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1100 if (err)
1101 goto out;
1048 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 1102 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
1049 err = show_sb_opts(m, sb); 1103 err = show_sb_opts(m, sb);
1050 if (err) 1104 if (err)
@@ -1070,11 +1124,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
1070 int err = 0; 1124 int err = 0;
1071 1125
1072 /* device */ 1126 /* device */
1073 if (mnt->mnt_devname) { 1127 if (mnt->mnt_sb->s_op->show_devname) {
1074 seq_puts(m, "device "); 1128 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1075 mangle(m, mnt->mnt_devname); 1129 } else {
1076 } else 1130 if (mnt->mnt_devname) {
1077 seq_puts(m, "no device"); 1131 seq_puts(m, "device ");
1132 mangle(m, mnt->mnt_devname);
1133 } else
1134 seq_puts(m, "no device");
1135 }
1078 1136
1079 /* mount point */ 1137 /* mount point */
1080 seq_puts(m, " mounted on "); 1138 seq_puts(m, " mounted on ");
@@ -1088,7 +1146,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
1088 /* optional statistics */ 1146 /* optional statistics */
1089 if (mnt->mnt_sb->s_op->show_stats) { 1147 if (mnt->mnt_sb->s_op->show_stats) {
1090 seq_putc(m, ' '); 1148 seq_putc(m, ' ');
1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1149 if (!err)
1150 err = mnt->mnt_sb->s_op->show_stats(m, mnt);
1092 } 1151 }
1093 1152
1094 seq_putc(m, '\n'); 1153 seq_putc(m, '\n');
@@ -1244,7 +1303,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1244 */ 1303 */
1245 br_write_lock(vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) { 1305 if (mnt_get_count(mnt) != 2) {
1247 br_write_lock(vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1248 return -EBUSY; 1307 return -EBUSY;
1249 } 1308 }
1250 br_write_unlock(vfsmount_lock); 1309 br_write_unlock(vfsmount_lock);
@@ -1604,9 +1663,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1604 return err; 1663 return err;
1605} 1664}
1606 1665
1666static int lock_mount(struct path *path)
1667{
1668 struct vfsmount *mnt;
1669retry:
1670 mutex_lock(&path->dentry->d_inode->i_mutex);
1671 if (unlikely(cant_mount(path->dentry))) {
1672 mutex_unlock(&path->dentry->d_inode->i_mutex);
1673 return -ENOENT;
1674 }
1675 down_write(&namespace_sem);
1676 mnt = lookup_mnt(path);
1677 if (likely(!mnt))
1678 return 0;
1679 up_write(&namespace_sem);
1680 mutex_unlock(&path->dentry->d_inode->i_mutex);
1681 path_put(path);
1682 path->mnt = mnt;
1683 path->dentry = dget(mnt->mnt_root);
1684 goto retry;
1685}
1686
1687static void unlock_mount(struct path *path)
1688{
1689 up_write(&namespace_sem);
1690 mutex_unlock(&path->dentry->d_inode->i_mutex);
1691}
1692
1607static int graft_tree(struct vfsmount *mnt, struct path *path) 1693static int graft_tree(struct vfsmount *mnt, struct path *path)
1608{ 1694{
1609 int err;
1610 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1695 if (mnt->mnt_sb->s_flags & MS_NOUSER)
1611 return -EINVAL; 1696 return -EINVAL;
1612 1697
@@ -1614,16 +1699,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1614 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1699 S_ISDIR(mnt->mnt_root->d_inode->i_mode))
1615 return -ENOTDIR; 1700 return -ENOTDIR;
1616 1701
1617 err = -ENOENT; 1702 if (d_unlinked(path->dentry))
1618 mutex_lock(&path->dentry->d_inode->i_mutex); 1703 return -ENOENT;
1619 if (cant_mount(path->dentry))
1620 goto out_unlock;
1621 1704
1622 if (!d_unlinked(path->dentry)) 1705 return attach_recursive_mnt(mnt, path, NULL);
1623 err = attach_recursive_mnt(mnt, path, NULL);
1624out_unlock:
1625 mutex_unlock(&path->dentry->d_inode->i_mutex);
1626 return err;
1627} 1706}
1628 1707
1629/* 1708/*
@@ -1686,6 +1765,7 @@ static int do_change_type(struct path *path, int flag)
1686static int do_loopback(struct path *path, char *old_name, 1765static int do_loopback(struct path *path, char *old_name,
1687 int recurse) 1766 int recurse)
1688{ 1767{
1768 LIST_HEAD(umount_list);
1689 struct path old_path; 1769 struct path old_path;
1690 struct vfsmount *mnt = NULL; 1770 struct vfsmount *mnt = NULL;
1691 int err = mount_is_safe(path); 1771 int err = mount_is_safe(path);
@@ -1697,13 +1777,16 @@ static int do_loopback(struct path *path, char *old_name,
1697 if (err) 1777 if (err)
1698 return err; 1778 return err;
1699 1779
1700 down_write(&namespace_sem); 1780 err = lock_mount(path);
1781 if (err)
1782 goto out;
1783
1701 err = -EINVAL; 1784 err = -EINVAL;
1702 if (IS_MNT_UNBINDABLE(old_path.mnt)) 1785 if (IS_MNT_UNBINDABLE(old_path.mnt))
1703 goto out; 1786 goto out2;
1704 1787
1705 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1788 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1706 goto out; 1789 goto out2;
1707 1790
1708 err = -ENOMEM; 1791 err = -ENOMEM;
1709 if (recurse) 1792 if (recurse)
@@ -1712,20 +1795,18 @@ static int do_loopback(struct path *path, char *old_name,
1712 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); 1795 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
1713 1796
1714 if (!mnt) 1797 if (!mnt)
1715 goto out; 1798 goto out2;
1716 1799
1717 err = graft_tree(mnt, path); 1800 err = graft_tree(mnt, path);
1718 if (err) { 1801 if (err) {
1719 LIST_HEAD(umount_list);
1720
1721 br_write_lock(vfsmount_lock); 1802 br_write_lock(vfsmount_lock);
1722 umount_tree(mnt, 0, &umount_list); 1803 umount_tree(mnt, 0, &umount_list);
1723 br_write_unlock(vfsmount_lock); 1804 br_write_unlock(vfsmount_lock);
1724 release_mounts(&umount_list);
1725 } 1805 }
1726 1806out2:
1807 unlock_mount(path);
1808 release_mounts(&umount_list);
1727out: 1809out:
1728 up_write(&namespace_sem);
1729 path_put(&old_path); 1810 path_put(&old_path);
1730 return err; 1811 return err;
1731} 1812}
@@ -1767,6 +1848,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1767 if (path->dentry != path->mnt->mnt_root) 1848 if (path->dentry != path->mnt->mnt_root)
1768 return -EINVAL; 1849 return -EINVAL;
1769 1850
1851 err = security_sb_remount(sb, data);
1852 if (err)
1853 return err;
1854
1770 down_write(&sb->s_umount); 1855 down_write(&sb->s_umount);
1771 if (flags & MS_BIND) 1856 if (flags & MS_BIND)
1772 err = change_mount_flags(path->mnt, flags); 1857 err = change_mount_flags(path->mnt, flags);
@@ -1810,18 +1895,12 @@ static int do_move_mount(struct path *path, char *old_name)
1810 if (err) 1895 if (err)
1811 return err; 1896 return err;
1812 1897
1813 down_write(&namespace_sem); 1898 err = lock_mount(path);
1814 err = follow_down(path, true);
1815 if (err < 0) 1899 if (err < 0)
1816 goto out; 1900 goto out;
1817 1901
1818 err = -EINVAL; 1902 err = -EINVAL;
1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1903 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1820 goto out;
1821
1822 err = -ENOENT;
1823 mutex_lock(&path->dentry->d_inode->i_mutex);
1824 if (cant_mount(path->dentry))
1825 goto out1; 1904 goto out1;
1826 1905
1827 if (d_unlinked(path->dentry)) 1906 if (d_unlinked(path->dentry))
@@ -1863,16 +1942,87 @@ static int do_move_mount(struct path *path, char *old_name)
1863 * automatically */ 1942 * automatically */
1864 list_del_init(&old_path.mnt->mnt_expire); 1943 list_del_init(&old_path.mnt->mnt_expire);
1865out1: 1944out1:
1866 mutex_unlock(&path->dentry->d_inode->i_mutex); 1945 unlock_mount(path);
1867out: 1946out:
1868 up_write(&namespace_sem);
1869 if (!err) 1947 if (!err)
1870 path_put(&parent_path); 1948 path_put(&parent_path);
1871 path_put(&old_path); 1949 path_put(&old_path);
1872 return err; 1950 return err;
1873} 1951}
1874 1952
1875static int do_add_mount(struct vfsmount *, struct path *, int); 1953static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1954{
1955 int err;
1956 const char *subtype = strchr(fstype, '.');
1957 if (subtype) {
1958 subtype++;
1959 err = -EINVAL;
1960 if (!subtype[0])
1961 goto err;
1962 } else
1963 subtype = "";
1964
1965 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1966 err = -ENOMEM;
1967 if (!mnt->mnt_sb->s_subtype)
1968 goto err;
1969 return mnt;
1970
1971 err:
1972 mntput(mnt);
1973 return ERR_PTR(err);
1974}
1975
1976struct vfsmount *
1977do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1978{
1979 struct file_system_type *type = get_fs_type(fstype);
1980 struct vfsmount *mnt;
1981 if (!type)
1982 return ERR_PTR(-ENODEV);
1983 mnt = vfs_kern_mount(type, flags, name, data);
1984 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1985 !mnt->mnt_sb->s_subtype)
1986 mnt = fs_set_subtype(mnt, fstype);
1987 put_filesystem(type);
1988 return mnt;
1989}
1990EXPORT_SYMBOL_GPL(do_kern_mount);
1991
1992/*
1993 * add a mount into a namespace's mount tree
1994 */
1995static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1996{
1997 int err;
1998
1999 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
2000
2001 err = lock_mount(path);
2002 if (err)
2003 return err;
2004
2005 err = -EINVAL;
2006 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
2007 goto unlock;
2008
2009 /* Refuse the same filesystem on the same mount point */
2010 err = -EBUSY;
2011 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
2012 path->mnt->mnt_root == path->dentry)
2013 goto unlock;
2014
2015 err = -EINVAL;
2016 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
2017 goto unlock;
2018
2019 newmnt->mnt_flags = mnt_flags;
2020 err = graft_tree(newmnt, path);
2021
2022unlock:
2023 unlock_mount(path);
2024 return err;
2025}
1876 2026
1877/* 2027/*
1878 * create a new mount for userspace and request it to be added into the 2028 * create a new mount for userspace and request it to be added into the
@@ -1932,43 +2082,6 @@ fail:
1932 return err; 2082 return err;
1933} 2083}
1934 2084
1935/*
1936 * add a mount into a namespace's mount tree
1937 */
1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1939{
1940 int err;
1941
1942 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1943
1944 down_write(&namespace_sem);
1945 /* Something was mounted here while we slept */
1946 err = follow_down(path, true);
1947 if (err < 0)
1948 goto unlock;
1949
1950 err = -EINVAL;
1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1952 goto unlock;
1953
1954 /* Refuse the same filesystem on the same mount point */
1955 err = -EBUSY;
1956 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1957 path->mnt->mnt_root == path->dentry)
1958 goto unlock;
1959
1960 err = -EINVAL;
1961 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
1962 goto unlock;
1963
1964 newmnt->mnt_flags = mnt_flags;
1965 err = graft_tree(newmnt, path);
1966
1967unlock:
1968 up_write(&namespace_sem);
1969 return err;
1970}
1971
1972/** 2085/**
1973 * mnt_set_expiry - Put a mount on an expiration list 2086 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list. 2087 * @mnt: The mount to list.
@@ -2469,65 +2582,60 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2469 error = user_path_dir(new_root, &new); 2582 error = user_path_dir(new_root, &new);
2470 if (error) 2583 if (error)
2471 goto out0; 2584 goto out0;
2472 error = -EINVAL;
2473 if (!check_mnt(new.mnt))
2474 goto out1;
2475 2585
2476 error = user_path_dir(put_old, &old); 2586 error = user_path_dir(put_old, &old);
2477 if (error) 2587 if (error)
2478 goto out1; 2588 goto out1;
2479 2589
2480 error = security_sb_pivotroot(&old, &new); 2590 error = security_sb_pivotroot(&old, &new);
2481 if (error) { 2591 if (error)
2482 path_put(&old); 2592 goto out2;
2483 goto out1;
2484 }
2485 2593
2486 get_fs_root(current->fs, &root); 2594 get_fs_root(current->fs, &root);
2487 down_write(&namespace_sem); 2595 error = lock_mount(&old);
2488 mutex_lock(&old.dentry->d_inode->i_mutex); 2596 if (error)
2597 goto out3;
2598
2489 error = -EINVAL; 2599 error = -EINVAL;
2490 if (IS_MNT_SHARED(old.mnt) || 2600 if (IS_MNT_SHARED(old.mnt) ||
2491 IS_MNT_SHARED(new.mnt->mnt_parent) || 2601 IS_MNT_SHARED(new.mnt->mnt_parent) ||
2492 IS_MNT_SHARED(root.mnt->mnt_parent)) 2602 IS_MNT_SHARED(root.mnt->mnt_parent))
2493 goto out2; 2603 goto out4;
2494 if (!check_mnt(root.mnt)) 2604 if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
2495 goto out2; 2605 goto out4;
2496 error = -ENOENT; 2606 error = -ENOENT;
2497 if (cant_mount(old.dentry))
2498 goto out2;
2499 if (d_unlinked(new.dentry)) 2607 if (d_unlinked(new.dentry))
2500 goto out2; 2608 goto out4;
2501 if (d_unlinked(old.dentry)) 2609 if (d_unlinked(old.dentry))
2502 goto out2; 2610 goto out4;
2503 error = -EBUSY; 2611 error = -EBUSY;
2504 if (new.mnt == root.mnt || 2612 if (new.mnt == root.mnt ||
2505 old.mnt == root.mnt) 2613 old.mnt == root.mnt)
2506 goto out2; /* loop, on the same file system */ 2614 goto out4; /* loop, on the same file system */
2507 error = -EINVAL; 2615 error = -EINVAL;
2508 if (root.mnt->mnt_root != root.dentry) 2616 if (root.mnt->mnt_root != root.dentry)
2509 goto out2; /* not a mountpoint */ 2617 goto out4; /* not a mountpoint */
2510 if (root.mnt->mnt_parent == root.mnt) 2618 if (root.mnt->mnt_parent == root.mnt)
2511 goto out2; /* not attached */ 2619 goto out4; /* not attached */
2512 if (new.mnt->mnt_root != new.dentry) 2620 if (new.mnt->mnt_root != new.dentry)
2513 goto out2; /* not a mountpoint */ 2621 goto out4; /* not a mountpoint */
2514 if (new.mnt->mnt_parent == new.mnt) 2622 if (new.mnt->mnt_parent == new.mnt)
2515 goto out2; /* not attached */ 2623 goto out4; /* not attached */
2516 /* make sure we can reach put_old from new_root */ 2624 /* make sure we can reach put_old from new_root */
2517 tmp = old.mnt; 2625 tmp = old.mnt;
2518 br_write_lock(vfsmount_lock);
2519 if (tmp != new.mnt) { 2626 if (tmp != new.mnt) {
2520 for (;;) { 2627 for (;;) {
2521 if (tmp->mnt_parent == tmp) 2628 if (tmp->mnt_parent == tmp)
2522 goto out3; /* already mounted on put_old */ 2629 goto out4; /* already mounted on put_old */
2523 if (tmp->mnt_parent == new.mnt) 2630 if (tmp->mnt_parent == new.mnt)
2524 break; 2631 break;
2525 tmp = tmp->mnt_parent; 2632 tmp = tmp->mnt_parent;
2526 } 2633 }
2527 if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) 2634 if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
2528 goto out3; 2635 goto out4;
2529 } else if (!is_subdir(old.dentry, new.dentry)) 2636 } else if (!is_subdir(old.dentry, new.dentry))
2530 goto out3; 2637 goto out4;
2638 br_write_lock(vfsmount_lock);
2531 detach_mnt(new.mnt, &parent_path); 2639 detach_mnt(new.mnt, &parent_path);
2532 detach_mnt(root.mnt, &root_parent); 2640 detach_mnt(root.mnt, &root_parent);
2533 /* mount old root on put_old */ 2641 /* mount old root on put_old */
@@ -2537,22 +2645,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2537 touch_mnt_namespace(current->nsproxy->mnt_ns); 2645 touch_mnt_namespace(current->nsproxy->mnt_ns);
2538 br_write_unlock(vfsmount_lock); 2646 br_write_unlock(vfsmount_lock);
2539 chroot_fs_refs(&root, &new); 2647 chroot_fs_refs(&root, &new);
2540
2541 error = 0; 2648 error = 0;
2542 path_put(&root_parent); 2649out4:
2543 path_put(&parent_path); 2650 unlock_mount(&old);
2544out2: 2651 if (!error) {
2545 mutex_unlock(&old.dentry->d_inode->i_mutex); 2652 path_put(&root_parent);
2546 up_write(&namespace_sem); 2653 path_put(&parent_path);
2654 }
2655out3:
2547 path_put(&root); 2656 path_put(&root);
2657out2:
2548 path_put(&old); 2658 path_put(&old);
2549out1: 2659out1:
2550 path_put(&new); 2660 path_put(&new);
2551out0: 2661out0:
2552 return error; 2662 return error;
2553out3:
2554 br_write_unlock(vfsmount_lock);
2555 goto out2;
2556} 2663}
2557 2664
2558static void __init init_mount_tree(void) 2665static void __init init_mount_tree(void)
@@ -2594,7 +2701,7 @@ void __init mnt_init(void)
2594 if (!mount_hashtable) 2701 if (!mount_hashtable)
2595 panic("Failed to allocate mount hash table\n"); 2702 panic("Failed to allocate mount hash table\n");
2596 2703
2597 printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); 2704 printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
2598 2705
2599 for (u = 0; u < HASH_SIZE; u++) 2706 for (u = 0; u < HASH_SIZE; u++)
2600 INIT_LIST_HEAD(&mount_hashtable[u]); 2707 INIT_LIST_HEAD(&mount_hashtable[u]);
@@ -2627,3 +2734,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2627 kfree(ns); 2734 kfree(ns);
2628} 2735}
2629EXPORT_SYMBOL(put_mnt_ns); 2736EXPORT_SYMBOL(put_mnt_ns);
2737
2738struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
2739{
2740 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
2741}
2742EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index 68ea095100a8..c66af563f2ce 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -11,6 +11,6 @@ ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o
11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o 11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o
12 12
13# If you want debugging output, please uncomment the following line 13# If you want debugging output, please uncomment the following line
14# EXTRA_CFLAGS += -DDEBUG_NCP=1 14# ccflags-y := -DDEBUG_NCP=1
15 15
16CFLAGS_ncplib_kernel.o := -finline-functions 16CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 199016528fcb..e3d294269058 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -135,33 +135,6 @@ out_err:
135 135
136#if defined(CONFIG_NFS_V4_1) 136#if defined(CONFIG_NFS_V4_1)
137/* 137/*
138 * * CB_SEQUENCE operations will fail until the callback sessionid is set.
139 * */
140int nfs4_set_callback_sessionid(struct nfs_client *clp)
141{
142 struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
143 struct nfs4_sessionid *bc_sid;
144
145 if (!serv->sv_bc_xprt)
146 return -EINVAL;
147
148 /* on success freed in xprt_free */
149 bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
150 if (!bc_sid)
151 return -ENOMEM;
152 memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
153 NFS4_MAX_SESSIONID_LEN);
154 spin_lock_bh(&serv->sv_cb_lock);
155 serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
156 spin_unlock_bh(&serv->sv_cb_lock);
157 dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
158 ((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
159 ((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
160 serv->sv_bc_xprt);
161 return 0;
162}
163
164/*
165 * The callback service for NFSv4.1 callbacks 138 * The callback service for NFSv4.1 callbacks
166 */ 139 */
167static int 140static int
@@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
266 struct nfs_callback_data *cb_info) 239 struct nfs_callback_data *cb_info)
267{ 240{
268} 241}
269int nfs4_set_callback_sessionid(struct nfs_client *clp)
270{
271 return 0;
272}
273#endif /* CONFIG_NFS_V4_1 */ 242#endif /* CONFIG_NFS_V4_1 */
274 243
275/* 244/*
@@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion)
359 mutex_unlock(&nfs_callback_mutex); 328 mutex_unlock(&nfs_callback_mutex);
360} 329}
361 330
362static int check_gss_callback_principal(struct nfs_client *clp, 331/* Boolean check of RPC_AUTH_GSS principal */
363 struct svc_rqst *rqstp) 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
364{ 334{
365 struct rpc_clnt *r = clp->cl_rpcclient; 335 struct rpc_clnt *r = clp->cl_rpcclient;
366 char *p = svc_gss_principal(rqstp); 336 char *p = svc_gss_principal(rqstp);
367 337
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
339 return 1;
340
368 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ 341 /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
369 if (clp->cl_minorversion != 0) 342 if (clp->cl_minorversion != 0)
370 return SVC_DROP; 343 return 0;
371 /* 344 /*
372 * It might just be a normal user principal, in which case 345 * It might just be a normal user principal, in which case
373 * userspace won't bother to tell us the name at all. 346 * userspace won't bother to tell us the name at all.
374 */ 347 */
375 if (p == NULL) 348 if (p == NULL)
376 return SVC_DENIED; 349 return 0;
377 350
378 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 351 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
379 352
380 if (memcmp(p, "nfs@", 4) != 0) 353 if (memcmp(p, "nfs@", 4) != 0)
381 return SVC_DENIED; 354 return 0;
382 p += 4; 355 p += 4;
383 if (strcmp(p, r->cl_server) != 0) 356 if (strcmp(p, r->cl_server) != 0)
384 return SVC_DENIED; 357 return 0;
385 return SVC_OK; 358 return 1;
386} 359}
387 360
388/* pg_authenticate method helper */ 361/*
389static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp) 362 * pg_authenticate method for nfsv4 callback threads.
390{ 363 *
391 struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp); 364 * The authflavor has been negotiated, so an incorrect flavor is a server
392 int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0; 365 * bug. Drop packets with incorrect authflavor.
393 366 *
394 dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc); 367 * All other checking done after NFS decoding where the nfs_client can be
395 if (svc_is_backchannel(rqstp)) 368 * found in nfs4_callback_compound
396 /* Sessionid (usually) set after CB_NULL ping */ 369 */
397 return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
398 is_cb_compound);
399 else
400 /* No callback identifier in pg_authenticate */
401 return nfs4_find_client_no_ident(svc_addr(rqstp));
402}
403
404/* pg_authenticate method for nfsv4 callback threads. */
405static int nfs_callback_authenticate(struct svc_rqst *rqstp) 370static int nfs_callback_authenticate(struct svc_rqst *rqstp)
406{ 371{
407 struct nfs_client *clp;
408 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
409 int ret = SVC_OK;
410
411 /* Don't talk to strangers */
412 clp = nfs_cb_find_client(rqstp);
413 if (clp == NULL)
414 return SVC_DROP;
415
416 dprintk("%s: %s NFSv4 callback!\n", __func__,
417 svc_print_addr(rqstp, buf, sizeof(buf)));
418
419 switch (rqstp->rq_authop->flavour) { 372 switch (rqstp->rq_authop->flavour) {
420 case RPC_AUTH_NULL: 373 case RPC_AUTH_NULL:
421 if (rqstp->rq_proc != CB_NULL) 374 if (rqstp->rq_proc != CB_NULL)
422 ret = SVC_DENIED; 375 return SVC_DROP;
423 break; 376 break;
424 case RPC_AUTH_UNIX: 377 case RPC_AUTH_GSS:
425 break; 378 /* No RPC_AUTH_GSS support yet in NFSv4.1 */
426 case RPC_AUTH_GSS: 379 if (svc_is_backchannel(rqstp))
427 ret = check_gss_callback_principal(clp, rqstp); 380 return SVC_DROP;
428 break;
429 default:
430 ret = SVC_DENIED;
431 } 381 }
432 nfs_put_client(clp); 382 return SVC_OK;
433 return ret;
434} 383}
435 384
436/* 385/*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d3b44f9bd747..46d93ce7311b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
7 */ 7 */
8#ifndef __LINUX_FS_NFS_CALLBACK_H 8#ifndef __LINUX_FS_NFS_CALLBACK_H
9#define __LINUX_FS_NFS_CALLBACK_H 9#define __LINUX_FS_NFS_CALLBACK_H
10#include <linux/sunrpc/svc.h>
10 11
11#define NFS4_CALLBACK 0x40000000 12#define NFS4_CALLBACK 0x40000000
12#define NFS4_CALLBACK_XDRSIZE 2048 13#define NFS4_CALLBACK_XDRSIZE 2048
@@ -37,7 +38,6 @@ enum nfs4_callback_opnum {
37struct cb_process_state { 38struct cb_process_state {
38 __be32 drc_status; 39 __be32 drc_status;
39 struct nfs_client *clp; 40 struct nfs_client *clp;
40 struct nfs4_sessionid *svc_sid; /* v4.1 callback service sessionid */
41}; 41};
42 42
43struct cb_compound_hdr_arg { 43struct cb_compound_hdr_arg {
@@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall(
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170#endif /* CONFIG_NFS_V4_1 */ 170#endif /* CONFIG_NFS_V4_1 */
171 171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
173 struct cb_getattrres *res, 173 struct cb_getattrres *res,
174 struct cb_process_state *cps); 174 struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4bb91cb2620d..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
@@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
373{ 373{
374 struct nfs_client *clp; 374 struct nfs_client *clp;
375 int i; 375 int i;
376 __be32 status; 376 __be32 status = htonl(NFS4ERR_BADSESSION);
377 377
378 cps->clp = NULL; 378 cps->clp = NULL;
379 379
380 status = htonl(NFS4ERR_BADSESSION); 380 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
381 /* Incoming session must match the callback session */
382 if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
383 goto out;
384
385 clp = nfs4_find_client_sessionid(args->csa_addr,
386 &args->csa_sessionid, 1);
387 if (clp == NULL) 381 if (clp == NULL)
388 goto out; 382 goto out;
389 383
@@ -414,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
414 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 408 res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
415 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; 409 res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
416 nfs4_cb_take_slot(clp); 410 nfs4_cb_take_slot(clp);
417 cps->clp = clp; /* put in nfs4_callback_compound */
418 411
419out: 412out:
413 cps->clp = clp; /* put in nfs4_callback_compound */
420 for (i = 0; i < args->csa_nrclists; i++) 414 for (i = 0; i < args->csa_nrclists; i++)
421 kfree(args->csa_rclists[i].rcl_refcalls); 415 kfree(args->csa_rclists[i].rcl_refcalls);
422 kfree(args->csa_rclists); 416 kfree(args->csa_rclists);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 23112c263f81..14e0f9371d14 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
794 794
795 if (hdr_arg.minorversion == 0) { 795 if (hdr_arg.minorversion == 0) {
796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); 796 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
797 if (!cps.clp) 797 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
798 return rpc_drop_reply; 798 return rpc_drop_reply;
799 } else 799 }
800 cps.svc_sid = bc_xprt_sid(rqstp);
801 800
802 hdr_res.taglen = hdr_arg.taglen; 801 hdr_res.taglen = hdr_arg.taglen;
803 hdr_res.tag = hdr_arg.tag; 802 hdr_res.tag = hdr_arg.tag;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 192f2f860265..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1206,16 +1226,11 @@ nfs4_find_client_ident(int cb_ident)
1206 * For CB_COMPOUND calls, find a client by IP address, protocol version, 1226 * For CB_COMPOUND calls, find a client by IP address, protocol version,
1207 * minorversion, and sessionID 1227 * minorversion, and sessionID
1208 * 1228 *
1209 * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
1210 * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
1211 * can arrive before the callback sessionid is set. For CB_NULL calls,
1212 * find a client by IP address protocol version, and minorversion.
1213 *
1214 * Returns NULL if no such client 1229 * Returns NULL if no such client
1215 */ 1230 */
1216struct nfs_client * 1231struct nfs_client *
1217nfs4_find_client_sessionid(const struct sockaddr *addr, 1232nfs4_find_client_sessionid(const struct sockaddr *addr,
1218 struct nfs4_sessionid *sid, int is_cb_compound) 1233 struct nfs4_sessionid *sid)
1219{ 1234{
1220 struct nfs_client *clp; 1235 struct nfs_client *clp;
1221 1236
@@ -1227,9 +1242,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1227 if (!nfs4_has_session(clp)) 1242 if (!nfs4_has_session(clp))
1228 continue; 1243 continue;
1229 1244
1230 /* Match sessionid unless cb_null call*/ 1245 /* Match sessionid*/
1231 if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data, 1246 if (memcmp(clp->cl_session->sess_id.data,
1232 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)) 1247 sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
1233 continue; 1248 continue;
1234 1249
1235 atomic_inc(&clp->cl_count); 1250 atomic_inc(&clp->cl_count);
@@ -1244,7 +1259,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1244 1259
1245struct nfs_client * 1260struct nfs_client *
1246nfs4_find_client_sessionid(const struct sockaddr *addr, 1261nfs4_find_client_sessionid(const struct sockaddr *addr,
1247 struct nfs4_sessionid *sid, int is_cb_compound) 1262 struct nfs4_sessionid *sid)
1248{ 1263{
1249 return NULL; 1264 return NULL;
1250} 1265}
@@ -1312,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1312/* 1327/*
1313 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1314 */ 1329 */
1315static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1316 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1317 const char *ip_addr, 1332 const char *ip_addr,
1318 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1319 int flags) 1334 int noresvport)
1320{ 1335{
1321 int error; 1336 int error;
1322 1337
@@ -1330,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1330 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1331 1346
1332 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1333 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1334 if (error < 0) 1349 if (error < 0)
1335 goto error; 1350 goto error;
1336 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1383,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1383 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1384 1399
1385 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1386 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1387 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1388 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1389 goto error; 1405 goto error;
1390 } 1406 }
1391 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1392 server->flags); 1408 /*
1393 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1394 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1395 1416
1396 server->nfs_client = clp; 1417 server->nfs_client = clp;
1397 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1398 return 0; 1419 return 0;
1399
1400error_put:
1401 nfs_put_client(clp);
1402error: 1420error:
1403 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1404 return error; 1422 return error;
1405} 1423}
1406 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
1455 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1407 1466
1408/* 1467/*
1409 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1440,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1440 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1441 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1442 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1443 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1444 if (fattr == NULL) 1507 if (fattr == NULL)
1445 return -ENOMEM; 1508 return -ENOMEM;
@@ -1509,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1509 if (error < 0) 1572 if (error < 0)
1510 goto error; 1573 goto error;
1511 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1512 if (data->rsize) 1582 if (data->rsize)
1513 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1514 if (data->wsize) 1584 if (data->wsize)
@@ -1926,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1926} 1996}
1927 1997
1928#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 364e4328f392..bbbc6bf5cb2e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -23,8 +23,6 @@
23 23
24static void nfs_do_free_delegation(struct nfs_delegation *delegation) 24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{ 25{
26 if (delegation->cred)
27 put_rpccred(delegation->cred);
28 kfree(delegation); 26 kfree(delegation);
29} 27}
30 28
@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
37 35
38static void nfs_free_delegation(struct nfs_delegation *delegation) 36static void nfs_free_delegation(struct nfs_delegation *delegation)
39{ 37{
38 if (delegation->cred) {
39 put_rpccred(delegation->cred);
40 delegation->cred = NULL;
41 }
40 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 42 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
41} 43}
42 44
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904d..abdf38d5971d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1169,11 +1169,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1169 iput(inode); 1169 iput(inode);
1170} 1170}
1171 1171
1172static void nfs_d_release(struct dentry *dentry)
1173{
1174 /* free cached devname value, if it survived that far */
1175 if (unlikely(dentry->d_fsdata)) {
1176 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1177 WARN_ON(1);
1178 else
1179 kfree(dentry->d_fsdata);
1180 }
1181}
1182
1172const struct dentry_operations nfs_dentry_operations = { 1183const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1184 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1185 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1186 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount, 1187 .d_automount = nfs_d_automount,
1188 .d_release = nfs_d_release,
1177}; 1189};
1178 1190
1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1191static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1248,6 +1260,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_delete = nfs_dentry_delete, 1260 .d_delete = nfs_dentry_delete,
1249 .d_iput = nfs_dentry_iput, 1261 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount, 1262 .d_automount = nfs_d_automount,
1263 .d_release = nfs_d_release,
1251}; 1264};
1252 1265
1253/* 1266/*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d93c71..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -407,15 +408,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
407 pos += vec->iov_len; 408 pos += vec->iov_len;
408 } 409 }
409 410
411 /*
412 * If no bytes were started, return the error, and let the
413 * generic layer handle the completion.
414 */
415 if (requested_bytes == 0) {
416 nfs_direct_req_release(dreq);
417 return result < 0 ? result : -EIO;
418 }
419
410 if (put_dreq(dreq)) 420 if (put_dreq(dreq))
411 nfs_direct_complete(dreq); 421 nfs_direct_complete(dreq);
412 422 return 0;
413 if (requested_bytes != 0)
414 return 0;
415
416 if (result < 0)
417 return result;
418 return -EIO;
419} 423}
420 424
421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 425static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -646,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
646{ 650{
647 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
648 652
649 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
650 return;
651} 654}
652 655
653/* 656/*
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
841 pos += vec->iov_len; 844 pos += vec->iov_len;
842 } 845 }
843 846
847 /*
848 * If no bytes were started, return the error, and let the
849 * generic layer handle the completion.
850 */
851 if (requested_bytes == 0) {
852 nfs_direct_req_release(dreq);
853 return result < 0 ? result : -EIO;
854 }
855
844 if (put_dreq(dreq)) 856 if (put_dreq(dreq))
845 nfs_direct_write_complete(dreq, dreq->inode); 857 nfs_direct_write_complete(dreq, dreq->inode);
846 858 return 0;
847 if (requested_bytes != 0)
848 return 0;
849
850 if (result < 0)
851 return result;
852 return -EIO;
853} 859}
854 860
855static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, 861static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
@@ -932,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
932 if (retval) 938 if (retval)
933 goto out; 939 goto out;
934 940
941 task_io_account_read(count);
942
935 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
936 if (retval > 0) 944 if (retval > 0)
937 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -993,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
993 if (retval) 1001 if (retval)
994 goto out; 1002 goto out;
995 1003
1004 task_io_account_write(count);
1005
996 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
997 1007
998 if (retval > 0) 1008 if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..d85a534b15cd 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 387 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 388 mapping->host->i_ino, len, (long long) pos);
389 389
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 390start:
395 /* 391 /*
396 * Prevent starvation issues if someone is doing a consistency 392 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291f..1084792bc0fe 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
75/* 75/*
76 * get an NFS2/NFS3 root dentry from the root filehandle 76 * get an NFS2/NFS3 root dentry from the root filehandle
77 */ 77 */
78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) 78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
79 const char *devname)
79{ 80{
80 struct nfs_server *server = NFS_SB(sb); 81 struct nfs_server *server = NFS_SB(sb);
81 struct nfs_fsinfo fsinfo; 82 struct nfs_fsinfo fsinfo;
82 struct dentry *ret; 83 struct dentry *ret;
83 struct inode *inode; 84 struct inode *inode;
85 void *name = kstrdup(devname, GFP_KERNEL);
84 int error; 86 int error;
85 87
88 if (!name)
89 return ERR_PTR(-ENOMEM);
90
86 /* get the actual root for this mount */ 91 /* get the actual root for this mount */
87 fsinfo.fattr = nfs_alloc_fattr(); 92 fsinfo.fattr = nfs_alloc_fattr();
88 if (fsinfo.fattr == NULL) 93 if (fsinfo.fattr == NULL) {
94 kfree(name);
89 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 }
90 97
91 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 98 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
92 if (error < 0) { 99 if (error < 0) {
@@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 126 }
120 127
121 security_d_instantiate(ret, inode); 128 security_d_instantiate(ret, inode);
129 spin_lock(&ret->d_lock);
130 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
131 ret->d_fsdata = name;
132 name = NULL;
133 }
134 spin_unlock(&ret->d_lock);
122out: 135out:
136 if (name)
137 kfree(name);
123 nfs_free_fattr(fsinfo.fattr); 138 nfs_free_fattr(fsinfo.fattr);
124 return ret; 139 return ret;
125} 140}
@@ -169,27 +184,35 @@ out:
169/* 184/*
170 * get an NFS4 root dentry from the root filehandle 185 * get an NFS4 root dentry from the root filehandle
171 */ 186 */
172struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 187struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
188 const char *devname)
173{ 189{
174 struct nfs_server *server = NFS_SB(sb); 190 struct nfs_server *server = NFS_SB(sb);
175 struct nfs_fattr *fattr = NULL; 191 struct nfs_fattr *fattr = NULL;
176 struct dentry *ret; 192 struct dentry *ret;
177 struct inode *inode; 193 struct inode *inode;
194 void *name = kstrdup(devname, GFP_KERNEL);
178 int error; 195 int error;
179 196
180 dprintk("--> nfs4_get_root()\n"); 197 dprintk("--> nfs4_get_root()\n");
181 198
199 if (!name)
200 return ERR_PTR(-ENOMEM);
201
182 /* get the info about the server and filesystem */ 202 /* get the info about the server and filesystem */
183 error = nfs4_server_capabilities(server, mntfh); 203 error = nfs4_server_capabilities(server, mntfh);
184 if (error < 0) { 204 if (error < 0) {
185 dprintk("nfs_get_root: getcaps error = %d\n", 205 dprintk("nfs_get_root: getcaps error = %d\n",
186 -error); 206 -error);
207 kfree(name);
187 return ERR_PTR(error); 208 return ERR_PTR(error);
188 } 209 }
189 210
190 fattr = nfs_alloc_fattr(); 211 fattr = nfs_alloc_fattr();
191 if (fattr == NULL) 212 if (fattr == NULL) {
192 return ERR_PTR(-ENOMEM);; 213 kfree(name);
214 return ERR_PTR(-ENOMEM);
215 }
193 216
194 /* get the actual root for this mount */ 217 /* get the actual root for this mount */
195 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); 218 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -223,8 +246,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
223 } 246 }
224 247
225 security_d_instantiate(ret, inode); 248 security_d_instantiate(ret, inode);
226 249 spin_lock(&ret->d_lock);
250 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
251 ret->d_fsdata = name;
252 name = NULL;
253 }
254 spin_unlock(&ret->d_lock);
227out: 255out:
256 if (name)
257 kfree(name);
228 nfs_free_fattr(fattr); 258 nfs_free_fattr(fattr);
229 dprintk("<-- nfs4_get_root()\n"); 259 dprintk("<-- nfs4_get_root()\n");
230 return ret; 260 return ret;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d8512423ba72..01768e5e2c9b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
89 */ 90 */
90u64 nfs_compat_user_ino64(u64 fileid) 91u64 nfs_compat_user_ino64(u64 fileid)
91{ 92{
92 int ino; 93#ifdef CONFIG_COMPAT
94 compat_ulong_t ino;
95#else
96 unsigned long ino;
97#endif
93 98
94 if (enable_ino64) 99 if (enable_ino64)
95 return fileid; 100 return fileid;
@@ -881,9 +886,10 @@ out:
881 return ret; 886 return ret;
882} 887}
883 888
884static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 889static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
885{ 890{
886 struct nfs_inode *nfsi = NFS_I(inode); 891 struct nfs_inode *nfsi = NFS_I(inode);
892 unsigned long ret = 0;
887 893
888 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) 894 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
889 && (fattr->valid & NFS_ATTR_FATTR_CHANGE) 895 && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -891,25 +897,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
891 nfsi->change_attr = fattr->change_attr; 897 nfsi->change_attr = fattr->change_attr;
892 if (S_ISDIR(inode->i_mode)) 898 if (S_ISDIR(inode->i_mode))
893 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 899 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
900 ret |= NFS_INO_INVALID_ATTR;
894 } 901 }
895 /* If we have atomic WCC data, we may update some attributes */ 902 /* If we have atomic WCC data, we may update some attributes */
896 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) 903 if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
897 && (fattr->valid & NFS_ATTR_FATTR_CTIME) 904 && (fattr->valid & NFS_ATTR_FATTR_CTIME)
898 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 905 && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
899 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 906 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
907 ret |= NFS_INO_INVALID_ATTR;
908 }
900 909
901 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) 910 if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
902 && (fattr->valid & NFS_ATTR_FATTR_MTIME) 911 && (fattr->valid & NFS_ATTR_FATTR_MTIME)
903 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 912 && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
904 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 913 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
905 if (S_ISDIR(inode->i_mode)) 914 if (S_ISDIR(inode->i_mode))
906 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 915 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
916 ret |= NFS_INO_INVALID_ATTR;
907 } 917 }
908 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 918 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
909 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 919 && (fattr->valid & NFS_ATTR_FATTR_SIZE)
910 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 920 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
911 && nfsi->npages == 0) 921 && nfsi->npages == 0) {
912 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 922 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
923 ret |= NFS_INO_INVALID_ATTR;
924 }
925 return ret;
913} 926}
914 927
915/** 928/**
@@ -1223,7 +1236,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1223 | NFS_INO_REVAL_PAGECACHE); 1236 | NFS_INO_REVAL_PAGECACHE);
1224 1237
1225 /* Do atomic weak cache consistency updates */ 1238 /* Do atomic weak cache consistency updates */
1226 nfs_wcc_update_inode(inode, fattr); 1239 invalid |= nfs_wcc_update_inode(inode, fattr);
1227 1240
1228 /* More cache consistency checks */ 1241 /* More cache consistency checks */
1229 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1242 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
@@ -1505,7 +1518,7 @@ static int nfsiod_start(void)
1505{ 1518{
1506 struct workqueue_struct *wq; 1519 struct workqueue_struct *wq;
1507 dprintk("RPC: creating workqueue nfsiod\n"); 1520 dprintk("RPC: creating workqueue nfsiod\n");
1508 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); 1521 wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
1509 if (wq == NULL) 1522 if (wq == NULL)
1510 return -ENOMEM; 1523 return -ENOMEM;
1511 nfsiod_workqueue = wq; 1524 nfsiod_workqueue = wq;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4644f04b4b46..72e0bddf7a2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *);
133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); 133extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
134extern struct nfs_client *nfs4_find_client_ident(int); 134extern struct nfs_client *nfs4_find_client_ident(int);
135extern struct nfs_client * 135extern struct nfs_client *
136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *, 136nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
137 int);
138extern struct nfs_server *nfs_create_server( 137extern struct nfs_server *nfs_create_server(
139 const struct nfs_parsed_mount_data *, 138 const struct nfs_parsed_mount_data *,
140 struct nfs_fh *); 139 struct nfs_fh *);
@@ -149,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
149 struct nfs_fattr *); 148 struct nfs_fattr *);
150extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 149extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
151extern int nfs4_check_client_ready(struct nfs_client *clp); 150extern int nfs4_check_client_ready(struct nfs_client *clp);
151extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
152 const struct sockaddr *ds_addr,
153 int ds_addrlen, int ds_proto);
152#ifdef CONFIG_PROC_FS 154#ifdef CONFIG_PROC_FS
153extern int __init nfs_fs_proc_init(void); 155extern int __init nfs_fs_proc_init(void);
154extern void nfs_fs_proc_exit(void); 156extern void nfs_fs_proc_exit(void);
@@ -164,10 +166,10 @@ static inline void nfs_fs_proc_exit(void)
164 166
165/* nfs4namespace.c */ 167/* nfs4namespace.c */
166#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
167extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 169extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
168#else 170#else
169static inline 171static inline
170struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 172struct vfsmount *nfs_do_refmount(struct dentry *dentry)
171{ 173{
172 return ERR_PTR(-ENOENT); 174 return ERR_PTR(-ENOENT);
173} 175}
@@ -214,8 +216,14 @@ extern const u32 nfs41_maxwrite_overhead;
214extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
215#endif 217#endif
216 218
219extern int nfs4_init_ds_session(struct nfs_client *clp);
220
217/* proc.c */ 221/* proc.c */
218void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 222void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
223extern int nfs_init_client(struct nfs_client *clp,
224 const struct rpc_timeout *timeparms,
225 const char *ip_addr, rpc_authflavor_t authflavour,
226 int noresvport);
219 227
220/* dir.c */ 228/* dir.c */
221extern int nfs_access_cache_shrinker(struct shrinker *shrink, 229extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -248,24 +256,30 @@ extern void nfs_sb_active(struct super_block *sb);
248extern void nfs_sb_deactive(struct super_block *sb); 256extern void nfs_sb_deactive(struct super_block *sb);
249 257
250/* namespace.c */ 258/* namespace.c */
251extern char *nfs_path(const char *base, 259extern char *nfs_path(char **p, struct dentry *dentry,
252 const struct dentry *droot,
253 const struct dentry *dentry,
254 char *buffer, ssize_t buflen); 260 char *buffer, ssize_t buflen);
255extern struct vfsmount *nfs_d_automount(struct path *path); 261extern struct vfsmount *nfs_d_automount(struct path *path);
256 262
257/* getroot.c */ 263/* getroot.c */
258extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 264extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
265 const char *);
259#ifdef CONFIG_NFS_V4 266#ifdef CONFIG_NFS_V4
260extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 267extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
268 const char *);
261 269
262extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 270extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
263#endif 271#endif
264 272
265/* read.c */ 273/* read.c */
274extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
275 const struct rpc_call_ops *call_ops);
266extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 276extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
267 277
268/* write.c */ 278/* write.c */
279extern int nfs_initiate_write(struct nfs_write_data *data,
280 struct rpc_clnt *clnt,
281 const struct rpc_call_ops *call_ops,
282 int how);
269extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
270#ifdef CONFIG_MIGRATION 284#ifdef CONFIG_MIGRATION
271extern int nfs_migrate_page(struct address_space *, 285extern int nfs_migrate_page(struct address_space *,
@@ -275,6 +289,13 @@ extern int nfs_migrate_page(struct address_space *,
275#endif 289#endif
276 290
277/* nfs4proc.c */ 291/* nfs4proc.c */
292extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
293extern int nfs4_init_client(struct nfs_client *clp,
294 const struct rpc_timeout *timeparms,
295 const char *ip_addr,
296 rpc_authflavor_t authflavour,
297 int noresvport);
298extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
278extern int _nfs4_call_sync(struct nfs_server *server, 299extern int _nfs4_call_sync(struct nfs_server *server,
279 struct rpc_message *msg, 300 struct rpc_message *msg,
280 struct nfs4_sequence_args *args, 301 struct nfs4_sequence_args *args,
@@ -289,12 +310,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
289/* 310/*
290 * Determine the device name as a string 311 * Determine the device name as a string
291 */ 312 */
292static inline char *nfs_devname(const struct vfsmount *mnt_parent, 313static inline char *nfs_devname(struct dentry *dentry,
293 const struct dentry *dentry,
294 char *buffer, ssize_t buflen) 314 char *buffer, ssize_t buflen)
295{ 315{
296 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, 316 char *dummy;
297 dentry, buffer, buflen); 317 return nfs_path(&dummy, dentry, buffer, buflen);
298} 318}
299 319
300/* 320/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca8..bf1c68009ffd 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -25,33 +25,30 @@ static LIST_HEAD(nfs_automount_list);
25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); 25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 26int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 27
28static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 28static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 const struct dentry *dentry,
30 struct nfs_fh *fh, 29 struct nfs_fh *fh,
31 struct nfs_fattr *fattr); 30 struct nfs_fattr *fattr);
32 31
33/* 32/*
34 * nfs_path - reconstruct the path given an arbitrary dentry 33 * nfs_path - reconstruct the path given an arbitrary dentry
35 * @base - arbitrary string to prepend to the path 34 * @base - used to return pointer to the end of devname part of path
36 * @droot - pointer to root dentry for mountpoint
37 * @dentry - pointer to dentry 35 * @dentry - pointer to dentry
38 * @buffer - result buffer 36 * @buffer - result buffer
39 * @buflen - length of buffer 37 * @buflen - length of buffer
40 * 38 *
41 * Helper function for constructing the path from the 39 * Helper function for constructing the server pathname
42 * root dentry to an arbitrary hashed dentry. 40 * by arbitrary hashed dentry.
43 * 41 *
44 * This is mainly for use in figuring out the path on the 42 * This is mainly for use in figuring out the path on the
45 * server side when automounting on top of an existing partition. 43 * server side when automounting on top of an existing partition
44 * and in generating /proc/mounts and friends.
46 */ 45 */
47char *nfs_path(const char *base, 46char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
48 const struct dentry *droot,
49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen)
51{ 47{
52 char *end; 48 char *end;
53 int namelen; 49 int namelen;
54 unsigned seq; 50 unsigned seq;
51 const char *base;
55 52
56rename_retry: 53rename_retry:
57 end = buffer+buflen; 54 end = buffer+buflen;
@@ -60,7 +57,10 @@ rename_retry:
60 57
61 seq = read_seqbegin(&rename_lock); 58 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock(); 59 rcu_read_lock();
63 while (!IS_ROOT(dentry) && dentry != droot) { 60 while (1) {
61 spin_lock(&dentry->d_lock);
62 if (IS_ROOT(dentry))
63 break;
64 namelen = dentry->d_name.len; 64 namelen = dentry->d_name.len;
65 buflen -= namelen + 1; 65 buflen -= namelen + 1;
66 if (buflen < 0) 66 if (buflen < 0)
@@ -68,27 +68,47 @@ rename_retry:
68 end -= namelen; 68 end -= namelen;
69 memcpy(end, dentry->d_name.name, namelen); 69 memcpy(end, dentry->d_name.name, namelen);
70 *--end = '/'; 70 *--end = '/';
71 spin_unlock(&dentry->d_lock);
71 dentry = dentry->d_parent; 72 dentry = dentry->d_parent;
72 } 73 }
73 rcu_read_unlock(); 74 if (read_seqretry(&rename_lock, seq)) {
74 if (read_seqretry(&rename_lock, seq)) 75 spin_unlock(&dentry->d_lock);
76 rcu_read_unlock();
75 goto rename_retry; 77 goto rename_retry;
78 }
76 if (*end != '/') { 79 if (*end != '/') {
77 if (--buflen < 0) 80 if (--buflen < 0) {
81 spin_unlock(&dentry->d_lock);
82 rcu_read_unlock();
78 goto Elong; 83 goto Elong;
84 }
79 *--end = '/'; 85 *--end = '/';
80 } 86 }
87 *p = end;
88 base = dentry->d_fsdata;
89 if (!base) {
90 spin_unlock(&dentry->d_lock);
91 rcu_read_unlock();
92 WARN_ON(1);
93 return end;
94 }
81 namelen = strlen(base); 95 namelen = strlen(base);
82 /* Strip off excess slashes in base string */ 96 /* Strip off excess slashes in base string */
83 while (namelen > 0 && base[namelen - 1] == '/') 97 while (namelen > 0 && base[namelen - 1] == '/')
84 namelen--; 98 namelen--;
85 buflen -= namelen; 99 buflen -= namelen;
86 if (buflen < 0) 100 if (buflen < 0) {
101 spin_unlock(&dentry->d_lock);
102 rcu_read_unlock();
87 goto Elong; 103 goto Elong;
104 }
88 end -= namelen; 105 end -= namelen;
89 memcpy(end, base, namelen); 106 memcpy(end, base, namelen);
107 spin_unlock(&dentry->d_lock);
108 rcu_read_unlock();
90 return end; 109 return end;
91Elong_unlock: 110Elong_unlock:
111 spin_unlock(&dentry->d_lock);
92 rcu_read_unlock(); 112 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq)) 113 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry; 114 goto rename_retry;
@@ -143,9 +163,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
143 } 163 }
144 164
145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 165 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
146 mnt = nfs_do_refmount(path->mnt, path->dentry); 166 mnt = nfs_do_refmount(path->dentry);
147 else 167 else
148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); 168 mnt = nfs_do_submount(path->dentry, fh, fattr);
149 if (IS_ERR(mnt)) 169 if (IS_ERR(mnt))
150 goto out; 170 goto out;
151 171
@@ -209,19 +229,17 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
209 229
210/** 230/**
211 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary 231 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
212 * @mnt_parent - mountpoint of parent directory
213 * @dentry - parent directory 232 * @dentry - parent directory
214 * @fh - filehandle for new root dentry 233 * @fh - filehandle for new root dentry
215 * @fattr - attributes for new root inode 234 * @fattr - attributes for new root inode
216 * 235 *
217 */ 236 */
218static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 237static struct vfsmount *nfs_do_submount(struct dentry *dentry,
219 const struct dentry *dentry,
220 struct nfs_fh *fh, 238 struct nfs_fh *fh,
221 struct nfs_fattr *fattr) 239 struct nfs_fattr *fattr)
222{ 240{
223 struct nfs_clone_mount mountdata = { 241 struct nfs_clone_mount mountdata = {
224 .sb = mnt_parent->mnt_sb, 242 .sb = dentry->d_sb,
225 .dentry = dentry, 243 .dentry = dentry,
226 .fh = fh, 244 .fh = fh,
227 .fattr = fattr, 245 .fattr = fattr,
@@ -237,11 +255,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
237 dentry->d_name.name); 255 dentry->d_name.name);
238 if (page == NULL) 256 if (page == NULL)
239 goto out; 257 goto out;
240 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 258 devname = nfs_devname(dentry, page, PAGE_SIZE);
241 mnt = (struct vfsmount *)devname; 259 mnt = (struct vfsmount *)devname;
242 if (IS_ERR(devname)) 260 if (IS_ERR(devname))
243 goto free_page; 261 goto free_page;
244 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); 262 mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
245free_page: 263free_page:
246 free_page((unsigned long)page); 264 free_page((unsigned long)page);
247out: 265out:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e2..274342771655 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
311 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 311 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
312 goto out; 312 goto out;
313 313
314 /* We are doing this here, because XDR marshalling can only 314 /* We are doing this here because XDR marshalling does not
315 return -ENOMEM. */ 315 * return any results, it BUGs. */
316 status = -ENOSPC; 316 status = -ENOSPC;
317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) 317 if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
318 goto out; 318 goto out;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..d0c80d8b3f96 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b1941d..183c6b123d0f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1328 1328
1329 encode_nfs_fh3(xdr, NFS_FH(args->inode)); 1329 encode_nfs_fh3(xdr, NFS_FH(args->inode));
1330 encode_uint32(xdr, args->mask); 1330 encode_uint32(xdr, args->mask);
1331
1332 base = req->rq_slen;
1331 if (args->npages != 0) 1333 if (args->npages != 0)
1332 xdr_write_pages(xdr, args->pages, 0, args->len); 1334 xdr_write_pages(xdr, args->pages, 0, args->len);
1335 else
1336 xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
1333 1337
1334 base = req->rq_slen;
1335 error = nfsacl_encode(xdr->buf, base, args->inode, 1338 error = nfsacl_encode(xdr->buf, base, args->inode,
1336 (args->mask & NFS_ACL) ? 1339 (args->mask & NFS_ACL) ?
1337 args->acl_access : NULL, 1, 0); 1340 args->acl_access : NULL, 1, 0);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..c64be1cff080 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 252extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 254 int cache_reply, struct rpc_task *task);
255extern int nfs41_setup_sequence(struct nfs4_session *session,
256 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
257 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 258extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 259extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 260extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +262,19 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 262extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 263extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 264 struct nfs_fsinfo *fsinfo);
265
266static inline bool
267is_ds_only_client(struct nfs_client *clp)
268{
269 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
270 EXCHGID4_FLAG_USE_PNFS_DS;
271}
272
273static inline bool
274is_ds_client(struct nfs_client *clp)
275{
276 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
277}
262#else /* CONFIG_NFS_v4_1 */ 278#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 279static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 280{
@@ -276,6 +292,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 292{
277 return 0; 293 return 0;
278} 294}
295
296static inline bool
297is_ds_only_client(struct nfs_client *clp)
298{
299 return false;
300}
301
302static inline bool
303is_ds_client(struct nfs_client *clp)
304{
305 return false;
306}
279#endif /* CONFIG_NFS_V4_1 */ 307#endif /* CONFIG_NFS_V4_1 */
280 308
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 309extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -298,6 +326,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
298#if defined(CONFIG_NFS_V4_1) 326#if defined(CONFIG_NFS_V4_1)
299struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 327struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
300struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 328struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
329extern void nfs4_schedule_session_recovery(struct nfs4_session *);
330#else
331static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
332{
333}
301#endif /* CONFIG_NFS_V4_1 */ 334#endif /* CONFIG_NFS_V4_1 */
302 335
303extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 336extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +340,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
307extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 340extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
308extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 341extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
309extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 342extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
310extern void nfs4_schedule_state_recovery(struct nfs_client *); 343extern void nfs4_schedule_lease_recovery(struct nfs_client *);
311extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
312extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 345extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
313extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
314extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 346extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
315extern void nfs41_handle_recall_slot(struct nfs_client *clp); 347extern void nfs41_handle_recall_slot(struct nfs_client *clp);
316extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 348extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..428558464817 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,309 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45static loff_t
46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 loff_t offset)
45{ 48{
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
47 nfs4_fl_free_deviceid_callback); 50 u64 tmp;
48 if (status) { 51
49 printk(KERN_WARNING "%s: deviceid cache could not be " 52 offset -= flseg->pattern_offset;
50 "initialized\n", __func__); 53 tmp = offset;
51 return status; 54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
52 } 74 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 75
54 __func__); 76 BUG();
77}
78
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
151 }
152
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * Call ops for the async read/write cases
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * In the case of dense layouts, the offset needs to be reset to its
159 * original value.
160 */
161static void filelayout_read_prepare(struct rpc_task *task, void *data)
61{ 162{
62 dprintk("--> %s\n", __func__); 163 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
164
165 rdata->read_done_cb = filelayout_read_done_cb;
166
167 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
168 &rdata->args.seq_args, &rdata->res.seq_res,
169 0, task))
170 return;
171
172 rpc_call_start(task);
173}
174
175static void filelayout_read_call_done(struct rpc_task *task, void *data)
176{
177 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
178
179 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
180
181 /* Note this may cause RPC to be resent */
182 rdata->mds_ops->rpc_call_done(task, data);
183}
184
185static void filelayout_read_release(void *data)
186{
187 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
188
189 rdata->mds_ops->rpc_release(data);
190}
191
192static int filelayout_write_done_cb(struct rpc_task *task,
193 struct nfs_write_data *data)
194{
195 int reset = 0;
196
197 if (filelayout_async_handle_error(task, data->args.context->state,
198 data->ds_clp, &reset) == -EAGAIN) {
199 struct nfs_client *clp;
200
201 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
202 __func__, data->ds_clp, data->ds_clp->cl_session);
203 if (reset) {
204 filelayout_set_lo_fail(data->lseg);
205 nfs4_reset_write(task, data);
206 clp = NFS_SERVER(data->inode)->nfs_client;
207 } else
208 clp = data->ds_clp;
209 nfs_restart_rpc(task, clp);
210 return -EAGAIN;
211 }
63 212
64 if (nfss->nfs_client->cl_devid_cache)
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 213 return 0;
67} 214}
68 215
216static void filelayout_write_prepare(struct rpc_task *task, void *data)
217{
218 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
219
220 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
221 &wdata->args.seq_args, &wdata->res.seq_res,
222 0, task))
223 return;
224
225 rpc_call_start(task);
226}
227
228static void filelayout_write_call_done(struct rpc_task *task, void *data)
229{
230 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
231
232 /* Note this may cause RPC to be resent */
233 wdata->mds_ops->rpc_call_done(task, data);
234}
235
236static void filelayout_write_release(void *data)
237{
238 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
239
240 wdata->mds_ops->rpc_release(data);
241}
242
243struct rpc_call_ops filelayout_read_call_ops = {
244 .rpc_call_prepare = filelayout_read_prepare,
245 .rpc_call_done = filelayout_read_call_done,
246 .rpc_release = filelayout_read_release,
247};
248
249struct rpc_call_ops filelayout_write_call_ops = {
250 .rpc_call_prepare = filelayout_write_prepare,
251 .rpc_call_done = filelayout_write_call_done,
252 .rpc_release = filelayout_write_release,
253};
254
255static enum pnfs_try_status
256filelayout_read_pagelist(struct nfs_read_data *data)
257{
258 struct pnfs_layout_segment *lseg = data->lseg;
259 struct nfs4_pnfs_ds *ds;
260 loff_t offset = data->args.offset;
261 u32 j, idx;
262 struct nfs_fh *fh;
263 int status;
264
265 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
266 __func__, data->inode->i_ino,
267 data->args.pgbase, (size_t)data->args.count, offset);
268
269 /* Retrieve the correct rpc_client for the byte range */
270 j = nfs4_fl_calc_j_index(lseg, offset);
271 idx = nfs4_fl_calc_ds_index(lseg, j);
272 ds = nfs4_fl_prepare_ds(lseg, idx);
273 if (!ds) {
274 /* Either layout fh index faulty, or ds connect failed */
275 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
276 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
277 return PNFS_NOT_ATTEMPTED;
278 }
279 dprintk("%s USE DS:ip %x %hu\n", __func__,
280 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
281
282 /* No multipath support. Use first DS */
283 data->ds_clp = ds->ds_clp;
284 fh = nfs4_fl_select_ds_fh(lseg, j);
285 if (fh)
286 data->args.fh = fh;
287
288 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
289 data->mds_offset = offset;
290
291 /* Perform an asynchronous read to ds */
292 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
293 &filelayout_read_call_ops);
294 BUG_ON(status != 0);
295 return PNFS_ATTEMPTED;
296}
297
298/* Perform async writes. */
299static enum pnfs_try_status
300filelayout_write_pagelist(struct nfs_write_data *data, int sync)
301{
302 struct pnfs_layout_segment *lseg = data->lseg;
303 struct nfs4_pnfs_ds *ds;
304 loff_t offset = data->args.offset;
305 u32 j, idx;
306 struct nfs_fh *fh;
307 int status;
308
309 /* Retrieve the correct rpc_client for the byte range */
310 j = nfs4_fl_calc_j_index(lseg, offset);
311 idx = nfs4_fl_calc_ds_index(lseg, j);
312 ds = nfs4_fl_prepare_ds(lseg, idx);
313 if (!ds) {
314 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
315 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
316 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
317 return PNFS_NOT_ATTEMPTED;
318 }
319 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
320 data->inode->i_ino, sync, (size_t) data->args.count, offset,
321 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
322
323 /* We can't handle commit to ds yet */
324 if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
325 data->args.stable = NFS_FILE_SYNC;
326
327 data->write_done_cb = filelayout_write_done_cb;
328 data->ds_clp = ds->ds_clp;
329 fh = nfs4_fl_select_ds_fh(lseg, j);
330 if (fh)
331 data->args.fh = fh;
332 /*
333 * Get the file offset on the dserver. Set the write offset to
334 * this offset and save the original offset.
335 */
336 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
337 data->mds_offset = offset;
338
339 /* Perform an asynchronous write */
340 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
341 &filelayout_write_call_ops, sync);
342 BUG_ON(status != 0);
343 return PNFS_ATTEMPTED;
344}
345
69/* 346/*
70 * filelayout_check_layout() 347 * filelayout_check_layout()
71 * 348 *
@@ -92,14 +369,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 369 goto out;
93 } 370 }
94 371
95 if (fl->stripe_unit % PAGE_SIZE) { 372 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 373 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 374 __func__, fl->stripe_unit);
98 goto out; 375 goto out;
99 } 376 }
100 377
101 /* find and reference the deviceid */ 378 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 379 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 380 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 381 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 382 if (dsaddr == NULL)
@@ -134,7 +411,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 411 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 412 return status;
136out_put: 413out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 414 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 415 goto out;
139} 416}
140 417
@@ -243,23 +520,47 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
243static void 520static void
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 521filelayout_free_lseg(struct pnfs_layout_segment *lseg)
245{ 522{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 523 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
248 524
249 dprintk("--> %s\n", __func__); 525 dprintk("--> %s\n", __func__);
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 526 nfs4_fl_put_deviceid(fl->dsaddr);
251 &fl->dsaddr->deviceid);
252 _filelayout_free_lseg(fl); 527 _filelayout_free_lseg(fl);
253} 528}
254 529
530/*
531 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
532 *
533 * return 1 : coalesce page
534 * return 0 : don't coalesce page
535 */
536int
537filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
538 struct nfs_page *req)
539{
540 u64 p_stripe, r_stripe;
541 u32 stripe_unit;
542
543 if (!pgio->pg_lseg)
544 return 1;
545 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
546 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
547 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
548
549 do_div(p_stripe, stripe_unit);
550 do_div(r_stripe, stripe_unit);
551
552 return (p_stripe == r_stripe);
553}
554
255static struct pnfs_layoutdriver_type filelayout_type = { 555static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 556 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 557 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 558 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 559 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 560 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 561 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 562 .read_pagelist = filelayout_read_pagelist,
563 .write_pagelist = filelayout_write_pagelist,
263}; 564};
264 565
265static int __init nfs4filelayout_init(void) 566static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..ee0c907742b5 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -83,11 +89,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 89 generic_hdr);
84} 90}
85 91
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 92extern struct nfs_fh *
93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
94
87extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 96extern void print_deviceid(struct nfs4_deviceid *dev_id);
97u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
98u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
99struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
100 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 101extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 102nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
103extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 104struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 105get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 106
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55a..68143c162e3b 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -214,17 +290,26 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
214 290
215 /* ipv6 length plus port is legal */ 291 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 292 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s Invalid address, length %d\n", __func__, 293 dprintk("%s: Invalid address, length %d\n", __func__,
218 rlen); 294 rlen);
219 goto out_err; 295 goto out_err;
220 } 296 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL); 297 buf = kmalloc(rlen + 1, GFP_KERNEL);
298 if (!buf) {
299 dprintk("%s: Not enough memory\n", __func__);
300 goto out_err;
301 }
222 buf[rlen] = '\0'; 302 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen); 303 memcpy(buf, r_addr, rlen);
224 304
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 305 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 306 for (i = 0; i < 2; i++) {
227 char *res = strrchr(buf, '.'); 307 char *res = strrchr(buf, '.');
308 if (!res) {
309 dprintk("%s: Failed finding expected dots in port\n",
310 __func__);
311 goto out_free;
312 }
228 *res = '-'; 313 *res = '-';
229 } 314 }
230 315
@@ -240,7 +325,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
240 port = htons((tmp[0] << 8) | (tmp[1])); 325 port = htons((tmp[0] << 8) | (tmp[1]));
241 326
242 ds = nfs4_pnfs_ds_add(inode, ip_addr, port); 327 ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
243 dprintk("%s Decoded address and port %s\n", __func__, buf); 328 dprintk("%s: Decoded address and port %s\n", __func__, buf);
244out_free: 329out_free:
245 kfree(buf); 330 kfree(buf);
246out_err: 331out_err:
@@ -291,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
291 dsaddr->stripe_count = cnt; 376 dsaddr->stripe_count = cnt;
292 dsaddr->ds_num = num; 377 dsaddr->ds_num = num;
293 378
294 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 379 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
295 380
296 /* Go back an read stripe indices */ 381 /* Go back an read stripe indices */
297 p = indicesp; 382 p = indicesp;
@@ -341,28 +426,37 @@ out_err:
341} 426}
342 427
343/* 428/*
344 * Decode the opaque device specified in 'dev' 429 * Decode the opaque device specified in 'dev' and add it to the cache of
345 * and add it to the list of available devices. 430 * available devices.
346 * If the deviceid is already cached, nfs4_add_deviceid will return
347 * a pointer to the cached struct and throw away the new.
348 */ 431 */
349static struct nfs4_file_layout_dsaddr* 432static struct nfs4_file_layout_dsaddr *
350decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 433decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
351{ 434{
352 struct nfs4_file_layout_dsaddr *dsaddr; 435 struct nfs4_file_layout_dsaddr *d, *new;
353 struct pnfs_deviceid_node *d; 436 long hash;
354 437
355 dsaddr = decode_device(inode, dev); 438 new = decode_device(inode, dev);
356 if (!dsaddr) { 439 if (!new) {
357 printk(KERN_WARNING "%s: Could not decode or add device\n", 440 printk(KERN_WARNING "%s: Could not decode or add device\n",
358 __func__); 441 __func__);
359 return NULL; 442 return NULL;
360 } 443 }
361 444
362 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 445 spin_lock(&filelayout_deviceid_lock);
363 &dsaddr->deviceid); 446 d = nfs4_fl_find_get_deviceid(&new->deviceid);
447 if (d) {
448 spin_unlock(&filelayout_deviceid_lock);
449 nfs4_fl_free_deviceid(new);
450 return d;
451 }
364 452
365 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 453 INIT_HLIST_NODE(&new->node);
454 atomic_set(&new->ref, 1);
455 hash = nfs4_fl_deviceid_hash(&new->deviceid);
456 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
457 spin_unlock(&filelayout_deviceid_lock);
458
459 return new;
366} 460}
367 461
368/* 462/*
@@ -437,12 +531,123 @@ out_free:
437 return dsaddr; 531 return dsaddr;
438} 532}
439 533
534void
535nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
536{
537 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
538 hlist_del_rcu(&dsaddr->node);
539 spin_unlock(&filelayout_deviceid_lock);
540
541 synchronize_rcu();
542 nfs4_fl_free_deviceid(dsaddr);
543 }
544}
545
440struct nfs4_file_layout_dsaddr * 546struct nfs4_file_layout_dsaddr *
441nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 547nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
548{
549 struct nfs4_file_layout_dsaddr *d;
550 struct hlist_node *n;
551 long hash = nfs4_fl_deviceid_hash(id);
552
553
554 rcu_read_lock();
555 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
556 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
557 if (!atomic_inc_not_zero(&d->ref))
558 goto fail;
559 rcu_read_unlock();
560 return d;
561 }
562 }
563fail:
564 rcu_read_unlock();
565 return NULL;
566}
567
568/*
569 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
570 * Then: ((res + fsi) % dsaddr->stripe_count)
571 */
572u32
573nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
574{
575 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
576 u64 tmp;
577
578 tmp = offset - flseg->pattern_offset;
579 do_div(tmp, flseg->stripe_unit);
580 tmp += flseg->first_stripe_index;
581 return do_div(tmp, flseg->dsaddr->stripe_count);
582}
583
584u32
585nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
586{
587 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
588}
589
590struct nfs_fh *
591nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
592{
593 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
594 u32 i;
595
596 if (flseg->stripe_type == STRIPE_SPARSE) {
597 if (flseg->num_fh == 1)
598 i = 0;
599 else if (flseg->num_fh == 0)
600 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
601 return NULL;
602 else
603 i = nfs4_fl_calc_ds_index(lseg, j);
604 } else
605 i = j;
606 return flseg->fh_array[i];
607}
608
609static void
610filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
611 int err, u32 ds_addr)
612{
613 u32 *p = (u32 *)&dsaddr->deviceid;
614
615 printk(KERN_ERR "NFS: data server %x connection error %d."
616 " Deviceid [%x%x%x%x] marked out of use.\n",
617 ds_addr, err, p[0], p[1], p[2], p[3]);
618
619 spin_lock(&filelayout_deviceid_lock);
620 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
621 spin_unlock(&filelayout_deviceid_lock);
622}
623
624struct nfs4_pnfs_ds *
625nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
442{ 626{
443 struct pnfs_deviceid_node *d; 627 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
628 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
629
630 if (ds == NULL) {
631 printk(KERN_ERR "%s: No data server for offset index %d\n",
632 __func__, ds_idx);
633 return NULL;
634 }
444 635
445 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 636 if (!ds->ds_clp) {
446 return (d == NULL) ? NULL : 637 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
447 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 638 int err;
639
640 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
641 /* Already tried to connect, don't try again */
642 dprintk("%s Deviceid marked out of use\n", __func__);
643 return NULL;
644 }
645 err = nfs4_ds_connect(s, ds);
646 if (err) {
647 filelayout_mark_devid_negative(dsaddr, err,
648 ntohl(ds->ds_ip_addr));
649 return NULL;
650 }
651 }
652 return ds;
448} 653}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd2..bb80c49b6533 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
54/* 54/*
55 * Determine the mount path as a string 55 * Determine the mount path as a string
56 */ 56 */
57static char *nfs4_path(const struct vfsmount *mnt_parent, 57static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
58 const struct dentry *dentry,
59 char *buffer, ssize_t buflen)
60{ 58{
61 const char *srvpath; 59 char *limit;
62 60 char *path = nfs_path(&limit, dentry, buffer, buflen);
63 srvpath = strchr(mnt_parent->mnt_devname, ':'); 61 if (!IS_ERR(path)) {
64 if (srvpath) 62 char *colon = strchr(path, ':');
65 srvpath++; 63 if (colon && colon < limit)
66 else 64 path = colon + 1;
67 srvpath = mnt_parent->mnt_devname; 65 }
68 66 return path;
69 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
70} 67}
71 68
72/* 69/*
73 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we 70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
74 * believe to be the server path to this dentry 71 * believe to be the server path to this dentry
75 */ 72 */
76static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, 73static int nfs4_validate_fspath(struct dentry *dentry,
77 const struct dentry *dentry,
78 const struct nfs4_fs_locations *locations, 74 const struct nfs4_fs_locations *locations,
79 char *page, char *page2) 75 char *page, char *page2)
80{ 76{
81 const char *path, *fs_path; 77 const char *path, *fs_path;
82 78
83 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); 79 path = nfs4_path(dentry, page, PAGE_SIZE);
84 if (IS_ERR(path)) 80 if (IS_ERR(path))
85 return PTR_ERR(path); 81 return PTR_ERR(path);
86 82
@@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
165 161
166/** 162/**
167 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 163 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
168 * @mnt_parent - mountpoint of parent directory
169 * @dentry - parent directory 164 * @dentry - parent directory
170 * @locations - array of NFSv4 server location information 165 * @locations - array of NFSv4 server location information
171 * 166 *
172 */ 167 */
173static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 168static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
174 const struct dentry *dentry,
175 const struct nfs4_fs_locations *locations) 169 const struct nfs4_fs_locations *locations)
176{ 170{
177 struct vfsmount *mnt = ERR_PTR(-ENOENT); 171 struct vfsmount *mnt = ERR_PTR(-ENOENT);
178 struct nfs_clone_mount mountdata = { 172 struct nfs_clone_mount mountdata = {
179 .sb = mnt_parent->mnt_sb, 173 .sb = dentry->d_sb,
180 .dentry = dentry, 174 .dentry = dentry,
181 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 175 .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
182 }; 176 };
183 char *page = NULL, *page2 = NULL; 177 char *page = NULL, *page2 = NULL;
184 int loc, error; 178 int loc, error;
@@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
198 goto out; 192 goto out;
199 193
200 /* Ensure fs path is a prefix of current dentry path */ 194 /* Ensure fs path is a prefix of current dentry path */
201 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); 195 error = nfs4_validate_fspath(dentry, locations, page, page2);
202 if (error < 0) { 196 if (error < 0) {
203 mnt = ERR_PTR(error); 197 mnt = ERR_PTR(error);
204 goto out; 198 goto out;
@@ -225,11 +219,10 @@ out:
225 219
226/* 220/*
227 * nfs_do_refmount - handle crossing a referral on server 221 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
229 * @dentry - dentry of referral 222 * @dentry - dentry of referral
230 * 223 *
231 */ 224 */
232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 225struct vfsmount *nfs_do_refmount(struct dentry *dentry)
233{ 226{
234 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 227 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
235 struct dentry *parent; 228 struct dentry *parent;
@@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
262 fs_locations->fs_path.ncomponents <= 0) 255 fs_locations->fs_path.ncomponents <= 0)
263 goto out_free; 256 goto out_free;
264 257
265 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); 258 mnt = nfs_follow_referral(dentry, fs_locations);
266out_free: 259out_free:
267 __free_page(page); 260 __free_page(page);
268 kfree(fs_locations); 261 kfree(fs_locations);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9d992b0346e3..1d84e7088af9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -50,6 +50,7 @@
50#include <linux/module.h> 50#include <linux/module.h>
51#include <linux/sunrpc/bc_xprt.h> 51#include <linux/sunrpc/bc_xprt.h>
52#include <linux/xattr.h> 52#include <linux/xattr.h>
53#include <linux/utsname.h>
53 54
54#include "nfs4_fs.h" 55#include "nfs4_fs.h"
55#include "delegation.h" 56#include "delegation.h"
@@ -84,6 +85,9 @@ static int nfs4_map_errors(int err)
84 switch (err) { 85 switch (err) {
85 case -NFS4ERR_RESOURCE: 86 case -NFS4ERR_RESOURCE:
86 return -EREMOTEIO; 87 return -EREMOTEIO;
88 case -NFS4ERR_BADOWNER:
89 case -NFS4ERR_BADNAME:
90 return -EINVAL;
87 default: 91 default:
88 dprintk("%s could not handle NFSv4 error %d\n", 92 dprintk("%s could not handle NFSv4 error %d\n",
89 __func__, -err); 93 __func__, -err);
@@ -240,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
240/* This is the error handling routine for processes that are allowed 244/* This is the error handling routine for processes that are allowed
241 * to sleep. 245 * to sleep.
242 */ 246 */
243static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 247static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
244{ 248{
245 struct nfs_client *clp = server->nfs_client; 249 struct nfs_client *clp = server->nfs_client;
246 struct nfs4_state *state = exception->state; 250 struct nfs4_state *state = exception->state;
@@ -255,12 +259,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
255 case -NFS4ERR_OPENMODE: 259 case -NFS4ERR_OPENMODE:
256 if (state == NULL) 260 if (state == NULL)
257 break; 261 break;
258 nfs4_state_mark_reclaim_nograce(clp, state); 262 nfs4_schedule_stateid_recovery(server, state);
259 goto do_state_recovery; 263 goto wait_on_recovery;
260 case -NFS4ERR_STALE_STATEID: 264 case -NFS4ERR_STALE_STATEID:
261 case -NFS4ERR_STALE_CLIENTID: 265 case -NFS4ERR_STALE_CLIENTID:
262 case -NFS4ERR_EXPIRED: 266 case -NFS4ERR_EXPIRED:
263 goto do_state_recovery; 267 nfs4_schedule_lease_recovery(clp);
268 goto wait_on_recovery;
264#if defined(CONFIG_NFS_V4_1) 269#if defined(CONFIG_NFS_V4_1)
265 case -NFS4ERR_BADSESSION: 270 case -NFS4ERR_BADSESSION:
266 case -NFS4ERR_BADSLOT: 271 case -NFS4ERR_BADSLOT:
@@ -271,7 +276,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
271 case -NFS4ERR_SEQ_MISORDERED: 276 case -NFS4ERR_SEQ_MISORDERED:
272 dprintk("%s ERROR: %d Reset session\n", __func__, 277 dprintk("%s ERROR: %d Reset session\n", __func__,
273 errorcode); 278 errorcode);
274 nfs4_schedule_state_recovery(clp); 279 nfs4_schedule_session_recovery(clp->cl_session);
275 exception->retry = 1; 280 exception->retry = 1;
276 break; 281 break;
277#endif /* defined(CONFIG_NFS_V4_1) */ 282#endif /* defined(CONFIG_NFS_V4_1) */
@@ -291,11 +296,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
291 break; 296 break;
292 case -NFS4ERR_OLD_STATEID: 297 case -NFS4ERR_OLD_STATEID:
293 exception->retry = 1; 298 exception->retry = 1;
299 break;
300 case -NFS4ERR_BADOWNER:
301 /* The following works around a Linux server bug! */
302 case -NFS4ERR_BADNAME:
303 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
304 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
305 exception->retry = 1;
306 printk(KERN_WARNING "NFS: v4 server %s "
307 "does not accept raw "
308 "uid/gids. "
309 "Reenabling the idmapper.\n",
310 server->nfs_client->cl_hostname);
311 }
294 } 312 }
295 /* We failed to handle the error */ 313 /* We failed to handle the error */
296 return nfs4_map_errors(ret); 314 return nfs4_map_errors(ret);
297do_state_recovery: 315wait_on_recovery:
298 nfs4_schedule_state_recovery(clp);
299 ret = nfs4_wait_clnt_recover(clp); 316 ret = nfs4_wait_clnt_recover(clp);
300 if (ret == 0) 317 if (ret == 0)
301 exception->retry = 1; 318 exception->retry = 1;
@@ -434,8 +451,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
434 clp = res->sr_session->clp; 451 clp = res->sr_session->clp;
435 do_renew_lease(clp, timestamp); 452 do_renew_lease(clp, timestamp);
436 /* Check sequence flags */ 453 /* Check sequence flags */
437 if (atomic_read(&clp->cl_count) > 1) 454 if (res->sr_status_flags != 0)
438 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 455 nfs4_schedule_lease_recovery(clp);
439 break; 456 break;
440 case -NFS4ERR_DELAY: 457 case -NFS4ERR_DELAY:
441 /* The server detected a resend of the RPC call and 458 /* The server detected a resend of the RPC call and
@@ -504,7 +521,7 @@ out:
504 return ret_id; 521 return ret_id;
505} 522}
506 523
507static int nfs41_setup_sequence(struct nfs4_session *session, 524int nfs41_setup_sequence(struct nfs4_session *session,
508 struct nfs4_sequence_args *args, 525 struct nfs4_sequence_args *args,
509 struct nfs4_sequence_res *res, 526 struct nfs4_sequence_res *res,
510 int cache_reply, 527 int cache_reply,
@@ -570,6 +587,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
570 res->sr_status = 1; 587 res->sr_status = 1;
571 return 0; 588 return 0;
572} 589}
590EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
573 591
574int nfs4_setup_sequence(const struct nfs_server *server, 592int nfs4_setup_sequence(const struct nfs_server *server,
575 struct nfs4_sequence_args *args, 593 struct nfs4_sequence_args *args,
@@ -1254,14 +1272,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1254 case -NFS4ERR_BAD_HIGH_SLOT: 1272 case -NFS4ERR_BAD_HIGH_SLOT:
1255 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1273 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1256 case -NFS4ERR_DEADSESSION: 1274 case -NFS4ERR_DEADSESSION:
1257 nfs4_schedule_state_recovery( 1275 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
1258 server->nfs_client);
1259 goto out; 1276 goto out;
1260 case -NFS4ERR_STALE_CLIENTID: 1277 case -NFS4ERR_STALE_CLIENTID:
1261 case -NFS4ERR_STALE_STATEID: 1278 case -NFS4ERR_STALE_STATEID:
1262 case -NFS4ERR_EXPIRED: 1279 case -NFS4ERR_EXPIRED:
1263 /* Don't recall a delegation if it was lost */ 1280 /* Don't recall a delegation if it was lost */
1264 nfs4_schedule_state_recovery(server->nfs_client); 1281 nfs4_schedule_lease_recovery(server->nfs_client);
1265 goto out; 1282 goto out;
1266 case -ERESTARTSYS: 1283 case -ERESTARTSYS:
1267 /* 1284 /*
@@ -1270,7 +1287,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1270 */ 1287 */
1271 case -NFS4ERR_ADMIN_REVOKED: 1288 case -NFS4ERR_ADMIN_REVOKED:
1272 case -NFS4ERR_BAD_STATEID: 1289 case -NFS4ERR_BAD_STATEID:
1273 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1290 nfs4_schedule_stateid_recovery(server, state);
1274 case -EKEYEXPIRED: 1291 case -EKEYEXPIRED:
1275 /* 1292 /*
1276 * User RPCSEC_GSS context has expired. 1293 * User RPCSEC_GSS context has expired.
@@ -1573,9 +1590,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1573 return 0; 1590 return 0;
1574} 1591}
1575 1592
1576static int nfs4_recover_expired_lease(struct nfs_server *server) 1593static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1577{ 1594{
1578 struct nfs_client *clp = server->nfs_client;
1579 unsigned int loop; 1595 unsigned int loop;
1580 int ret; 1596 int ret;
1581 1597
@@ -1586,12 +1602,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1586 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1602 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1587 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1603 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1588 break; 1604 break;
1589 nfs4_schedule_state_recovery(clp); 1605 nfs4_schedule_state_manager(clp);
1590 ret = -EIO; 1606 ret = -EIO;
1591 } 1607 }
1592 return ret; 1608 return ret;
1593} 1609}
1594 1610
1611static int nfs4_recover_expired_lease(struct nfs_server *server)
1612{
1613 return nfs4_client_recover_expired_lease(server->nfs_client);
1614}
1615
1595/* 1616/*
1596 * OPEN_EXPIRED: 1617 * OPEN_EXPIRED:
1597 * reclaim state on the server after a network partition. 1618 * reclaim state on the server after a network partition.
@@ -3069,15 +3090,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3069 return err; 3090 return err;
3070} 3091}
3071 3092
3072static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3093static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3073{ 3094{
3074 struct nfs_server *server = NFS_SERVER(data->inode); 3095 struct nfs_server *server = NFS_SERVER(data->inode);
3075 3096
3076 dprintk("--> %s\n", __func__);
3077
3078 if (!nfs4_sequence_done(task, &data->res.seq_res))
3079 return -EAGAIN;
3080
3081 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3097 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3082 nfs_restart_rpc(task, server->nfs_client); 3098 nfs_restart_rpc(task, server->nfs_client);
3083 return -EAGAIN; 3099 return -EAGAIN;
@@ -3089,19 +3105,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3089 return 0; 3105 return 0;
3090} 3106}
3091 3107
3108static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3109{
3110
3111 dprintk("--> %s\n", __func__);
3112
3113 if (!nfs4_sequence_done(task, &data->res.seq_res))
3114 return -EAGAIN;
3115
3116 return data->read_done_cb(task, data);
3117}
3118
3092static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3119static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3093{ 3120{
3094 data->timestamp = jiffies; 3121 data->timestamp = jiffies;
3122 data->read_done_cb = nfs4_read_done_cb;
3095 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3123 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3096} 3124}
3097 3125
3098static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3126/* Reset the the nfs_read_data to send the read to the MDS. */
3127void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3128{
3129 dprintk("%s Reset task for i/o through\n", __func__);
3130 put_lseg(data->lseg);
3131 data->lseg = NULL;
3132 /* offsets will differ in the dense stripe case */
3133 data->args.offset = data->mds_offset;
3134 data->ds_clp = NULL;
3135 data->args.fh = NFS_FH(data->inode);
3136 data->read_done_cb = nfs4_read_done_cb;
3137 task->tk_ops = data->mds_ops;
3138 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3139}
3140EXPORT_SYMBOL_GPL(nfs4_reset_read);
3141
3142static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3099{ 3143{
3100 struct inode *inode = data->inode; 3144 struct inode *inode = data->inode;
3101 3145
3102 if (!nfs4_sequence_done(task, &data->res.seq_res))
3103 return -EAGAIN;
3104
3105 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3146 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3106 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3147 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3107 return -EAGAIN; 3148 return -EAGAIN;
@@ -3113,11 +3154,41 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3113 return 0; 3154 return 0;
3114} 3155}
3115 3156
3157static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3158{
3159 if (!nfs4_sequence_done(task, &data->res.seq_res))
3160 return -EAGAIN;
3161 return data->write_done_cb(task, data);
3162}
3163
3164/* Reset the the nfs_write_data to send the write to the MDS. */
3165void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3166{
3167 dprintk("%s Reset task for i/o through\n", __func__);
3168 put_lseg(data->lseg);
3169 data->lseg = NULL;
3170 data->ds_clp = NULL;
3171 data->write_done_cb = nfs4_write_done_cb;
3172 data->args.fh = NFS_FH(data->inode);
3173 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3174 data->args.offset = data->mds_offset;
3175 data->res.fattr = &data->fattr;
3176 task->tk_ops = data->mds_ops;
3177 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3178}
3179EXPORT_SYMBOL_GPL(nfs4_reset_write);
3180
3116static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3181static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3117{ 3182{
3118 struct nfs_server *server = NFS_SERVER(data->inode); 3183 struct nfs_server *server = NFS_SERVER(data->inode);
3119 3184
3120 data->args.bitmask = server->cache_consistency_bitmask; 3185 if (data->lseg) {
3186 data->args.bitmask = NULL;
3187 data->res.fattr = NULL;
3188 } else
3189 data->args.bitmask = server->cache_consistency_bitmask;
3190 if (!data->write_done_cb)
3191 data->write_done_cb = nfs4_write_done_cb;
3121 data->res.server = server; 3192 data->res.server = server;
3122 data->timestamp = jiffies; 3193 data->timestamp = jiffies;
3123 3194
@@ -3177,7 +3248,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3177 if (task->tk_status < 0) { 3248 if (task->tk_status < 0) {
3178 /* Unless we're shutting down, schedule state recovery! */ 3249 /* Unless we're shutting down, schedule state recovery! */
3179 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3250 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
3180 nfs4_schedule_state_recovery(clp); 3251 nfs4_schedule_lease_recovery(clp);
3181 return; 3252 return;
3182 } 3253 }
3183 do_renew_lease(clp, timestamp); 3254 do_renew_lease(clp, timestamp);
@@ -3251,6 +3322,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
3251 } 3322 }
3252} 3323}
3253 3324
3325static int buf_to_pages_noslab(const void *buf, size_t buflen,
3326 struct page **pages, unsigned int *pgbase)
3327{
3328 struct page *newpage, **spages;
3329 int rc = 0;
3330 size_t len;
3331 spages = pages;
3332
3333 do {
3334 len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
3335 newpage = alloc_page(GFP_KERNEL);
3336
3337 if (newpage == NULL)
3338 goto unwind;
3339 memcpy(page_address(newpage), buf, len);
3340 buf += len;
3341 buflen -= len;
3342 *pages++ = newpage;
3343 rc++;
3344 } while (buflen != 0);
3345
3346 return rc;
3347
3348unwind:
3349 for(; rc > 0; rc--)
3350 __free_page(spages[rc-1]);
3351 return -ENOMEM;
3352}
3353
3254struct nfs4_cached_acl { 3354struct nfs4_cached_acl {
3255 int cached; 3355 int cached;
3256 size_t len; 3356 size_t len;
@@ -3419,13 +3519,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3419 .rpc_argp = &arg, 3519 .rpc_argp = &arg,
3420 .rpc_resp = &res, 3520 .rpc_resp = &res,
3421 }; 3521 };
3422 int ret; 3522 int ret, i;
3423 3523
3424 if (!nfs4_server_supports_acls(server)) 3524 if (!nfs4_server_supports_acls(server))
3425 return -EOPNOTSUPP; 3525 return -EOPNOTSUPP;
3526 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3527 if (i < 0)
3528 return i;
3426 nfs_inode_return_delegation(inode); 3529 nfs_inode_return_delegation(inode);
3427 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3428 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3530 ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
3531
3532 /*
3533 * Free each page after tx, so the only ref left is
3534 * held by the network stack
3535 */
3536 for (; i > 0; i--)
3537 put_page(pages[i-1]);
3538
3429 /* 3539 /*
3430 * Acl update can result in inode attribute update. 3540 * Acl update can result in inode attribute update.
3431 * so mark the attribute cache invalid. 3541 * so mark the attribute cache invalid.
@@ -3463,12 +3573,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3463 case -NFS4ERR_OPENMODE: 3573 case -NFS4ERR_OPENMODE:
3464 if (state == NULL) 3574 if (state == NULL)
3465 break; 3575 break;
3466 nfs4_state_mark_reclaim_nograce(clp, state); 3576 nfs4_schedule_stateid_recovery(server, state);
3467 goto do_state_recovery; 3577 goto wait_on_recovery;
3468 case -NFS4ERR_STALE_STATEID: 3578 case -NFS4ERR_STALE_STATEID:
3469 case -NFS4ERR_STALE_CLIENTID: 3579 case -NFS4ERR_STALE_CLIENTID:
3470 case -NFS4ERR_EXPIRED: 3580 case -NFS4ERR_EXPIRED:
3471 goto do_state_recovery; 3581 nfs4_schedule_lease_recovery(clp);
3582 goto wait_on_recovery;
3472#if defined(CONFIG_NFS_V4_1) 3583#if defined(CONFIG_NFS_V4_1)
3473 case -NFS4ERR_BADSESSION: 3584 case -NFS4ERR_BADSESSION:
3474 case -NFS4ERR_BADSLOT: 3585 case -NFS4ERR_BADSLOT:
@@ -3479,7 +3590,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3479 case -NFS4ERR_SEQ_MISORDERED: 3590 case -NFS4ERR_SEQ_MISORDERED:
3480 dprintk("%s ERROR %d, Reset session\n", __func__, 3591 dprintk("%s ERROR %d, Reset session\n", __func__,
3481 task->tk_status); 3592 task->tk_status);
3482 nfs4_schedule_state_recovery(clp); 3593 nfs4_schedule_session_recovery(clp->cl_session);
3483 task->tk_status = 0; 3594 task->tk_status = 0;
3484 return -EAGAIN; 3595 return -EAGAIN;
3485#endif /* CONFIG_NFS_V4_1 */ 3596#endif /* CONFIG_NFS_V4_1 */
@@ -3496,9 +3607,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3496 } 3607 }
3497 task->tk_status = nfs4_map_errors(task->tk_status); 3608 task->tk_status = nfs4_map_errors(task->tk_status);
3498 return 0; 3609 return 0;
3499do_state_recovery: 3610wait_on_recovery:
3500 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3611 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3501 nfs4_schedule_state_recovery(clp);
3502 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 3612 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3503 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3613 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3504 task->tk_status = 0; 3614 task->tk_status = 0;
@@ -4109,7 +4219,7 @@ static void nfs4_lock_release(void *calldata)
4109 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 4219 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
4110 data->arg.lock_seqid); 4220 data->arg.lock_seqid);
4111 if (!IS_ERR(task)) 4221 if (!IS_ERR(task))
4112 rpc_put_task(task); 4222 rpc_put_task_async(task);
4113 dprintk("%s: cancelling lock!\n", __func__); 4223 dprintk("%s: cancelling lock!\n", __func__);
4114 } else 4224 } else
4115 nfs_free_seqid(data->arg.lock_seqid); 4225 nfs_free_seqid(data->arg.lock_seqid);
@@ -4133,23 +4243,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4133 4243
4134static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4244static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4135{ 4245{
4136 struct nfs_client *clp = server->nfs_client;
4137 struct nfs4_state *state = lsp->ls_state;
4138
4139 switch (error) { 4246 switch (error) {
4140 case -NFS4ERR_ADMIN_REVOKED: 4247 case -NFS4ERR_ADMIN_REVOKED:
4141 case -NFS4ERR_BAD_STATEID: 4248 case -NFS4ERR_BAD_STATEID:
4142 case -NFS4ERR_EXPIRED: 4249 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4143 if (new_lock_owner != 0 || 4250 if (new_lock_owner != 0 ||
4144 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4251 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4145 nfs4_state_mark_reclaim_nograce(clp, state); 4252 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4146 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4147 break; 4253 break;
4148 case -NFS4ERR_STALE_STATEID: 4254 case -NFS4ERR_STALE_STATEID:
4149 if (new_lock_owner != 0 ||
4150 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4151 nfs4_state_mark_reclaim_reboot(clp, state);
4152 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4255 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4256 case -NFS4ERR_EXPIRED:
4257 nfs4_schedule_lease_recovery(server->nfs_client);
4153 }; 4258 };
4154} 4259}
4155 4260
@@ -4365,12 +4470,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4365 case -NFS4ERR_EXPIRED: 4470 case -NFS4ERR_EXPIRED:
4366 case -NFS4ERR_STALE_CLIENTID: 4471 case -NFS4ERR_STALE_CLIENTID:
4367 case -NFS4ERR_STALE_STATEID: 4472 case -NFS4ERR_STALE_STATEID:
4473 nfs4_schedule_lease_recovery(server->nfs_client);
4474 goto out;
4368 case -NFS4ERR_BADSESSION: 4475 case -NFS4ERR_BADSESSION:
4369 case -NFS4ERR_BADSLOT: 4476 case -NFS4ERR_BADSLOT:
4370 case -NFS4ERR_BAD_HIGH_SLOT: 4477 case -NFS4ERR_BAD_HIGH_SLOT:
4371 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 4478 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4372 case -NFS4ERR_DEADSESSION: 4479 case -NFS4ERR_DEADSESSION:
4373 nfs4_schedule_state_recovery(server->nfs_client); 4480 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
4374 goto out; 4481 goto out;
4375 case -ERESTARTSYS: 4482 case -ERESTARTSYS:
4376 /* 4483 /*
@@ -4380,7 +4487,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4380 case -NFS4ERR_ADMIN_REVOKED: 4487 case -NFS4ERR_ADMIN_REVOKED:
4381 case -NFS4ERR_BAD_STATEID: 4488 case -NFS4ERR_BAD_STATEID:
4382 case -NFS4ERR_OPENMODE: 4489 case -NFS4ERR_OPENMODE:
4383 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4490 nfs4_schedule_stateid_recovery(server, state);
4384 err = 0; 4491 err = 0;
4385 goto out; 4492 goto out;
4386 case -EKEYEXPIRED: 4493 case -EKEYEXPIRED:
@@ -4572,27 +4679,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4572 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 4679 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
4573 args.verifier = &verifier; 4680 args.verifier = &verifier;
4574 4681
4575 while (1) { 4682 args.id_len = scnprintf(args.id, sizeof(args.id),
4576 args.id_len = scnprintf(args.id, sizeof(args.id), 4683 "%s/%s.%s/%u",
4577 "%s/%s %u", 4684 clp->cl_ipaddr,
4578 clp->cl_ipaddr, 4685 init_utsname()->nodename,
4579 rpc_peeraddr2str(clp->cl_rpcclient, 4686 init_utsname()->domainname,
4580 RPC_DISPLAY_ADDR), 4687 clp->cl_rpcclient->cl_auth->au_flavor);
4581 clp->cl_id_uniquifier);
4582
4583 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4584
4585 if (status != -NFS4ERR_CLID_INUSE)
4586 break;
4587
4588 if (signalled())
4589 break;
4590
4591 if (++clp->cl_id_uniquifier == 0)
4592 break;
4593 }
4594 4688
4595 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4689 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4690 if (!status)
4691 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4596 dprintk("<-- %s status= %d\n", __func__, status); 4692 dprintk("<-- %s status= %d\n", __func__, status);
4597 return status; 4693 return status;
4598} 4694}
@@ -4998,10 +5094,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
4998 int status; 5094 int status;
4999 unsigned *ptr; 5095 unsigned *ptr;
5000 struct nfs4_session *session = clp->cl_session; 5096 struct nfs4_session *session = clp->cl_session;
5097 long timeout = 0;
5098 int err;
5001 5099
5002 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 5100 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
5003 5101
5004 status = _nfs4_proc_create_session(clp); 5102 do {
5103 status = _nfs4_proc_create_session(clp);
5104 if (status == -NFS4ERR_DELAY) {
5105 err = nfs4_delay(clp->cl_rpcclient, &timeout);
5106 if (err)
5107 status = err;
5108 }
5109 } while (status == -NFS4ERR_DELAY);
5110
5005 if (status) 5111 if (status)
5006 goto out; 5112 goto out;
5007 5113
@@ -5083,6 +5189,27 @@ int nfs4_init_session(struct nfs_server *server)
5083 return ret; 5189 return ret;
5084} 5190}
5085 5191
5192int nfs4_init_ds_session(struct nfs_client *clp)
5193{
5194 struct nfs4_session *session = clp->cl_session;
5195 int ret;
5196
5197 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5198 return 0;
5199
5200 ret = nfs4_client_recover_expired_lease(clp);
5201 if (!ret)
5202 /* Test for the DS role */
5203 if (!is_ds_client(clp))
5204 ret = -ENODEV;
5205 if (!ret)
5206 ret = nfs4_check_client_ready(clp);
5207 return ret;
5208
5209}
5210EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5211
5212
5086/* 5213/*
5087 * Renew the cl_session lease. 5214 * Renew the cl_session lease.
5088 */ 5215 */
@@ -5110,7 +5237,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5110 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5237 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5111 return -EAGAIN; 5238 return -EAGAIN;
5112 default: 5239 default:
5113 nfs4_schedule_state_recovery(clp); 5240 nfs4_schedule_lease_recovery(clp);
5114 } 5241 }
5115 return 0; 5242 return 0;
5116} 5243}
@@ -5197,7 +5324,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
5197 if (IS_ERR(task)) 5324 if (IS_ERR(task))
5198 ret = PTR_ERR(task); 5325 ret = PTR_ERR(task);
5199 else 5326 else
5200 rpc_put_task(task); 5327 rpc_put_task_async(task);
5201 dprintk("<-- %s status=%d\n", __func__, ret); 5328 dprintk("<-- %s status=%d\n", __func__, ret);
5202 return ret; 5329 return ret;
5203} 5330}
@@ -5213,8 +5340,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5213 goto out; 5340 goto out;
5214 } 5341 }
5215 ret = rpc_wait_for_completion_task(task); 5342 ret = rpc_wait_for_completion_task(task);
5216 if (!ret) 5343 if (!ret) {
5344 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
5345
5346 if (task->tk_status == 0)
5347 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
5217 ret = task->tk_status; 5348 ret = task->tk_status;
5349 }
5218 rpc_put_task(task); 5350 rpc_put_task(task);
5219out: 5351out:
5220 dprintk("<-- %s status=%d\n", __func__, ret); 5352 dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5251,7 +5383,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5251 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5383 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5252 return -EAGAIN; 5384 return -EAGAIN;
5253 default: 5385 default:
5254 nfs4_schedule_state_recovery(clp); 5386 nfs4_schedule_lease_recovery(clp);
5255 } 5387 }
5256 return 0; 5388 return 0;
5257} 5389}
@@ -5319,6 +5451,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5319 status = PTR_ERR(task); 5451 status = PTR_ERR(task);
5320 goto out; 5452 goto out;
5321 } 5453 }
5454 status = nfs4_wait_for_completion_rpc_task(task);
5455 if (status == 0)
5456 status = task->tk_status;
5322 rpc_put_task(task); 5457 rpc_put_task(task);
5323 return 0; 5458 return 0;
5324out: 5459out:
@@ -5605,6 +5740,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5605 .clear_acl_cache = nfs4_zap_acl_attr, 5740 .clear_acl_cache = nfs4_zap_acl_attr,
5606 .close_context = nfs4_close_context, 5741 .close_context = nfs4_close_context,
5607 .open_context = nfs4_atomic_open, 5742 .open_context = nfs4_atomic_open,
5743 .init_client = nfs4_init_client,
5608}; 5744};
5609 5745
5610static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5746static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2336d532cf66..ab1bf5bb021f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -232,12 +237,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
232 status = nfs4_proc_create_session(clp); 237 status = nfs4_proc_create_session(clp);
233 if (status != 0) 238 if (status != 0)
234 goto out; 239 goto out;
235 status = nfs4_set_callback_sessionid(clp);
236 if (status != 0) {
237 printk(KERN_WARNING "Sessionid not set. No callback service\n");
238 nfs_callback_down(1);
239 status = 0;
240 }
241 nfs41_setup_state_renewal(clp); 240 nfs41_setup_state_renewal(clp);
242 nfs_mark_client_ready(clp, NFS_CS_READY); 241 nfs_mark_client_ready(clp, NFS_CS_READY);
243out: 242out:
@@ -1013,9 +1012,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1013} 1012}
1014 1013
1015/* 1014/*
1016 * Schedule a state recovery attempt 1015 * Schedule a lease recovery attempt
1017 */ 1016 */
1018void nfs4_schedule_state_recovery(struct nfs_client *clp) 1017void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1019{ 1018{
1020 if (!clp) 1019 if (!clp)
1021 return; 1020 return;
@@ -1024,7 +1023,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
1024 nfs4_schedule_state_manager(clp); 1023 nfs4_schedule_state_manager(clp);
1025} 1024}
1026 1025
1027int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1026static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1028{ 1027{
1029 1028
1030 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1029 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1038,7 +1037,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
1038 return 1; 1037 return 1;
1039} 1038}
1040 1039
1041int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1040static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1042{ 1041{
1043 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1042 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1044 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1043 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1047,6 +1046,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
1047 return 1; 1046 return 1;
1048} 1047}
1049 1048
1049void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
1050{
1051 struct nfs_client *clp = server->nfs_client;
1052
1053 nfs4_state_mark_reclaim_nograce(clp, state);
1054 nfs4_schedule_state_manager(clp);
1055}
1056
1050static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1057static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1051{ 1058{
1052 struct inode *inode = state->inode; 1059 struct inode *inode = state->inode;
@@ -1442,10 +1449,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1442} 1449}
1443 1450
1444#ifdef CONFIG_NFS_V4_1 1451#ifdef CONFIG_NFS_V4_1
1452void nfs4_schedule_session_recovery(struct nfs4_session *session)
1453{
1454 nfs4_schedule_lease_recovery(session->clp);
1455}
1456EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1457
1445void nfs41_handle_recall_slot(struct nfs_client *clp) 1458void nfs41_handle_recall_slot(struct nfs_client *clp)
1446{ 1459{
1447 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1460 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1448 nfs4_schedule_state_recovery(clp); 1461 nfs4_schedule_state_manager(clp);
1449} 1462}
1450 1463
1451static void nfs4_reset_all_state(struct nfs_client *clp) 1464static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1453,7 +1466,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
1453 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1466 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1454 clp->cl_boot_time = CURRENT_TIME; 1467 clp->cl_boot_time = CURRENT_TIME;
1455 nfs4_state_start_reclaim_nograce(clp); 1468 nfs4_state_start_reclaim_nograce(clp);
1456 nfs4_schedule_state_recovery(clp); 1469 nfs4_schedule_state_manager(clp);
1457 } 1470 }
1458} 1471}
1459 1472
@@ -1461,7 +1474,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
1461{ 1474{
1462 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1475 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1463 nfs4_state_start_reclaim_reboot(clp); 1476 nfs4_state_start_reclaim_reboot(clp);
1464 nfs4_schedule_state_recovery(clp); 1477 nfs4_schedule_state_manager(clp);
1465 } 1478 }
1466} 1479}
1467 1480
@@ -1481,7 +1494,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1481{ 1494{
1482 nfs_expire_all_delegations(clp); 1495 nfs_expire_all_delegations(clp);
1483 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) 1496 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1484 nfs4_schedule_state_recovery(clp); 1497 nfs4_schedule_state_manager(clp);
1485} 1498}
1486 1499
1487void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1500void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2ab8e5cb8f59..0cf560f77884 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 844 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 845 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 846 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 847 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 848 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 849 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 850 iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 857 }
858 if (iap->ia_valid & ATTR_GID) { 858 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 859 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 860 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 861 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 862 iap->ia_gid);
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1384 hdr->replen += decode_putrootfh_maxsz;
1385} 1385}
1386 1386
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1388{
1389 nfs4_stateid stateid; 1389 nfs4_stateid stateid;
1390 __be32 *p; 1390 __be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1392 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1393 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1395 if (zero_seqid)
1396 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1397 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1398 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1399 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1406 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1407 *p = cpu_to_be32(OP_READ);
1406 1408
1407 encode_stateid(xdr, args->context, args->lock_context); 1409 encode_stateid(xdr, args->context, args->lock_context,
1410 hdr->minorversion);
1408 1411
1409 p = reserve_space(xdr, 12); 1412 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1413 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1595 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1596 *p = cpu_to_be32(OP_WRITE);
1594 1597
1595 encode_stateid(xdr, args->context, args->lock_context); 1598 encode_stateid(xdr, args->context, args->lock_context,
1599 hdr->minorversion);
1596 1600
1597 p = reserve_space(xdr, 16); 1601 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1602 p = xdr_encode_hyper(p, args->offset);
@@ -1660,7 +1664,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1660 1664
1661 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1665 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1662 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1666 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1663 p = xdr_encode_hyper(p, clp->cl_ex_clid); 1667 p = xdr_encode_hyper(p, clp->cl_clientid);
1664 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1668 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1665 *p++ = cpu_to_be32(args->flags); /*flags */ 1669 *p++ = cpu_to_be32(args->flags); /*flags */
1666 1670
@@ -2271,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2275 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2276 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2277 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2278 if (args->bitmask)
2279 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2280 encode_nops(&hdr);
2276} 2281}
2277 2282
@@ -3382,7 +3387,7 @@ out_overflow:
3382} 3387}
3383 3388
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3389static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3390 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3391{
3387 uint32_t len; 3392 uint32_t len;
3388 __be32 *p; 3393 __be32 *p;
@@ -3402,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3407 if (!may_sleep) {
3403 /* do nothing */ 3408 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3409 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3410 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3411 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3412 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3413 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3425,7 @@ out_overflow:
3420} 3425}
3421 3426
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3427static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3428 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3429{
3425 uint32_t len; 3430 uint32_t len;
3426 __be32 *p; 3431 __be32 *p;
@@ -3440,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3445 if (!may_sleep) {
3441 /* do nothing */ 3446 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3447 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3448 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3449 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3450 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3451 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3939,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 3944 goto xdr_error;
3940 fattr->valid |= status; 3945 fattr->valid |= status;
3941 3946
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 3947 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 3948 if (status < 0)
3945 goto xdr_error; 3949 goto xdr_error;
3946 fattr->valid |= status; 3950 fattr->valid |= status;
3947 3951
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 3952 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 3953 if (status < 0)
3951 goto xdr_error; 3954 goto xdr_error;
3952 fattr->valid |= status; 3955 fattr->valid |= status;
@@ -4694,7 +4697,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4694 p = xdr_inline_decode(xdr, 8); 4697 p = xdr_inline_decode(xdr, 8);
4695 if (unlikely(!p)) 4698 if (unlikely(!p))
4696 goto out_overflow; 4699 goto out_overflow;
4697 xdr_decode_hyper(p, &clp->cl_ex_clid); 4700 xdr_decode_hyper(p, &clp->cl_clientid);
4698 p = xdr_inline_decode(xdr, 12); 4701 p = xdr_inline_decode(xdr, 12);
4699 if (unlikely(!p)) 4702 if (unlikely(!p))
4700 goto out_overflow; 4703 goto out_overflow;
@@ -5690,8 +5693,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5693 status = decode_write(xdr, res);
5691 if (status) 5694 if (status)
5692 goto out; 5695 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5696 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5697 decode_getfattr(xdr, res->fattr, res->server,
5698 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5699 if (!status)
5696 status = res->count; 5700 status = res->count;
5697out: 5701out:
@@ -6086,11 +6090,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6086 __be32 *p = xdr_inline_decode(xdr, 4); 6090 __be32 *p = xdr_inline_decode(xdr, 4);
6087 if (unlikely(!p)) 6091 if (unlikely(!p))
6088 goto out_overflow; 6092 goto out_overflow;
6089 if (!ntohl(*p++)) { 6093 if (*p == xdr_zero) {
6090 p = xdr_inline_decode(xdr, 4); 6094 p = xdr_inline_decode(xdr, 4);
6091 if (unlikely(!p)) 6095 if (unlikely(!p))
6092 goto out_overflow; 6096 goto out_overflow;
6093 if (!ntohl(*p++)) 6097 if (*p == xdr_zero)
6094 return -EAGAIN; 6098 return -EAGAIN;
6095 entry->eof = 1; 6099 entry->eof = 1;
6096 return -EBADCOOKIE; 6100 return -EBADCOOKIE;
@@ -6101,7 +6105,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6101 goto out_overflow; 6105 goto out_overflow;
6102 entry->prev_cookie = entry->cookie; 6106 entry->prev_cookie = entry->cookie;
6103 p = xdr_decode_hyper(p, &entry->cookie); 6107 p = xdr_decode_hyper(p, &entry->cookie);
6104 entry->len = ntohl(*p++); 6108 entry->len = be32_to_cpup(p);
6105 6109
6106 p = xdr_inline_decode(xdr, entry->len); 6110 p = xdr_inline_decode(xdr, entry->len);
6107 if (unlikely(!p)) 6111 if (unlikely(!p))
@@ -6132,9 +6136,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6132 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) 6136 if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
6133 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); 6137 entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
6134 6138
6135 if (verify_attr_len(xdr, p, len) < 0)
6136 goto out_overflow;
6137
6138 return 0; 6139 return 0;
6139 6140
6140out_overflow: 6141out_overflow:
@@ -6170,8 +6171,6 @@ static struct {
6170 { NFS4ERR_DQUOT, -EDQUOT }, 6171 { NFS4ERR_DQUOT, -EDQUOT },
6171 { NFS4ERR_STALE, -ESTALE }, 6172 { NFS4ERR_STALE, -ESTALE },
6172 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6173 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6173 { NFS4ERR_BADOWNER, -EINVAL },
6174 { NFS4ERR_BADNAME, -EINVAL },
6175 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6174 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6176 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6175 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6177 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6176 { NFS4ERR_TOOSMALL, -ETOOSMALL },
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
86/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp"
91
89/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
90static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
91 94
92/* Text-based mount options passed to super.c */ 95/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = ""; 96static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
94 97
95/* Address of NFS server */ 98/* Address of NFS server */
96static __be32 servaddr __initdata = htonl(INADDR_NONE); 99static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
160} 163}
161 164
162static int __init root_nfs_cat(char *dest, const char *src, 165static int __init root_nfs_cat(char *dest, const char *src,
163 const size_t destlen) 166 const size_t destlen)
164{ 167{
168 size_t len = strlen(dest);
169
170 if (len && dest[len - 1] != ',')
171 if (strlcat(dest, ",", destlen) > destlen)
172 return -1;
173
165 if (strlcat(dest, src, destlen) > destlen) 174 if (strlcat(dest, src, destlen) > destlen)
166 return -1; 175 return -1;
167 return 0; 176 return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
194 if (root_nfs_cat(nfs_root_options, incoming, 203 if (root_nfs_cat(nfs_root_options, incoming,
195 sizeof(nfs_root_options))) 204 sizeof(nfs_root_options)))
196 return -1; 205 return -1;
197
198 /*
199 * Possibly prepare for more options to be appended
200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
206
207 return 0; 206 return 0;
208} 207}
209 208
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
217 */ 216 */
218static int __init root_nfs_data(char *cmdline) 217static int __init root_nfs_data(char *cmdline)
219{ 218{
220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; 219 char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
221 int len, retval = -1; 220 int len, retval = -1;
222 char *tmp = NULL; 221 char *tmp = NULL;
223 const size_t tmplen = sizeof(nfs_export_path); 222 const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
244 * Append mandatory options for nfsroot so they override 243 * Append mandatory options for nfsroot so they override
245 * what has come before 244 * what has come before
246 */ 245 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", 246 snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
248 &servaddr); 247 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option, 248 if (root_nfs_cat(nfs_root_options, mand_options,
250 sizeof(nfs_root_options))) 249 sizeof(nfs_root_options)))
251 goto out_optionstoolong; 250 goto out_optionstoolong;
252 251
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..23e794410669 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -226,6 +227,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
226 desc->pg_doio = doio; 227 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 228 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 229 desc->pg_error = 0;
230 desc->pg_lseg = NULL;
229} 231}
230 232
231/** 233/**
@@ -240,7 +242,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 242 * Return 'true' if this is the case, else return 'false'.
241 */ 243 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 244static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 245 struct nfs_page *req,
246 struct nfs_pageio_descriptor *pgio)
244{ 247{
245 if (req->wb_context->cred != prev->wb_context->cred) 248 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 249 return 0;
@@ -254,6 +257,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 257 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 258 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 259 return 0;
260 /*
261 * Non-whole file layouts need to check that req is inside of
262 * pgio->pg_lseg.
263 */
264 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
265 return 0;
257 return 1; 266 return 1;
258} 267}
259 268
@@ -286,7 +295,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 295 if (newlen > desc->pg_bsize)
287 return 0; 296 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 297 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 298 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 299 return 0;
291 } else 300 } else
292 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
@@ -302,12 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 312{
304 if (!list_empty(&desc->pg_list)) { 313 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 314 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 315 if (error < 0)
312 desc->pg_error = error; 316 desc->pg_error = error;
313 else 317 else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bc4089769735..f38813a0a295 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,37 +223,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0. 228{
236 */ 229 struct inode *inode = lseg->pls_layout->plh_inode;
237static int 230
238put_lseg_locked(struct pnfs_layout_segment *lseg, 231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
239 struct list_head *tmp_list) 232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
240{ 243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
265 262
266static bool 263static bool
@@ -281,7 +278,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 278 * list. It will now be removed when all
282 * outstanding io is finished. 279 * outstanding io is finished.
283 */ 280 */
284 rv = put_lseg_locked(lseg, tmp_list); 281 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
282 atomic_read(&lseg->pls_refcount));
283 if (atomic_dec_and_test(&lseg->pls_refcount)) {
284 put_lseg_common(lseg);
285 list_add(&lseg->pls_list, tmp_list);
286 rv = 1;
287 }
285 } 288 }
286 return rv; 289 return rv;
287} 290}
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 302
300 dprintk("%s:Begin lo %p\n", __func__, lo); 303 dprintk("%s:Begin lo %p\n", __func__, lo);
301 304
305 if (list_empty(&lo->plh_segs)) {
306 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
307 put_layout_hdr_locked(lo);
308 return 0;
309 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 310 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 311 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 312 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +320,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 320 return invalid - removed;
313} 321}
314 322
323/* note free_me must contain lsegs from a single layout_hdr */
315void 324void
316pnfs_free_lseg_list(struct list_head *free_me) 325pnfs_free_lseg_list(struct list_head *free_me)
317{ 326{
318 struct pnfs_layout_segment *lseg, *tmp; 327 struct pnfs_layout_segment *lseg, *tmp;
328 struct pnfs_layout_hdr *lo;
329
330 if (list_empty(free_me))
331 return;
319 332
333 lo = list_first_entry(free_me, struct pnfs_layout_segment,
334 pls_list)->pls_layout;
335
336 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
337 struct nfs_client *clp;
338
339 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
340 spin_lock(&clp->cl_lock);
341 list_del_init(&lo->plh_layouts);
342 spin_unlock(&clp->cl_lock);
343 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 344 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 345 list_del(&lseg->pls_list);
322 free_lseg(lseg); 346 free_lseg(lseg);
@@ -332,10 +356,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 356 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 357 lo = nfsi->layout;
334 if (lo) { 358 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 359 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 360 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 361 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 362 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 363 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +425,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 425 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 426 return true;
405 return lo->plh_block_lgets || 427 return lo->plh_block_lgets ||
428 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 429 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 430 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 431 (atomic_read(&lo->plh_outstanding) > lget));
@@ -674,7 +697,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 697 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 698 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 699 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 700 ret = get_lseg(lseg);
678 break; 701 break;
679 } 702 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 703 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +722,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 722 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 723 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 724 struct pnfs_layout_segment *lseg = NULL;
725 bool first = false;
702 726
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 727 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 728 return NULL;
@@ -715,21 +739,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 739 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 740 goto out_unlock;
717 } 741 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 742
723 /* if LAYOUTGET already failed once we don't try again */ 743 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 744 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 745 goto out_unlock;
726 746
747 /* Check to see if the layout for the given range already exists */
748 lseg = pnfs_find_lseg(lo, iomode);
749 if (lseg)
750 goto out_unlock;
751
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 752 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 753 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 754 atomic_inc(&lo->plh_outstanding);
730 755
731 get_layout_hdr(lo); 756 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 757 if (list_empty(&lo->plh_segs))
758 first = true;
759 spin_unlock(&ino->i_lock);
760 if (first) {
733 /* The lo must be on the clp list if there is any 761 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 762 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 763 */
@@ -738,24 +766,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 766 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 767 spin_unlock(&clp->cl_lock);
740 } 768 }
741 spin_unlock(&ino->i_lock);
742 769
743 lseg = send_layoutget(lo, ctx, iomode); 770 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 771 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 772 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 773 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 774 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 775 }
754 atomic_dec(&lo->plh_outstanding); 776 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 777 put_layout_hdr(lo);
756out: 778out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 779 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 780 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 781 return lseg;
760out_unlock: 782out_unlock:
761 spin_unlock(&ino->i_lock); 783 spin_unlock(&ino->i_lock);
@@ -808,7 +830,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 830 }
809 init_lseg(lo, lseg); 831 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 832 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 833 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 834 pnfs_insert_layout(lo, lseg);
813 835
814 if (res->return_on_close) { 836 if (res->return_on_close) {
@@ -829,137 +851,97 @@ out_forget_reply:
829 goto out; 851 goto out;
830} 852}
831 853
832/* 854static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 855 struct nfs_page *prev,
834 * Add layout type to the lookup key to expand to support multiple types. 856 struct nfs_page *req)
835 */
836int
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
838 void (*free_callback)(struct pnfs_deviceid_node *))
839{ 857{
840 struct pnfs_deviceid_cache *c; 858 if (pgio->pg_count == prev->wb_bytes) {
841 859 /* This is first coelesce call for a series of nfs_pages */
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 860 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
843 if (!c) 861 prev->wb_context,
844 return -ENOMEM; 862 IOMODE_READ);
845 spin_lock(&clp->cl_lock);
846 if (clp->cl_devid_cache != NULL) {
847 atomic_inc(&clp->cl_devid_cache->dc_ref);
848 dprintk("%s [kref [%d]]\n", __func__,
849 atomic_read(&clp->cl_devid_cache->dc_ref));
850 kfree(c);
851 } else {
852 /* kzalloc initializes hlists */
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 863 }
859 spin_unlock(&clp->cl_lock); 864 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
860 return 0;
861} 865}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 866
864/*
865 * Called from pnfs_layoutdriver_type->free_lseg
866 * last layout segment reference frees deviceid
867 */
868void 867void
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c, 868pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
870 struct pnfs_deviceid_node *devid)
871{ 869{
872 struct nfs4_deviceid *id = &devid->de_id; 870 struct pnfs_layoutdriver_type *ld;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 871
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 872 ld = NFS_SERVER(inode)->pnfs_curr_ld;
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 873 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
879 return; 874}
880 875
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 876static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 877 struct nfs_page *prev,
883 hlist_del_rcu(&d->de_node); 878 struct nfs_page *req)
884 spin_unlock(&c->dc_lock); 879{
885 synchronize_rcu(); 880 if (pgio->pg_count == prev->wb_bytes) {
886 c->dc_free_callback(devid); 881 /* This is first coelesce call for a series of nfs_pages */
887 return; 882 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
888 } 883 prev->wb_context,
889 spin_unlock(&c->dc_lock); 884 IOMODE_RW);
890 /* Why wasn't it found in the list? */
891 BUG();
892}
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
894
895/* Find and reference a deviceid */
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 885 }
915fail: 886 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
916 rcu_read_unlock(); 887}
917 return NULL; 888
889void
890pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
891{
892 struct pnfs_layoutdriver_type *ld;
893
894 ld = NFS_SERVER(inode)->pnfs_curr_ld;
895 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
896}
897
898enum pnfs_try_status
899pnfs_try_to_write_data(struct nfs_write_data *wdata,
900 const struct rpc_call_ops *call_ops, int how)
901{
902 struct inode *inode = wdata->inode;
903 enum pnfs_try_status trypnfs;
904 struct nfs_server *nfss = NFS_SERVER(inode);
905
906 wdata->mds_ops = call_ops;
907
908 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
909 inode->i_ino, wdata->args.count, wdata->args.offset, how);
910
911 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
912 if (trypnfs == PNFS_NOT_ATTEMPTED) {
913 put_lseg(wdata->lseg);
914 wdata->lseg = NULL;
915 } else
916 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
917
918 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
919 return trypnfs;
918} 920}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
920 921
921/* 922/*
922 * Add a deviceid to the cache. 923 * Call the appropriate parallel I/O subsystem read function.
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
924 */ 924 */
925struct pnfs_deviceid_node * 925enum pnfs_try_status
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 926pnfs_try_to_read_data(struct nfs_read_data *rdata,
927{ 927 const struct rpc_call_ops *call_ops)
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 928{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 929 struct inode *inode = rdata->inode;
930 struct nfs_server *nfss = NFS_SERVER(inode);
931 enum pnfs_try_status trypnfs;
953 932
954 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); 933 rdata->mds_ops = call_ops;
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 934
956 int i; 935 dprintk("%s: Reading ino:%lu %u@%llu\n",
957 /* Verify cache is empty */ 936 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 937
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 938 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
960 clp->cl_devid_cache = NULL; 939 if (trypnfs == PNFS_NOT_ATTEMPTED) {
961 spin_unlock(&clp->cl_lock); 940 put_lseg(rdata->lseg);
962 kfree(local); 941 rdata->lseg = NULL;
942 } else {
943 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
963 } 944 }
945 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
946 return trypnfs;
964} 947}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..6380b9405bcd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -43,6 +45,11 @@ struct pnfs_layout_segment {
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
44}; 46};
45 47
48enum pnfs_try_status {
49 PNFS_ATTEMPTED = 0,
50 PNFS_NOT_ATTEMPTED = 1,
51};
52
46#ifdef CONFIG_NFS_V4_1 53#ifdef CONFIG_NFS_V4_1
47 54
48#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 55#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -61,10 +68,18 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 68 const u32 id;
62 const char *name; 69 const char *name;
63 struct module *owner; 70 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 71 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 72 void (*free_lseg) (struct pnfs_layout_segment *lseg);
73
74 /* test for nfs page cache coalescing */
75 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
76
77 /*
78 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
79 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
80 */
81 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
82 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 83};
69 84
70struct pnfs_layout_hdr { 85struct pnfs_layout_hdr {
@@ -90,52 +105,6 @@ struct pnfs_device {
90 unsigned int pglen; 105 unsigned int pglen;
91}; 106};
92 107
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 108extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 109extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 110
@@ -146,11 +115,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 115
147/* pnfs.c */ 116/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 117void get_layout_hdr(struct pnfs_layout_hdr *lo);
118void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 119struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 120pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 121 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 122void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 123void unset_pnfs_layoutdriver(struct nfs_server *);
124enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
125 const struct rpc_call_ops *, int);
126enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
127 const struct rpc_call_ops *);
128void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
129void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 130int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 131void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 132void pnfs_destroy_layout(struct nfs_inode *);
@@ -177,6 +153,16 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 153 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 154}
179 155
156static inline struct pnfs_layout_segment *
157get_lseg(struct pnfs_layout_segment *lseg)
158{
159 if (lseg) {
160 atomic_inc(&lseg->pls_refcount);
161 smp_mb__after_atomic_inc();
162 }
163 return lseg;
164}
165
180/* Return true if a layout driver is being used for this mountpoint */ 166/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 167static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 168{
@@ -194,12 +180,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 180}
195 181
196static inline struct pnfs_layout_segment * 182static inline struct pnfs_layout_segment *
183get_lseg(struct pnfs_layout_segment *lseg)
184{
185 return NULL;
186}
187
188static inline void put_lseg(struct pnfs_layout_segment *lseg)
189{
190}
191
192static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 193pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 194 enum pnfs_iomode access_type)
199{ 195{
200 return NULL; 196 return NULL;
201} 197}
202 198
199static inline enum pnfs_try_status
200pnfs_try_to_read_data(struct nfs_read_data *data,
201 const struct rpc_call_ops *call_ops)
202{
203 return PNFS_NOT_ATTEMPTED;
204}
205
206static inline enum pnfs_try_status
207pnfs_try_to_write_data(struct nfs_write_data *data,
208 const struct rpc_call_ops *call_ops, int how)
209{
210 return PNFS_NOT_ATTEMPTED;
211}
212
203static inline bool 213static inline bool
204pnfs_roc(struct inode *ino) 214pnfs_roc(struct inode *ino)
205{ 215{
@@ -230,6 +240,18 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 240{
231} 241}
232 242
243static inline void
244pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
245{
246 pgio->pg_test = NULL;
247}
248
249static inline void
250pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
251{
252 pgio->pg_test = NULL;
253}
254
233#endif /* CONFIG_NFS_V4_1 */ 255#endif /* CONFIG_NFS_V4_1 */
234 256
235#endif /* FS_NFS_PNFS_H */ 257#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..b8ec170f2a0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770f..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = {
263static void nfs_umount_begin(struct super_block *); 263static void nfs_umount_begin(struct super_block *);
264static int nfs_statfs(struct dentry *, struct kstatfs *); 264static int nfs_statfs(struct dentry *, struct kstatfs *);
265static int nfs_show_options(struct seq_file *, struct vfsmount *); 265static int nfs_show_options(struct seq_file *, struct vfsmount *);
266static int nfs_show_devname(struct seq_file *, struct vfsmount *);
267static int nfs_show_path(struct seq_file *, struct vfsmount *);
266static int nfs_show_stats(struct seq_file *, struct vfsmount *); 268static int nfs_show_stats(struct seq_file *, struct vfsmount *);
267static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 269static struct dentry *nfs_fs_mount(struct file_system_type *,
270 int, const char *, void *);
268static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, 271static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
269 int flags, const char *dev_name, void *raw_data); 272 int flags, const char *dev_name, void *raw_data);
270static void nfs_put_super(struct super_block *); 273static void nfs_put_super(struct super_block *);
@@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
274static struct file_system_type nfs_fs_type = { 277static struct file_system_type nfs_fs_type = {
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .name = "nfs", 279 .name = "nfs",
277 .get_sb = nfs_get_sb, 280 .mount = nfs_fs_mount,
278 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
279 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
280}; 283};
@@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = {
296 .evict_inode = nfs_evict_inode, 299 .evict_inode = nfs_evict_inode,
297 .umount_begin = nfs_umount_begin, 300 .umount_begin = nfs_umount_begin,
298 .show_options = nfs_show_options, 301 .show_options = nfs_show_options,
302 .show_devname = nfs_show_devname,
303 .show_path = nfs_show_path,
299 .show_stats = nfs_show_stats, 304 .show_stats = nfs_show_stats,
300 .remount_fs = nfs_remount, 305 .remount_fs = nfs_remount,
301}; 306};
@@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = {
303#ifdef CONFIG_NFS_V4 308#ifdef CONFIG_NFS_V4
304static int nfs4_validate_text_mount_data(void *options, 309static int nfs4_validate_text_mount_data(void *options,
305 struct nfs_parsed_mount_data *args, const char *dev_name); 310 struct nfs_parsed_mount_data *args, const char *dev_name);
306static int nfs4_try_mount(int flags, const char *dev_name, 311static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
307 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 312 struct nfs_parsed_mount_data *data);
308static int nfs4_get_sb(struct file_system_type *fs_type, 313static struct dentry *nfs4_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 314 int flags, const char *dev_name, void *raw_data);
310static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, 315static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
311 int flags, const char *dev_name, void *raw_data); 316 int flags, const char *dev_name, void *raw_data);
312static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, 317static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
313 int flags, const char *dev_name, void *raw_data); 318 int flags, const char *dev_name, void *raw_data);
314static int nfs4_referral_get_sb(struct file_system_type *fs_type, 319static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
315 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 320 int flags, const char *dev_name, void *raw_data);
316static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, 321static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
317 int flags, const char *dev_name, void *raw_data); 322 int flags, const char *dev_name, void *raw_data);
318static void nfs4_kill_super(struct super_block *sb); 323static void nfs4_kill_super(struct super_block *sb);
@@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
320static struct file_system_type nfs4_fs_type = { 325static struct file_system_type nfs4_fs_type = {
321 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
322 .name = "nfs4", 327 .name = "nfs4",
323 .get_sb = nfs4_get_sb, 328 .mount = nfs4_mount,
324 .kill_sb = nfs4_kill_super, 329 .kill_sb = nfs4_kill_super,
325 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 330 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
326}; 331};
@@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
352struct file_system_type nfs4_referral_fs_type = { 357struct file_system_type nfs4_referral_fs_type = {
353 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
354 .name = "nfs4", 359 .name = "nfs4",
355 .get_sb = nfs4_referral_get_sb, 360 .mount = nfs4_referral_mount,
356 .kill_sb = nfs4_kill_super, 361 .kill_sb = nfs4_kill_super,
357 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 362 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
358}; 363};
@@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = {
366 .evict_inode = nfs4_evict_inode, 371 .evict_inode = nfs4_evict_inode,
367 .umount_begin = nfs_umount_begin, 372 .umount_begin = nfs_umount_begin,
368 .show_options = nfs_show_options, 373 .show_options = nfs_show_options,
374 .show_devname = nfs_show_devname,
375 .show_path = nfs_show_path,
369 .show_stats = nfs_show_stats, 376 .show_stats = nfs_show_stats,
370 .remount_fs = nfs_remount, 377 .remount_fs = nfs_remount,
371}; 378};
@@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
726 return 0; 733 return 0;
727} 734}
728 735
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{
738 char *page = (char *) __get_free_page(GFP_KERNEL);
739 char *devname, *dummy;
740 int err = 0;
741 if (!page)
742 return -ENOMEM;
743 devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
744 if (IS_ERR(devname))
745 err = PTR_ERR(devname);
746 else
747 seq_escape(m, devname, " \t\n\\");
748 free_page((unsigned long)page);
749 return err;
750}
751
752static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
753{
754 seq_puts(m, "/");
755 return 0;
756}
757
729/* 758/*
730 * Present statistical information for this VFS mountpoint 759 * Present statistical information for this VFS mountpoint
731 */ 760 */
@@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
979 return 1; 1008 return 1;
980} 1009}
981 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 return !option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
982/* 1032/*
983 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
984 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1127 * options that take numeric values 1177 * options that take numeric values
1128 */ 1178 */
1129 case Opt_port: 1179 case Opt_port:
1130 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1131 if (string == NULL) 1181 option > USHRT_MAX)
1132 goto out_nomem;
1133 rc = strict_strtoul(string, 10, &option);
1134 kfree(string);
1135 if (rc != 0 || option > USHRT_MAX)
1136 goto out_invalid_value; 1182 goto out_invalid_value;
1137 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1138 break; 1184 break;
1139 case Opt_rsize: 1185 case Opt_rsize:
1140 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1141 if (string == NULL)
1142 goto out_nomem;
1143 rc = strict_strtoul(string, 10, &option);
1144 kfree(string);
1145 if (rc != 0)
1146 goto out_invalid_value; 1187 goto out_invalid_value;
1147 mnt->rsize = option; 1188 mnt->rsize = option;
1148 break; 1189 break;
1149 case Opt_wsize: 1190 case Opt_wsize:
1150 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1151 if (string == NULL)
1152 goto out_nomem;
1153 rc = strict_strtoul(string, 10, &option);
1154 kfree(string);
1155 if (rc != 0)
1156 goto out_invalid_value; 1192 goto out_invalid_value;
1157 mnt->wsize = option; 1193 mnt->wsize = option;
1158 break; 1194 break;
1159 case Opt_bsize: 1195 case Opt_bsize:
1160 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1161 if (string == NULL)
1162 goto out_nomem;
1163 rc = strict_strtoul(string, 10, &option);
1164 kfree(string);
1165 if (rc != 0)
1166 goto out_invalid_value; 1197 goto out_invalid_value;
1167 mnt->bsize = option; 1198 mnt->bsize = option;
1168 break; 1199 break;
1169 case Opt_timeo: 1200 case Opt_timeo:
1170 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1171 if (string == NULL)
1172 goto out_nomem;
1173 rc = strict_strtoul(string, 10, &option);
1174 kfree(string);
1175 if (rc != 0 || option == 0)
1176 goto out_invalid_value; 1202 goto out_invalid_value;
1177 mnt->timeo = option; 1203 mnt->timeo = option;
1178 break; 1204 break;
1179 case Opt_retrans: 1205 case Opt_retrans:
1180 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1181 if (string == NULL)
1182 goto out_nomem;
1183 rc = strict_strtoul(string, 10, &option);
1184 kfree(string);
1185 if (rc != 0 || option == 0)
1186 goto out_invalid_value; 1207 goto out_invalid_value;
1187 mnt->retrans = option; 1208 mnt->retrans = option;
1188 break; 1209 break;
1189 case Opt_acregmin: 1210 case Opt_acregmin:
1190 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1191 if (string == NULL)
1192 goto out_nomem;
1193 rc = strict_strtoul(string, 10, &option);
1194 kfree(string);
1195 if (rc != 0)
1196 goto out_invalid_value; 1212 goto out_invalid_value;
1197 mnt->acregmin = option; 1213 mnt->acregmin = option;
1198 break; 1214 break;
1199 case Opt_acregmax: 1215 case Opt_acregmax:
1200 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1201 if (string == NULL)
1202 goto out_nomem;
1203 rc = strict_strtoul(string, 10, &option);
1204 kfree(string);
1205 if (rc != 0)
1206 goto out_invalid_value; 1217 goto out_invalid_value;
1207 mnt->acregmax = option; 1218 mnt->acregmax = option;
1208 break; 1219 break;
1209 case Opt_acdirmin: 1220 case Opt_acdirmin:
1210 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1211 if (string == NULL)
1212 goto out_nomem;
1213 rc = strict_strtoul(string, 10, &option);
1214 kfree(string);
1215 if (rc != 0)
1216 goto out_invalid_value; 1222 goto out_invalid_value;
1217 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1218 break; 1224 break;
1219 case Opt_acdirmax: 1225 case Opt_acdirmax:
1220 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1221 if (string == NULL)
1222 goto out_nomem;
1223 rc = strict_strtoul(string, 10, &option);
1224 kfree(string);
1225 if (rc != 0)
1226 goto out_invalid_value; 1227 goto out_invalid_value;
1227 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1228 break; 1229 break;
1229 case Opt_actimeo: 1230 case Opt_actimeo:
1230 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1231 if (string == NULL)
1232 goto out_nomem;
1233 rc = strict_strtoul(string, 10, &option);
1234 kfree(string);
1235 if (rc != 0)
1236 goto out_invalid_value; 1232 goto out_invalid_value;
1237 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1238 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1239 break; 1235 break;
1240 case Opt_namelen: 1236 case Opt_namelen:
1241 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1242 if (string == NULL)
1243 goto out_nomem;
1244 rc = strict_strtoul(string, 10, &option);
1245 kfree(string);
1246 if (rc != 0)
1247 goto out_invalid_value; 1238 goto out_invalid_value;
1248 mnt->namlen = option; 1239 mnt->namlen = option;
1249 break; 1240 break;
1250 case Opt_mountport: 1241 case Opt_mountport:
1251 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1252 if (string == NULL) 1243 option > USHRT_MAX)
1253 goto out_nomem;
1254 rc = strict_strtoul(string, 10, &option);
1255 kfree(string);
1256 if (rc != 0 || option > USHRT_MAX)
1257 goto out_invalid_value; 1244 goto out_invalid_value;
1258 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1259 break; 1246 break;
1260 case Opt_mountvers: 1247 case Opt_mountvers:
1261 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1262 if (string == NULL)
1263 goto out_nomem;
1264 rc = strict_strtoul(string, 10, &option);
1265 kfree(string);
1266 if (rc != 0 ||
1267 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1268 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1269 goto out_invalid_value; 1251 goto out_invalid_value;
1270 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1271 break; 1253 break;
1272 case Opt_nfsvers: 1254 case Opt_nfsvers:
1273 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1274 if (string == NULL)
1275 goto out_nomem;
1276 rc = strict_strtoul(string, 10, &option);
1277 kfree(string);
1278 if (rc != 0)
1279 goto out_invalid_value; 1256 goto out_invalid_value;
1280 switch (option) { 1257 switch (option) {
1281 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1295 } 1272 }
1296 break; 1273 break;
1297 case Opt_minorversion: 1274 case Opt_minorversion:
1298 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1299 if (string == NULL)
1300 goto out_nomem;
1301 rc = strict_strtoul(string, 10, &option);
1302 kfree(string);
1303 if (rc != 0)
1304 goto out_invalid_value; 1276 goto out_invalid_value;
1305 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1306 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1336 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1337 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1338 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1339 kfree(string);
1340 break; 1311 break;
1341 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1342 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1343 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1344 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1345 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1346 kfree(string);
1347 break; 1317 break;
1348 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1349 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1350 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1351 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1352 xprt_load_transport(string); 1322 xprt_load_transport(string);
1353 kfree(string);
1354 break; 1323 break;
1355 default: 1324 default:
1356 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1358 kfree(string); 1327 kfree(string);
1359 return 0; 1328 return 0;
1360 } 1329 }
1330 kfree(string);
1361 break; 1331 break;
1362 case Opt_mountproto: 1332 case Opt_mountproto:
1363 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1400 goto out_invalid_address; 1370 goto out_invalid_address;
1401 break; 1371 break;
1402 case Opt_clientaddr: 1372 case Opt_clientaddr:
1403 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1404 if (string == NULL)
1405 goto out_nomem; 1374 goto out_nomem;
1406 kfree(mnt->client_address);
1407 mnt->client_address = string;
1408 break; 1375 break;
1409 case Opt_mounthost: 1376 case Opt_mounthost:
1410 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1411 if (string == NULL) 1378 &mnt->mount_server.hostname))
1412 goto out_nomem; 1379 goto out_nomem;
1413 kfree(mnt->mount_server.hostname);
1414 mnt->mount_server.hostname = string;
1415 break; 1380 break;
1416 case Opt_mountaddr: 1381 case Opt_mountaddr:
1417 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1451 }; 1416 };
1452 break; 1417 break;
1453 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1454 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1455 if (string == NULL)
1456 goto out_nomem; 1420 goto out_nomem;
1457 kfree(mnt->fscache_uniq);
1458 mnt->fscache_uniq = string;
1459 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1460 break; 1422 break;
1461 case Opt_local_lock: 1423 case Opt_local_lock:
@@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1665 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1666} 1628}
1667 1629
1668static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1669 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1670 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1671{ 1642{
1672 size_t len; 1643 size_t len;
1673 char *colon, *comma; 1644 char *end;
1674
1675 colon = strchr(dev_name, ':');
1676 if (colon == NULL)
1677 goto out_bad_devname;
1678
1679 len = colon - dev_name;
1680 if (len > maxnamlen)
1681 goto out_hostname;
1682 1645
1683 /* N.B. caller will free nfs_server.hostname in all cases */ 1646 /* Is the host name protected with square brakcets? */
1684 *hostname = kstrndup(dev_name, len, GFP_KERNEL); 1647 if (*dev_name == '[') {
1685 if (!*hostname) 1648 end = strchr(++dev_name, ']');
1686 goto out_nomem; 1649 if (end == NULL || end[1] != ':')
1687
1688 /* kill possible hostname list: not supported */
1689 comma = strchr(*hostname, ',');
1690 if (comma != NULL) {
1691 if (comma == *hostname)
1692 goto out_bad_devname; 1650 goto out_bad_devname;
1693 *comma = '\0';
1694 }
1695 1651
1696 colon++; 1652 len = end - dev_name;
1697 len = strlen(colon); 1653 end++;
1698 if (len > maxpathlen) 1654 } else {
1699 goto out_path; 1655 char *comma;
1700 *export_path = kstrndup(colon, len, GFP_KERNEL);
1701 if (!*export_path)
1702 goto out_nomem;
1703
1704 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1705 return 0;
1706
1707out_bad_devname:
1708 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1709 return -EINVAL;
1710
1711out_nomem:
1712 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1713 return -ENOMEM;
1714
1715out_hostname:
1716 dfprintk(MOUNT, "NFS: server hostname too long\n");
1717 return -ENAMETOOLONG;
1718
1719out_path:
1720 dfprintk(MOUNT, "NFS: export pathname too long\n");
1721 return -ENAMETOOLONG;
1722}
1723
1724/*
1725 * Hostname has square brackets around it because it contains one or
1726 * more colons. We look for the first closing square bracket, and a
1727 * colon must follow it.
1728 */
1729static int nfs_parse_protected_hostname(const char *dev_name,
1730 char **hostname, size_t maxnamlen,
1731 char **export_path, size_t maxpathlen)
1732{
1733 size_t len;
1734 char *start, *end;
1735 1656
1736 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1737 1661
1738 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1739 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1740 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1741 if (*(end + 1) != ':') 1665 *comma = 0;
1742 goto out_bad_devname; 1666 }
1743 1667
1744 len = end - start;
1745 if (len > maxnamlen) 1668 if (len > maxnamlen)
1746 goto out_hostname; 1669 goto out_hostname;
1747 1670
1748 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1749 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1750 if (*hostname == NULL) 1673 if (*hostname == NULL)
1751 goto out_nomem; 1674 goto out_nomem;
1752 1675 len = strlen(++end);
1753 end += 2;
1754 len = strlen(end);
1755 if (len > maxpathlen) 1676 if (len > maxpathlen)
1756 goto out_path; 1677 goto out_path;
1757 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1758 if (!*export_path) 1679 if (!*export_path)
1759 goto out_nomem; 1680 goto out_nomem;
1760 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1761 return 0; 1683 return 0;
1762 1684
1763out_bad_devname: 1685out_bad_devname:
@@ -1778,29 +1700,6 @@ out_path:
1778} 1700}
1779 1701
1780/* 1702/*
1781 * Split "dev_name" into "hostname:export_path".
1782 *
1783 * The leftmost colon demarks the split between the server's hostname
1784 * and the export path. If the hostname starts with a left square
1785 * bracket, then it may contain colons.
1786 *
1787 * Note: caller frees hostname and export path, even on error.
1788 */
1789static int nfs_parse_devname(const char *dev_name,
1790 char **hostname, size_t maxnamlen,
1791 char **export_path, size_t maxpathlen)
1792{
1793 if (*dev_name == '[')
1794 return nfs_parse_protected_hostname(dev_name,
1795 hostname, maxnamlen,
1796 export_path, maxpathlen);
1797
1798 return nfs_parse_simple_hostname(dev_name,
1799 hostname, maxnamlen,
1800 export_path, maxpathlen);
1801}
1802
1803/*
1804 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1805 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1806 * 1705 *
@@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server)
2267 return bdi_register_dev(&server->backing_dev_info, server->s_dev); 2166 return bdi_register_dev(&server->backing_dev_info, server->s_dev);
2268} 2167}
2269 2168
2270static int nfs_get_sb(struct file_system_type *fs_type, 2169static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2271 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2170 int flags, const char *dev_name, void *raw_data)
2272{ 2171{
2273 struct nfs_server *server = NULL; 2172 struct nfs_server *server = NULL;
2274 struct super_block *s; 2173 struct super_block *s;
2275 struct nfs_parsed_mount_data *data; 2174 struct nfs_parsed_mount_data *data;
2276 struct nfs_fh *mntfh; 2175 struct nfs_fh *mntfh;
2277 struct dentry *mntroot; 2176 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2278 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2177 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2279 struct nfs_sb_mountdata sb_mntdata = { 2178 struct nfs_sb_mountdata sb_mntdata = {
2280 .mntflags = flags, 2179 .mntflags = flags,
2281 }; 2180 };
2282 int error = -ENOMEM; 2181 int error;
2283 2182
2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2183 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2285 mntfh = nfs_alloc_fhandle(); 2184 mntfh = nfs_alloc_fhandle();
@@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2290 2189
2291 /* Validate the mount data */ 2190 /* Validate the mount data */
2292 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2191 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
2293 if (error < 0) 2192 if (error < 0) {
2193 mntroot = ERR_PTR(error);
2294 goto out; 2194 goto out;
2195 }
2295 2196
2296#ifdef CONFIG_NFS_V4 2197#ifdef CONFIG_NFS_V4
2297 if (data->version == 4) { 2198 if (data->version == 4) {
2298 error = nfs4_try_mount(flags, dev_name, data, mnt); 2199 mntroot = nfs4_try_mount(flags, dev_name, data);
2299 kfree(data->client_address); 2200 kfree(data->client_address);
2300 kfree(data->nfs_server.export_path); 2201 kfree(data->nfs_server.export_path);
2301 goto out; 2202 goto out;
@@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2305 /* Get a volume representation */ 2206 /* Get a volume representation */
2306 server = nfs_create_server(data, mntfh); 2207 server = nfs_create_server(data, mntfh);
2307 if (IS_ERR(server)) { 2208 if (IS_ERR(server)) {
2308 error = PTR_ERR(server); 2209 mntroot = ERR_CAST(server);
2309 goto out; 2210 goto out;
2310 } 2211 }
2311 sb_mntdata.server = server; 2212 sb_mntdata.server = server;
@@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2316 /* Get a superblock - note that we may end up sharing one that already exists */ 2217 /* Get a superblock - note that we may end up sharing one that already exists */
2317 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2218 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2318 if (IS_ERR(s)) { 2219 if (IS_ERR(s)) {
2319 error = PTR_ERR(s); 2220 mntroot = ERR_CAST(s);
2320 goto out_err_nosb; 2221 goto out_err_nosb;
2321 } 2222 }
2322 2223
@@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2325 server = NULL; 2226 server = NULL;
2326 } else { 2227 } else {
2327 error = nfs_bdi_register(server); 2228 error = nfs_bdi_register(server);
2328 if (error) 2229 if (error) {
2230 mntroot = ERR_PTR(error);
2329 goto error_splat_bdi; 2231 goto error_splat_bdi;
2232 }
2330 } 2233 }
2331 2234
2332 if (!s->s_root) { 2235 if (!s->s_root) {
@@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2336 s, data ? data->fscache_uniq : NULL, NULL); 2239 s, data ? data->fscache_uniq : NULL, NULL);
2337 } 2240 }
2338 2241
2339 mntroot = nfs_get_root(s, mntfh); 2242 mntroot = nfs_get_root(s, mntfh, dev_name);
2340 if (IS_ERR(mntroot)) { 2243 if (IS_ERR(mntroot))
2341 error = PTR_ERR(mntroot);
2342 goto error_splat_super; 2244 goto error_splat_super;
2343 }
2344 2245
2345 error = security_sb_set_mnt_opts(s, &data->lsm_opts); 2246 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2346 if (error) 2247 if (error)
2347 goto error_splat_root; 2248 goto error_splat_root;
2348 2249
2349 s->s_flags |= MS_ACTIVE; 2250 s->s_flags |= MS_ACTIVE;
2350 mnt->mnt_sb = s;
2351 mnt->mnt_root = mntroot;
2352 error = 0;
2353 2251
2354out: 2252out:
2355 kfree(data->nfs_server.hostname); 2253 kfree(data->nfs_server.hostname);
@@ -2359,7 +2257,7 @@ out:
2359out_free_fh: 2257out_free_fh:
2360 nfs_free_fhandle(mntfh); 2258 nfs_free_fhandle(mntfh);
2361 kfree(data); 2259 kfree(data);
2362 return error; 2260 return mntroot;
2363 2261
2364out_err_nosb: 2262out_err_nosb:
2365 nfs_free_server(server); 2263 nfs_free_server(server);
@@ -2367,6 +2265,7 @@ out_err_nosb:
2367 2265
2368error_splat_root: 2266error_splat_root:
2369 dput(mntroot); 2267 dput(mntroot);
2268 mntroot = ERR_PTR(error);
2370error_splat_super: 2269error_splat_super:
2371 if (server && !s->s_root) 2270 if (server && !s->s_root)
2372 bdi_unregister(&server->backing_dev_info); 2271 bdi_unregister(&server->backing_dev_info);
@@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2450 nfs_fscache_get_super_cookie(s, NULL, data); 2349 nfs_fscache_get_super_cookie(s, NULL, data);
2451 } 2350 }
2452 2351
2453 mntroot = nfs_get_root(s, data->fh); 2352 mntroot = nfs_get_root(s, data->fh, dev_name);
2454 if (IS_ERR(mntroot)) { 2353 if (IS_ERR(mntroot)) {
2455 error = PTR_ERR(mntroot); 2354 error = PTR_ERR(mntroot);
2456 goto error_splat_super; 2355 goto error_splat_super;
@@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2718 s, data ? data->fscache_uniq : NULL, NULL); 2617 s, data ? data->fscache_uniq : NULL, NULL);
2719 } 2618 }
2720 2619
2721 mntroot = nfs4_get_root(s, mntfh); 2620 mntroot = nfs4_get_root(s, mntfh, dev_name);
2722 if (IS_ERR(mntroot)) { 2621 if (IS_ERR(mntroot)) {
2723 error = PTR_ERR(mntroot); 2622 error = PTR_ERR(mntroot);
2724 goto error_splat_super; 2623 goto error_splat_super;
@@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2771 return root_mnt; 2670 return root_mnt;
2772} 2671}
2773 2672
2774static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2775{
2776 char *page = (char *) __get_free_page(GFP_KERNEL);
2777 char *devname, *tmp;
2778
2779 if (page == NULL)
2780 return;
2781 devname = nfs_path(path->mnt->mnt_devname,
2782 path->mnt->mnt_root, path->dentry,
2783 page, PAGE_SIZE);
2784 if (IS_ERR(devname))
2785 goto out_freepage;
2786 tmp = kstrdup(devname, GFP_KERNEL);
2787 if (tmp == NULL)
2788 goto out_freepage;
2789 kfree(mnt->mnt_devname);
2790 mnt->mnt_devname = tmp;
2791out_freepage:
2792 free_page((unsigned long)page);
2793}
2794
2795struct nfs_referral_count { 2673struct nfs_referral_count {
2796 struct list_head list; 2674 struct list_head list;
2797 const struct task_struct *task; 2675 const struct task_struct *task;
@@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void)
2858 kfree(p); 2736 kfree(p);
2859} 2737}
2860 2738
2861static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2739static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2862 const char *export_path, struct vfsmount *mnt_target) 2740 const char *export_path)
2863{ 2741{
2864 struct nameidata *nd = NULL; 2742 struct nameidata *nd = NULL;
2865 struct mnt_namespace *ns_private; 2743 struct mnt_namespace *ns_private;
2866 struct super_block *s; 2744 struct super_block *s;
2745 struct dentry *dentry;
2867 int ret; 2746 int ret;
2868 2747
2869 nd = kmalloc(sizeof(*nd), GFP_KERNEL); 2748 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2870 if (nd == NULL) 2749 if (nd == NULL)
2871 return -ENOMEM; 2750 return ERR_PTR(-ENOMEM);
2872 2751
2873 ns_private = create_mnt_ns(root_mnt); 2752 ns_private = create_mnt_ns(root_mnt);
2874 ret = PTR_ERR(ns_private); 2753 ret = PTR_ERR(ns_private);
@@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2890 2769
2891 s = nd->path.mnt->mnt_sb; 2770 s = nd->path.mnt->mnt_sb;
2892 atomic_inc(&s->s_active); 2771 atomic_inc(&s->s_active);
2893 mnt_target->mnt_sb = s; 2772 dentry = dget(nd->path.dentry);
2894 mnt_target->mnt_root = dget(nd->path.dentry);
2895
2896 /* Correct the device pathname */
2897 nfs_fix_devname(&nd->path, mnt_target);
2898 2773
2899 path_put(&nd->path); 2774 path_put(&nd->path);
2900 kfree(nd); 2775 kfree(nd);
2901 down_write(&s->s_umount); 2776 down_write(&s->s_umount);
2902 return 0; 2777 return dentry;
2903out_put_mnt_ns: 2778out_put_mnt_ns:
2904 put_mnt_ns(ns_private); 2779 put_mnt_ns(ns_private);
2905out_mntput: 2780out_mntput:
2906 mntput(root_mnt); 2781 mntput(root_mnt);
2907out_err: 2782out_err:
2908 kfree(nd); 2783 kfree(nd);
2909 return ret; 2784 return ERR_PTR(ret);
2910} 2785}
2911 2786
2912static int nfs4_try_mount(int flags, const char *dev_name, 2787static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
2913 struct nfs_parsed_mount_data *data, 2788 struct nfs_parsed_mount_data *data)
2914 struct vfsmount *mnt)
2915{ 2789{
2916 char *export_path; 2790 char *export_path;
2917 struct vfsmount *root_mnt; 2791 struct vfsmount *root_mnt;
2918 int error; 2792 struct dentry *res;
2919 2793
2920 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 2794 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2921 2795
@@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
2925 data->nfs_server.hostname); 2799 data->nfs_server.hostname);
2926 data->nfs_server.export_path = export_path; 2800 data->nfs_server.export_path = export_path;
2927 2801
2928 error = PTR_ERR(root_mnt); 2802 res = ERR_CAST(root_mnt);
2929 if (IS_ERR(root_mnt)) 2803 if (!IS_ERR(root_mnt))
2930 goto out; 2804 res = nfs_follow_remote_path(root_mnt, export_path);
2931
2932 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2933 2805
2934out: 2806 dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
2935 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, 2807 IS_ERR(res) ? PTR_ERR(res) : 0,
2936 error != 0 ? " [error]" : ""); 2808 IS_ERR(res) ? " [error]" : "");
2937 return error; 2809 return res;
2938} 2810}
2939 2811
2940/* 2812/*
2941 * Get the superblock for an NFS4 mountpoint 2813 * Get the superblock for an NFS4 mountpoint
2942 */ 2814 */
2943static int nfs4_get_sb(struct file_system_type *fs_type, 2815static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2944 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2816 int flags, const char *dev_name, void *raw_data)
2945{ 2817{
2946 struct nfs_parsed_mount_data *data; 2818 struct nfs_parsed_mount_data *data;
2947 int error = -ENOMEM; 2819 int error = -ENOMEM;
2820 struct dentry *res = ERR_PTR(-ENOMEM);
2948 2821
2949 data = nfs_alloc_parsed_mount_data(4); 2822 data = nfs_alloc_parsed_mount_data(4);
2950 if (data == NULL) 2823 if (data == NULL)
@@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2952 2825
2953 /* Validate the mount data */ 2826 /* Validate the mount data */
2954 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2827 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2955 if (error < 0) 2828 if (error < 0) {
2829 res = ERR_PTR(error);
2956 goto out; 2830 goto out;
2831 }
2957 2832
2958 error = nfs4_try_mount(flags, dev_name, data, mnt); 2833 res = nfs4_try_mount(flags, dev_name, data);
2834 if (IS_ERR(res))
2835 error = PTR_ERR(res);
2959 2836
2960out: 2837out:
2961 kfree(data->client_address); 2838 kfree(data->client_address);
@@ -2964,9 +2841,9 @@ out:
2964 kfree(data->fscache_uniq); 2841 kfree(data->fscache_uniq);
2965out_free_data: 2842out_free_data:
2966 kfree(data); 2843 kfree(data);
2967 dprintk("<-- nfs4_get_sb() = %d%s\n", error, 2844 dprintk("<-- nfs4_mount() = %d%s\n", error,
2968 error != 0 ? " [error]" : ""); 2845 error != 0 ? " [error]" : "");
2969 return error; 2846 return res;
2970} 2847}
2971 2848
2972static void nfs4_kill_super(struct super_block *sb) 2849static void nfs4_kill_super(struct super_block *sb)
@@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
3033 nfs_fscache_get_super_cookie(s, NULL, data); 2910 nfs_fscache_get_super_cookie(s, NULL, data);
3034 } 2911 }
3035 2912
3036 mntroot = nfs4_get_root(s, data->fh); 2913 mntroot = nfs4_get_root(s, data->fh, dev_name);
3037 if (IS_ERR(mntroot)) { 2914 if (IS_ERR(mntroot)) {
3038 error = PTR_ERR(mntroot); 2915 error = PTR_ERR(mntroot);
3039 goto error_splat_super; 2916 goto error_splat_super;
@@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3120 nfs_fscache_get_super_cookie(s, NULL, data); 2997 nfs_fscache_get_super_cookie(s, NULL, data);
3121 } 2998 }
3122 2999
3123 mntroot = nfs4_get_root(s, mntfh); 3000 mntroot = nfs4_get_root(s, mntfh, dev_name);
3124 if (IS_ERR(mntroot)) { 3001 if (IS_ERR(mntroot)) {
3125 error = PTR_ERR(mntroot); 3002 error = PTR_ERR(mntroot);
3126 goto error_splat_super; 3003 goto error_splat_super;
@@ -3160,16 +3037,15 @@ error_splat_bdi:
3160/* 3037/*
3161 * Create an NFS4 server record on referral traversal 3038 * Create an NFS4 server record on referral traversal
3162 */ 3039 */
3163static int nfs4_referral_get_sb(struct file_system_type *fs_type, 3040static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
3164 int flags, const char *dev_name, void *raw_data, 3041 int flags, const char *dev_name, void *raw_data)
3165 struct vfsmount *mnt)
3166{ 3042{
3167 struct nfs_clone_mount *data = raw_data; 3043 struct nfs_clone_mount *data = raw_data;
3168 char *export_path; 3044 char *export_path;
3169 struct vfsmount *root_mnt; 3045 struct vfsmount *root_mnt;
3170 int error; 3046 struct dentry *res;
3171 3047
3172 dprintk("--> nfs4_referral_get_sb()\n"); 3048 dprintk("--> nfs4_referral_mount()\n");
3173 3049
3174 export_path = data->mnt_path; 3050 export_path = data->mnt_path;
3175 data->mnt_path = "/"; 3051 data->mnt_path = "/";
@@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
3178 flags, data, data->hostname); 3054 flags, data, data->hostname);
3179 data->mnt_path = export_path; 3055 data->mnt_path = export_path;
3180 3056
3181 error = PTR_ERR(root_mnt); 3057 res = ERR_CAST(root_mnt);
3182 if (IS_ERR(root_mnt)) 3058 if (!IS_ERR(root_mnt))
3183 goto out; 3059 res = nfs_follow_remote_path(root_mnt, export_path);
3184 3060 dprintk("<-- nfs4_referral_mount() = %ld%s\n",
3185 error = nfs_follow_remote_path(root_mnt, export_path, mnt); 3061 IS_ERR(res) ? PTR_ERR(res) : 0,
3186out: 3062 IS_ERR(res) ? " [error]" : "");
3187 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, 3063 return res;
3188 error != 0 ? " [error]" : "");
3189 return error;
3190} 3064}
3191 3065
3192#endif /* CONFIG_NFS_V4 */ 3066#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..8d6864c2a5fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret = 0;
151 void *devname_garbage = NULL;
151 152
152 /* 153 /*
153 * Hey, we raced with lookup... See if we need to transfer 154 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
157 spin_lock(&alias->d_lock); 158 spin_lock(&alias->d_lock);
158 if (alias->d_inode != NULL && 159 if (alias->d_inode != NULL &&
159 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata;
160 alias->d_fsdata = data; 162 alias->d_fsdata = data;
161 alias->d_flags |= DCACHE_NFSFS_RENAMED; 163 alias->d_flags |= DCACHE_NFSFS_RENAMED;
162 ret = 1; 164 ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
164 spin_unlock(&alias->d_lock); 166 spin_unlock(&alias->d_lock);
165 nfs_dec_sillycount(dir); 167 nfs_dec_sillycount(dir);
166 dput(alias); 168 dput(alias);
169 /*
170 * If we'd displaced old cached devname, free it. At that
171 * point dentry is definitely not a root, so we won't need
172 * that anymore.
173 */
174 if (devname_garbage)
175 kfree(devname_garbage);
167 return ret; 176 return ret;
168 } 177 }
169 data->dir = igrab(dir); 178 data->dir = igrab(dir);
@@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
180 task_setup_data.rpc_client = NFS_CLIENT(dir); 189 task_setup_data.rpc_client = NFS_CLIENT(dir);
181 task = rpc_run_task(&task_setup_data); 190 task = rpc_run_task(&task_setup_data);
182 if (!IS_ERR(task)) 191 if (!IS_ERR(task))
183 rpc_put_task(task); 192 rpc_put_task_async(task);
184 return 1; 193 return 1;
185} 194}
186 195
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
252{ 261{
253 struct nfs_unlinkdata *data; 262 struct nfs_unlinkdata *data;
254 int status = -ENOMEM; 263 int status = -ENOMEM;
264 void *devname_garbage = NULL;
255 265
256 data = kzalloc(sizeof(*data), GFP_KERNEL); 266 data = kzalloc(sizeof(*data), GFP_KERNEL);
257 if (data == NULL) 267 if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
269 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 279 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
270 goto out_unlock; 280 goto out_unlock;
271 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 281 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
282 devname_garbage = dentry->d_fsdata;
272 dentry->d_fsdata = data; 283 dentry->d_fsdata = data;
273 spin_unlock(&dentry->d_lock); 284 spin_unlock(&dentry->d_lock);
285 /*
286 * If we'd displaced old cached devname, free it. At that
287 * point dentry is definitely not a root, so we won't need
288 * that anymore.
289 */
290 if (devname_garbage)
291 kfree(devname_garbage);
274 return 0; 292 return 0;
275out_unlock: 293out_unlock:
276 spin_unlock(&dentry->d_lock); 294 spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 317 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
300 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 318 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
301 data = dentry->d_fsdata; 319 data = dentry->d_fsdata;
320 dentry->d_fsdata = NULL;
302 } 321 }
303 spin_unlock(&dentry->d_lock); 322 spin_unlock(&dentry->d_lock);
304 323
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
315 struct nfs_unlinkdata *data = dentry->d_fsdata; 334 struct nfs_unlinkdata *data = dentry->d_fsdata;
316 335
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 336 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
337 dentry->d_fsdata = NULL;
318 spin_unlock(&dentry->d_lock); 338 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data); 339 nfs_free_unlinkdata(data);
320 return; 340 return;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 10d648ea128b..47a3ad63e0d5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -96,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 97
97static void nfs_writedata_release(struct nfs_write_data *wdata) 98static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 99{
100 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 101 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 102 nfs_writedata_free(wdata);
101} 103}
@@ -781,25 +783,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 783 return RPC_PRIORITY_NORMAL;
782} 784}
783 785
784/* 786int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 787 struct rpc_clnt *clnt,
786 */ 788 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 789 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 790{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 791 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 792 int priority = flush_task_priority(how);
795 struct rpc_task *task; 793 struct rpc_task *task;
796 struct rpc_message msg = { 794 struct rpc_message msg = {
797 .rpc_argp = &data->args, 795 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 796 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 797 .rpc_cred = data->cred,
800 }; 798 };
801 struct rpc_task_setup task_setup_data = { 799 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 800 .rpc_client = clnt,
803 .task = &data->task, 801 .task = &data->task,
804 .rpc_message = &msg, 802 .rpc_message = &msg,
805 .callback_ops = call_ops, 803 .callback_ops = call_ops,
@@ -810,12 +808,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 808 };
811 int ret = 0; 809 int ret = 0;
812 810
811 /* Set up the initial task struct. */
812 NFS_PROTO(inode)->write_setup(data, &msg);
813
814 dprintk("NFS: %5u initiated write call "
815 "(req %s/%lld, %u bytes @ offset %llu)\n",
816 data->task.tk_pid,
817 inode->i_sb->s_id,
818 (long long)NFS_FILEID(inode),
819 data->args.count,
820 (unsigned long long)data->args.offset);
821
822 task = rpc_run_task(&task_setup_data);
823 if (IS_ERR(task)) {
824 ret = PTR_ERR(task);
825 goto out;
826 }
827 if (how & FLUSH_SYNC) {
828 ret = rpc_wait_for_completion_task(task);
829 if (ret == 0)
830 ret = task->tk_status;
831 }
832 rpc_put_task(task);
833out:
834 return ret;
835}
836EXPORT_SYMBOL_GPL(nfs_initiate_write);
837
838/*
839 * Set up the argument/result storage required for the RPC call.
840 */
841static int nfs_write_rpcsetup(struct nfs_page *req,
842 struct nfs_write_data *data,
843 const struct rpc_call_ops *call_ops,
844 unsigned int count, unsigned int offset,
845 struct pnfs_layout_segment *lseg,
846 int how)
847{
848 struct inode *inode = req->wb_context->path.dentry->d_inode;
849
813 /* Set up the RPC argument and reply structs 850 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 851 * NB: take care not to mess about with data->commit et al. */
815 852
816 data->req = req; 853 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 854 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 855 data->cred = req->wb_context->cred;
856 data->lseg = get_lseg(lseg);
819 857
820 data->args.fh = NFS_FH(inode); 858 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 859 data->args.offset = req_offset(req) + offset;
@@ -836,30 +874,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 874 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 875 nfs_fattr_init(&data->fattr);
838 876
839 /* Set up the initial task struct. */ 877 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 878 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 879 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 880
850 task = rpc_run_task(&task_setup_data); 881 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 882}
864 883
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 884/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 898 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 899 * contiguous dirty area on one page.
881 */ 900 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 901static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 902{
884 struct nfs_page *req = nfs_list_entry(head->next); 903 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 904 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 905 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 906 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 907 unsigned int offset;
889 int requests = 0; 908 int requests = 0;
890 int ret = 0; 909 int ret = 0;
910 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 911 LIST_HEAD(list);
892 912
893 nfs_list_remove_request(req); 913 nfs_list_remove_request(req);
894 914
895 nbytes = count; 915 nbytes = desc->pg_count;
896 do { 916 do {
897 size_t len = min(nbytes, wsize); 917 size_t len = min(nbytes, wsize);
898 918
@@ -905,9 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 925 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 926 atomic_set(&req->wb_complete, requests);
907 927
928 BUG_ON(desc->pg_lseg);
929 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 930 ClearPageError(page);
909 offset = 0; 931 offset = 0;
910 nbytes = count; 932 nbytes = desc->pg_count;
911 do { 933 do {
912 int ret2; 934 int ret2;
913 935
@@ -919,20 +941,22 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 941 if (nbytes < wsize)
920 wsize = nbytes; 942 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 943 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 944 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 945 if (ret == 0)
924 ret = ret2; 946 ret = ret2;
925 offset += wsize; 947 offset += wsize;
926 nbytes -= wsize; 948 nbytes -= wsize;
927 } while (nbytes != 0); 949 } while (nbytes != 0);
928 950
951 put_lseg(lseg);
952 desc->pg_lseg = NULL;
929 return ret; 953 return ret;
930 954
931out_bad: 955out_bad:
932 while (!list_empty(&list)) { 956 while (!list_empty(&list)) {
933 data = list_entry(list.next, struct nfs_write_data, pages); 957 data = list_entry(list.next, struct nfs_write_data, pages);
934 list_del(&data->pages); 958 list_del(&data->pages);
935 nfs_writedata_release(data); 959 nfs_writedata_free(data);
936 } 960 }
937 nfs_redirty_request(req); 961 nfs_redirty_request(req);
938 return -ENOMEM; 962 return -ENOMEM;
@@ -946,16 +970,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 970 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 971 * that has been written but not committed.
948 */ 972 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 973static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 974{
951 struct nfs_page *req; 975 struct nfs_page *req;
952 struct page **pages; 976 struct page **pages;
953 struct nfs_write_data *data; 977 struct nfs_write_data *data;
978 struct list_head *head = &desc->pg_list;
979 struct pnfs_layout_segment *lseg = desc->pg_lseg;
980 int ret;
954 981
955 data = nfs_writedata_alloc(npages); 982 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 983 desc->pg_count));
957 goto out_bad; 984 if (!data) {
958 985 while (!list_empty(head)) {
986 req = nfs_list_entry(head->next);
987 nfs_list_remove_request(req);
988 nfs_redirty_request(req);
989 }
990 ret = -ENOMEM;
991 goto out;
992 }
959 pages = data->pagevec; 993 pages = data->pagevec;
960 while (!list_empty(head)) { 994 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 995 req = nfs_list_entry(head->next);
@@ -965,16 +999,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 999 *pages++ = req->wb_page;
966 } 1000 }
967 req = nfs_list_entry(data->pages.next); 1001 req = nfs_list_entry(data->pages.next);
1002 if ((!lseg) && list_is_singular(&data->pages))
1003 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
968 1004
969 /* Set up the argument struct */ 1005 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1006 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1007out:
972 while (!list_empty(head)) { 1008 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1009 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1010 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1011}
979 1012
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1013static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1015,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1015{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1016 size_t wsize = NFS_SERVER(inode)->wsize;
984 1017
1018 pnfs_pageio_init_write(pgio, inode);
1019
985 if (wsize < PAGE_CACHE_SIZE) 1020 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1021 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1022 else
@@ -1132,7 +1167,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1167/*
1133 * This function is called when the WRITE call is complete. 1168 * This function is called when the WRITE call is complete.
1134 */ 1169 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1170void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1171{
1137 struct nfs_writeargs *argp = &data->args; 1172 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1173 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1186,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1186 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1187 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1188 if (status != 0)
1154 return status; 1189 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1190 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1191
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1201 */
1167 static unsigned long complain; 1202 static unsigned long complain;
1168 1203
1204 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1205 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1206 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1207 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1222,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1222 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1223 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1224 /* Resend from where the server left off */
1225 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1226 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1227 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1228 argp->count -= resp->count;
@@ -1196,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1233 argp->stable = NFS_FILE_SYNC;
1197 } 1234 }
1198 nfs_restart_rpc(task, server->nfs_client); 1235 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1236 return;
1200 } 1237 }
1201 if (time_before(complain, jiffies)) { 1238 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1239 printk(KERN_WARNING
@@ -1207,7 +1244,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1244 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1245 task->tk_status = -EIO;
1209 } 1246 }
1210 return 0; 1247 return;
1211} 1248}
1212 1249
1213 1250
@@ -1292,6 +1329,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1292 task = rpc_run_task(&task_setup_data); 1329 task = rpc_run_task(&task_setup_data);
1293 if (IS_ERR(task)) 1330 if (IS_ERR(task))
1294 return PTR_ERR(task); 1331 return PTR_ERR(task);
1332 if (how & FLUSH_SYNC)
1333 rpc_wait_for_completion_task(task);
1295 rpc_put_task(task); 1334 rpc_put_task(task);
1296 return 0; 1335 return 0;
1297} 1336}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c03..84c27d69d421 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
42 gid_t gid; 42 gid_t gid;
43}; 43};
44 44
45struct nfsacl_simple_acl {
46 struct posix_acl acl;
47 struct posix_acl_entry ace[4];
48};
49
45static int 50static int
46xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) 51xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
47{ 52{
@@ -72,9 +77,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
72 return 0; 77 return 0;
73} 78}
74 79
75unsigned int 80/**
76nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, 81 * nfsacl_encode - Encode an NFSv3 ACL
77 struct posix_acl *acl, int encode_entries, int typeflag) 82 *
83 * @buf: destination xdr_buf to contain XDR encoded ACL
84 * @base: byte offset in xdr_buf where XDR'd ACL begins
85 * @inode: inode of file whose ACL this is
86 * @acl: posix_acl to encode
87 * @encode_entries: whether to encode ACEs as well
88 * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
89 *
90 * Returns size of encoded ACL in bytes or a negative errno value.
91 */
92int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
93 struct posix_acl *acl, int encode_entries, int typeflag)
78{ 94{
79 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; 95 int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
80 struct nfsacl_encode_desc nfsacl_desc = { 96 struct nfsacl_encode_desc nfsacl_desc = {
@@ -88,17 +104,22 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
88 .uid = inode->i_uid, 104 .uid = inode->i_uid,
89 .gid = inode->i_gid, 105 .gid = inode->i_gid,
90 }; 106 };
107 struct nfsacl_simple_acl aclbuf;
91 int err; 108 int err;
92 struct posix_acl *acl2 = NULL;
93 109
94 if (entries > NFS_ACL_MAX_ENTRIES || 110 if (entries > NFS_ACL_MAX_ENTRIES ||
95 xdr_encode_word(buf, base, entries)) 111 xdr_encode_word(buf, base, entries))
96 return -EINVAL; 112 return -EINVAL;
97 if (encode_entries && acl && acl->a_count == 3) { 113 if (encode_entries && acl && acl->a_count == 3) {
98 /* Fake up an ACL_MASK entry. */ 114 struct posix_acl *acl2 = &aclbuf.acl;
99 acl2 = posix_acl_alloc(4, GFP_KERNEL); 115
100 if (!acl2) 116 /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
101 return -ENOMEM; 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */
120 memset(acl2, 0, sizeof(acl2));
121 posix_acl_init(acl2, 4);
122
102 /* Insert entries in canonical order: other orders seem 123 /* Insert entries in canonical order: other orders seem
103 to confuse Solaris VxFS. */ 124 to confuse Solaris VxFS. */
104 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ 125 acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -109,8 +130,6 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
109 nfsacl_desc.acl = acl2; 130 nfsacl_desc.acl = acl2;
110 } 131 }
111 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); 132 err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
112 if (acl2)
113 posix_acl_release(acl2);
114 if (!err) 133 if (!err)
115 err = 8 + nfsacl_desc.desc.elem_size * 134 err = 8 + nfsacl_desc.desc.elem_size *
116 nfsacl_desc.desc.array_len; 135 nfsacl_desc.desc.array_len;
@@ -224,9 +243,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
224 return 0; 243 return 0;
225} 244}
226 245
227unsigned int 246/**
228nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, 247 * nfsacl_decode - Decode an NFSv3 ACL
229 struct posix_acl **pacl) 248 *
249 * @buf: xdr_buf containing XDR'd ACL data to decode
250 * @base: byte offset in xdr_buf where XDR'd ACL begins
251 * @aclcnt: count of ACEs in decoded posix_acl
252 * @pacl: buffer in which to place decoded posix_acl
253 *
254 * Returns the length of the decoded ACL in bytes, or a negative errno value.
255 */
256int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
257 struct posix_acl **pacl)
230{ 258{
231 struct nfsacl_decode_desc nfsacl_desc = { 259 struct nfsacl_decode_desc nfsacl_desc = {
232 .desc = { 260 .desc = {
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
50 37
51static struct { 38static struct {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b31e5f8795d..ad000aeb21a2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -299,7 +299,6 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
299 299
300#define EXPORT_HASHBITS 8 300#define EXPORT_HASHBITS 8
301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) 301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
302#define EXPORT_HASHMASK (EXPORT_HASHMAX -1)
303 302
304static struct cache_head *export_table[EXPORT_HASHMAX]; 303static struct cache_head *export_table[EXPORT_HASHMAX];
305 304
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e18919..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
432 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
433 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
434 */ 434 */
435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
436 if (unlikely(p == NULL)) 436 if (unlikely(p == NULL))
437 goto out_overflow; 437 goto out_overflow;
438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
484out: 484out:
485 return status; 485 return status;
486out_default: 486out_default:
487 return nfs_cb_stat_to_errno(status); 487 return nfs_cb_stat_to_errno(nfserr);
488} 488}
489 489
490/* 490/*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
564 if (unlikely(status)) 564 if (unlikely(status))
565 goto out; 565 goto out;
566 if (unlikely(nfserr != NFS4_OK)) 566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default; 567 status = nfs_cb_stat_to_errno(nfserr);
568out: 568out:
569 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(status);
572} 570}
573 571
574/* 572/*
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6d2c397d458b..55780a22fdbd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -63,7 +63,6 @@ struct ent {
63 63
64#define ENT_HASHBITS 8 64#define ENT_HASHBITS 8
65#define ENT_HASHMAX (1 << ENT_HASHBITS) 65#define ENT_HASHMAX (1 << ENT_HASHBITS)
66#define ENT_HASHMASK (ENT_HASHMAX - 1)
67 66
68static void 67static void
69ent_init(struct cache_head *cnew, struct cache_head *citm) 68ent_init(struct cache_head *cnew, struct cache_head *citm)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index db52546143d1..5fcb1396a7e3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -984,8 +984,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
984 void *); 984 void *);
985enum nfsd4_op_flags { 985enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ 987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ 988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
989}; 989};
990 990
991struct nfsd4_operation { 991struct nfsd4_operation {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285d..fbde6f79922e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -148,7 +148,7 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
148/* hash table for nfs4_file */ 148/* hash table for nfs4_file */
149#define FILE_HASH_BITS 8 149#define FILE_HASH_BITS 8
150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
151#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) 151
152/* hash table for (open)nfs4_stateid */ 152/* hash table for (open)nfs4_stateid */
153#define STATEID_HASH_BITS 10 153#define STATEID_HASH_BITS 10
154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) 154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL;
236 dp->dl_type = type; 233 dp->dl_type = type;
237 dp->dl_stateid.si_boot = boot_time; 234 dp->dl_stateid.si_boot = boot_time;
238 dp->dl_stateid.si_stateownerid = current_delegid++; 235 dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
241 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 238 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
242 dp->dl_time = 0; 239 dp->dl_time = 0;
243 atomic_set(&dp->dl_count, 1); 240 atomic_set(&dp->dl_count, 1);
244 list_add(&dp->dl_perfile, &fp->fi_delegations);
245 list_add(&dp->dl_perclnt, &clp->cl_delegations);
246 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); 241 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
247 return dp; 242 return dp;
248} 243}
@@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
253 if (atomic_dec_and_test(&dp->dl_count)) { 248 if (atomic_dec_and_test(&dp->dl_count)) {
254 dprintk("NFSD: freeing dp %p\n",dp); 249 dprintk("NFSD: freeing dp %p\n",dp);
255 put_nfs4_file(dp->dl_file); 250 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 251 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 252 num_delegations--;
259 } 253 }
260} 254}
261 255
262/* Remove the associated file_lock first, then remove the delegation. 256static void nfs4_put_deleg_lease(struct nfs4_file *fp)
263 * lease_modify() is called to remove the FS_LEASE file_lock from
264 * the i_flock list, eventually calling nfsd's lock_manager
265 * fl_release_callback.
266 */
267static void
268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 257{
270 dprintk("NFSD: close_delegation dp %p\n",dp); 258 if (atomic_dec_and_test(&fp->fi_delegees)) {
271 /* XXX: do we even need this check?: */ 259 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
272 if (dp->dl_flock) 260 fp->fi_lease = NULL;
273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); 261 fp->fi_deleg_file = NULL;
262 }
274} 263}
275 264
276/* Called under the state lock. */ 265/* Called under the state lock. */
277static void 266static void
278unhash_delegation(struct nfs4_delegation *dp) 267unhash_delegation(struct nfs4_delegation *dp)
279{ 268{
280 list_del_init(&dp->dl_perfile);
281 list_del_init(&dp->dl_perclnt); 269 list_del_init(&dp->dl_perclnt);
282 spin_lock(&recall_lock); 270 spin_lock(&recall_lock);
271 list_del_init(&dp->dl_perfile);
283 list_del_init(&dp->dl_recall_lru); 272 list_del_init(&dp->dl_recall_lru);
284 spin_unlock(&recall_lock); 273 spin_unlock(&recall_lock);
285 nfs4_close_delegation(dp); 274 nfs4_put_deleg_lease(dp->dl_file);
286 nfs4_put_delegation(dp); 275 nfs4_put_delegation(dp);
287} 276}
288 277
@@ -327,64 +316,6 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
327static struct list_head client_lru; 316static struct list_head client_lru;
328static struct list_head close_lru; 317static struct list_head close_lru;
329 318
330static void unhash_generic_stateid(struct nfs4_stateid *stp)
331{
332 list_del(&stp->st_hash);
333 list_del(&stp->st_perfile);
334 list_del(&stp->st_perstateowner);
335}
336
337static void free_generic_stateid(struct nfs4_stateid *stp)
338{
339 put_nfs4_file(stp->st_file);
340 kmem_cache_free(stateid_slab, stp);
341}
342
343static void release_lock_stateid(struct nfs4_stateid *stp)
344{
345 struct file *file;
346
347 unhash_generic_stateid(stp);
348 file = find_any_file(stp->st_file);
349 if (file)
350 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
351 free_generic_stateid(stp);
352}
353
354static void unhash_lockowner(struct nfs4_stateowner *sop)
355{
356 struct nfs4_stateid *stp;
357
358 list_del(&sop->so_idhash);
359 list_del(&sop->so_strhash);
360 list_del(&sop->so_perstateid);
361 while (!list_empty(&sop->so_stateids)) {
362 stp = list_first_entry(&sop->so_stateids,
363 struct nfs4_stateid, st_perstateowner);
364 release_lock_stateid(stp);
365 }
366}
367
368static void release_lockowner(struct nfs4_stateowner *sop)
369{
370 unhash_lockowner(sop);
371 nfs4_put_stateowner(sop);
372}
373
374static void
375release_stateid_lockowners(struct nfs4_stateid *open_stp)
376{
377 struct nfs4_stateowner *lock_sop;
378
379 while (!list_empty(&open_stp->st_lockowners)) {
380 lock_sop = list_entry(open_stp->st_lockowners.next,
381 struct nfs4_stateowner, so_perstateid);
382 /* list_del(&open_stp->st_lockowners); */
383 BUG_ON(lock_sop->so_is_open_owner);
384 release_lockowner(lock_sop);
385 }
386}
387
388/* 319/*
389 * We store the NONE, READ, WRITE, and BOTH bits separately in the 320 * We store the NONE, READ, WRITE, and BOTH bits separately in the
390 * st_{access,deny}_bmap field of the stateid, in order to track not 321 * st_{access,deny}_bmap field of the stateid, in order to track not
@@ -457,13 +388,71 @@ static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
457 return nfs4_access_to_omode(access); 388 return nfs4_access_to_omode(access);
458} 389}
459 390
460static void release_open_stateid(struct nfs4_stateid *stp) 391static void unhash_generic_stateid(struct nfs4_stateid *stp)
392{
393 list_del(&stp->st_hash);
394 list_del(&stp->st_perfile);
395 list_del(&stp->st_perstateowner);
396}
397
398static void free_generic_stateid(struct nfs4_stateid *stp)
461{ 399{
462 int oflag = nfs4_access_bmap_to_omode(stp); 400 int oflag = nfs4_access_bmap_to_omode(stp);
463 401
402 nfs4_file_put_access(stp->st_file, oflag);
403 put_nfs4_file(stp->st_file);
404 kmem_cache_free(stateid_slab, stp);
405}
406
407static void release_lock_stateid(struct nfs4_stateid *stp)
408{
409 struct file *file;
410
411 unhash_generic_stateid(stp);
412 file = find_any_file(stp->st_file);
413 if (file)
414 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
415 free_generic_stateid(stp);
416}
417
418static void unhash_lockowner(struct nfs4_stateowner *sop)
419{
420 struct nfs4_stateid *stp;
421
422 list_del(&sop->so_idhash);
423 list_del(&sop->so_strhash);
424 list_del(&sop->so_perstateid);
425 while (!list_empty(&sop->so_stateids)) {
426 stp = list_first_entry(&sop->so_stateids,
427 struct nfs4_stateid, st_perstateowner);
428 release_lock_stateid(stp);
429 }
430}
431
432static void release_lockowner(struct nfs4_stateowner *sop)
433{
434 unhash_lockowner(sop);
435 nfs4_put_stateowner(sop);
436}
437
438static void
439release_stateid_lockowners(struct nfs4_stateid *open_stp)
440{
441 struct nfs4_stateowner *lock_sop;
442
443 while (!list_empty(&open_stp->st_lockowners)) {
444 lock_sop = list_entry(open_stp->st_lockowners.next,
445 struct nfs4_stateowner, so_perstateid);
446 /* list_del(&open_stp->st_lockowners); */
447 BUG_ON(lock_sop->so_is_open_owner);
448 release_lockowner(lock_sop);
449 }
450}
451
452static void release_open_stateid(struct nfs4_stateid *stp)
453{
464 unhash_generic_stateid(stp); 454 unhash_generic_stateid(stp);
465 release_stateid_lockowners(stp); 455 release_stateid_lockowners(stp);
466 nfs4_file_put_access(stp->st_file, oflag);
467 free_generic_stateid(stp); 456 free_generic_stateid(stp);
468} 457}
469 458
@@ -619,7 +608,8 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
619 u32 maxrpc = nfsd_serv->sv_max_mesg; 608 u32 maxrpc = nfsd_serv->sv_max_mesg;
620 609
621 new->maxreqs = numslots; 610 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ; 611 new->maxresp_cached = min_t(u32, req->maxresp_cached,
612 slotsize + NFSD_MIN_HDR_SEQ_SZ);
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); 613 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); 614 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); 615 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
@@ -958,8 +948,6 @@ expire_client(struct nfs4_client *clp)
958 spin_lock(&recall_lock); 948 spin_lock(&recall_lock);
959 while (!list_empty(&clp->cl_delegations)) { 949 while (!list_empty(&clp->cl_delegations)) {
960 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 950 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
961 dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
962 dp->dl_flock);
963 list_del_init(&dp->dl_perclnt); 951 list_del_init(&dp->dl_perclnt);
964 list_move(&dp->dl_recall_lru, &reaplist); 952 list_move(&dp->dl_recall_lru, &reaplist);
965 } 953 }
@@ -2078,6 +2066,7 @@ alloc_init_file(struct inode *ino)
2078 fp->fi_inode = igrab(ino); 2066 fp->fi_inode = igrab(ino);
2079 fp->fi_id = current_fileid++; 2067 fp->fi_id = current_fileid++;
2080 fp->fi_had_conflict = false; 2068 fp->fi_had_conflict = false;
2069 fp->fi_lease = NULL;
2081 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 2070 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2082 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 2071 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2083 spin_lock(&recall_lock); 2072 spin_lock(&recall_lock);
@@ -2329,23 +2318,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2329 nfs4_file_put_access(fp, O_RDONLY); 2318 nfs4_file_put_access(fp, O_RDONLY);
2330} 2319}
2331 2320
2332/* 2321static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2333 * Spawn a thread to perform a recall on the delegation represented
2334 * by the lease (file_lock)
2335 *
2336 * Called from break_lease() with lock_flocks() held.
2337 * Note: we assume break_lease will only call this *once* for any given
2338 * lease.
2339 */
2340static
2341void nfsd_break_deleg_cb(struct file_lock *fl)
2342{ 2322{
2343 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2344
2345 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2346 if (!dp)
2347 return;
2348
2349 /* We're assuming the state code never drops its reference 2323 /* We're assuming the state code never drops its reference
2350 * without first removing the lease. Since we're in this lease 2324 * without first removing the lease. Since we're in this lease
2351 * callback (and since the lease code is serialized by the kernel 2325 * callback (and since the lease code is serialized by the kernel
@@ -2353,22 +2327,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2353 * it's safe to take a reference: */ 2327 * it's safe to take a reference: */
2354 atomic_inc(&dp->dl_count); 2328 atomic_inc(&dp->dl_count);
2355 2329
2356 spin_lock(&recall_lock);
2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2330 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2358 spin_unlock(&recall_lock);
2359 2331
2360 /* only place dl_time is set. protected by lock_flocks*/ 2332 /* only place dl_time is set. protected by lock_flocks*/
2361 dp->dl_time = get_seconds(); 2333 dp->dl_time = get_seconds();
2362 2334
2335 nfsd4_cb_recall(dp);
2336}
2337
2338/* Called from break_lease() with lock_flocks() held. */
2339static void nfsd_break_deleg_cb(struct file_lock *fl)
2340{
2341 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2342 struct nfs4_delegation *dp;
2343
2344 BUG_ON(!fp);
2345 /* We assume break_lease is only called once per lease: */
2346 BUG_ON(fp->fi_had_conflict);
2363 /* 2347 /*
2364 * We don't want the locks code to timeout the lease for us; 2348 * We don't want the locks code to timeout the lease for us;
2365 * we'll remove it ourself if the delegation isn't returned 2349 * we'll remove it ourself if a delegation isn't returned
2366 * in time. 2350 * in time:
2367 */ 2351 */
2368 fl->fl_break_time = 0; 2352 fl->fl_break_time = 0;
2369 2353
2370 dp->dl_file->fi_had_conflict = true; 2354 spin_lock(&recall_lock);
2371 nfsd4_cb_recall(dp); 2355 fp->fi_had_conflict = true;
2356 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2357 nfsd_break_one_deleg(dp);
2358 spin_unlock(&recall_lock);
2372} 2359}
2373 2360
2374static 2361static
@@ -2461,10 +2448,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2461{ 2448{
2462 struct nfs4_delegation *dp; 2449 struct nfs4_delegation *dp;
2463 2450
2464 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { 2451 spin_lock(&recall_lock);
2465 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) 2452 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2453 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2454 spin_unlock(&recall_lock);
2466 return dp; 2455 return dp;
2467 } 2456 }
2457 spin_unlock(&recall_lock);
2468 return NULL; 2458 return NULL;
2469} 2459}
2470 2460
@@ -2641,6 +2631,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 2631 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642} 2632}
2643 2633
2634static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
2635{
2636 struct file_lock *fl;
2637
2638 fl = locks_alloc_lock();
2639 if (!fl)
2640 return NULL;
2641 locks_init_lock(fl);
2642 fl->fl_lmops = &nfsd_lease_mng_ops;
2643 fl->fl_flags = FL_LEASE;
2644 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2645 fl->fl_end = OFFSET_MAX;
2646 fl->fl_owner = (fl_owner_t)(dp->dl_file);
2647 fl->fl_pid = current->tgid;
2648 return fl;
2649}
2650
2651static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2652{
2653 struct nfs4_file *fp = dp->dl_file;
2654 struct file_lock *fl;
2655 int status;
2656
2657 fl = nfs4_alloc_init_lease(dp, flag);
2658 if (!fl)
2659 return -ENOMEM;
2660 fl->fl_file = find_readable_file(fp);
2661 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2662 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
2663 if (status) {
2664 list_del_init(&dp->dl_perclnt);
2665 locks_free_lock(fl);
2666 return -ENOMEM;
2667 }
2668 fp->fi_lease = fl;
2669 fp->fi_deleg_file = fl->fl_file;
2670 get_file(fp->fi_deleg_file);
2671 atomic_set(&fp->fi_delegees, 1);
2672 list_add(&dp->dl_perfile, &fp->fi_delegations);
2673 return 0;
2674}
2675
2676static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2677{
2678 struct nfs4_file *fp = dp->dl_file;
2679
2680 if (!fp->fi_lease)
2681 return nfs4_setlease(dp, flag);
2682 spin_lock(&recall_lock);
2683 if (fp->fi_had_conflict) {
2684 spin_unlock(&recall_lock);
2685 return -EAGAIN;
2686 }
2687 atomic_inc(&fp->fi_delegees);
2688 list_add(&dp->dl_perfile, &fp->fi_delegations);
2689 spin_unlock(&recall_lock);
2690 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2691 return 0;
2692}
2693
2644/* 2694/*
2645 * Attempt to hand out a delegation. 2695 * Attempt to hand out a delegation.
2646 */ 2696 */
@@ -2650,7 +2700,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2650 struct nfs4_delegation *dp; 2700 struct nfs4_delegation *dp;
2651 struct nfs4_stateowner *sop = stp->st_stateowner; 2701 struct nfs4_stateowner *sop = stp->st_stateowner;
2652 int cb_up; 2702 int cb_up;
2653 struct file_lock *fl;
2654 int status, flag = 0; 2703 int status, flag = 0;
2655 2704
2656 cb_up = nfsd4_cb_channel_good(sop->so_client); 2705 cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2681,36 +2730,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2681 } 2730 }
2682 2731
2683 dp = alloc_init_deleg(sop->so_client, stp, fh, flag); 2732 dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
2684 if (dp == NULL) { 2733 if (dp == NULL)
2685 flag = NFS4_OPEN_DELEGATE_NONE; 2734 goto out_no_deleg;
2686 goto out; 2735 status = nfs4_set_delegation(dp, flag);
2687 } 2736 if (status)
2688 status = -ENOMEM; 2737 goto out_free;
2689 fl = locks_alloc_lock();
2690 if (!fl)
2691 goto out;
2692 locks_init_lock(fl);
2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2694 fl->fl_flags = FL_LEASE;
2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2702
2703 /* vfs_setlease checks to see if delegation should be handed out.
2704 * the lock_manager callback fl_change is used
2705 */
2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2710 unhash_delegation(dp);
2711 flag = NFS4_OPEN_DELEGATE_NONE;
2712 goto out;
2713 }
2714 2738
2715 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2739 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2716 2740
@@ -2722,6 +2746,12 @@ out:
2722 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) 2746 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
2723 dprintk("NFSD: WARNING: refusing delegation reclaim\n"); 2747 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
2724 open->op_delegate_type = flag; 2748 open->op_delegate_type = flag;
2749 return;
2750out_free:
2751 nfs4_put_delegation(dp);
2752out_no_deleg:
2753 flag = NFS4_OPEN_DELEGATE_NONE;
2754 goto out;
2725} 2755}
2726 2756
2727/* 2757/*
@@ -2916,8 +2946,6 @@ nfs4_laundromat(void)
2916 test_val = u; 2946 test_val = u;
2917 break; 2947 break;
2918 } 2948 }
2919 dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
2920 dp, dp->dl_flock);
2921 list_move(&dp->dl_recall_lru, &reaplist); 2949 list_move(&dp->dl_recall_lru, &reaplist);
2922 } 2950 }
2923 spin_unlock(&recall_lock); 2951 spin_unlock(&recall_lock);
@@ -3128,7 +3156,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3128 goto out; 3156 goto out;
3129 renew_client(dp->dl_client); 3157 renew_client(dp->dl_client);
3130 if (filpp) { 3158 if (filpp) {
3131 *filpp = find_readable_file(dp->dl_file); 3159 *filpp = dp->dl_file->fi_deleg_file;
3132 BUG_ON(!*filpp); 3160 BUG_ON(!*filpp);
3133 } 3161 }
3134 } else { /* open or lock stateid */ 3162 } else { /* open or lock stateid */
@@ -3708,6 +3736,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3708 stp->st_stateid.si_stateownerid = sop->so_id; 3736 stp->st_stateid.si_stateownerid = sop->so_id;
3709 stp->st_stateid.si_fileid = fp->fi_id; 3737 stp->st_stateid.si_fileid = fp->fi_id;
3710 stp->st_stateid.si_generation = 0; 3738 stp->st_stateid.si_generation = 0;
3739 stp->st_access_bmap = 0;
3711 stp->st_deny_bmap = open_stp->st_deny_bmap; 3740 stp->st_deny_bmap = open_stp->st_deny_bmap;
3712 stp->st_openstp = open_stp; 3741 stp->st_openstp = open_stp;
3713 3742
@@ -3722,6 +3751,17 @@ check_lock_length(u64 offset, u64 length)
3722 LOFF_OVERFLOW(offset, length))); 3751 LOFF_OVERFLOW(offset, length)));
3723} 3752}
3724 3753
3754static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
3755{
3756 struct nfs4_file *fp = lock_stp->st_file;
3757 int oflag = nfs4_access_to_omode(access);
3758
3759 if (test_bit(access, &lock_stp->st_access_bmap))
3760 return;
3761 nfs4_file_get_access(fp, oflag);
3762 __set_bit(access, &lock_stp->st_access_bmap);
3763}
3764
3725/* 3765/*
3726 * LOCK operation 3766 * LOCK operation
3727 */ 3767 */
@@ -3738,7 +3778,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3738 struct file_lock conflock; 3778 struct file_lock conflock;
3739 __be32 status = 0; 3779 __be32 status = 0;
3740 unsigned int strhashval; 3780 unsigned int strhashval;
3741 unsigned int cmd;
3742 int err; 3781 int err;
3743 3782
3744 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3783 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3820,22 +3859,18 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3820 switch (lock->lk_type) { 3859 switch (lock->lk_type) {
3821 case NFS4_READ_LT: 3860 case NFS4_READ_LT:
3822 case NFS4_READW_LT: 3861 case NFS4_READW_LT:
3823 if (find_readable_file(lock_stp->st_file)) { 3862 filp = find_readable_file(lock_stp->st_file);
3824 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); 3863 if (filp)
3825 filp = find_readable_file(lock_stp->st_file); 3864 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
3826 }
3827 file_lock.fl_type = F_RDLCK; 3865 file_lock.fl_type = F_RDLCK;
3828 cmd = F_SETLK; 3866 break;
3829 break;
3830 case NFS4_WRITE_LT: 3867 case NFS4_WRITE_LT:
3831 case NFS4_WRITEW_LT: 3868 case NFS4_WRITEW_LT:
3832 if (find_writeable_file(lock_stp->st_file)) { 3869 filp = find_writeable_file(lock_stp->st_file);
3833 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); 3870 if (filp)
3834 filp = find_writeable_file(lock_stp->st_file); 3871 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
3835 }
3836 file_lock.fl_type = F_WRLCK; 3872 file_lock.fl_type = F_WRLCK;
3837 cmd = F_SETLK; 3873 break;
3838 break;
3839 default: 3874 default:
3840 status = nfserr_inval; 3875 status = nfserr_inval;
3841 goto out; 3876 goto out;
@@ -3859,7 +3894,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3859 * Note: locks.c uses the BKL to protect the inode's lock list. 3894 * Note: locks.c uses the BKL to protect the inode's lock list.
3860 */ 3895 */
3861 3896
3862 err = vfs_lock_file(filp, cmd, &file_lock, &conflock); 3897 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
3863 switch (-err) { 3898 switch (-err) {
3864 case 0: /* success! */ 3899 case 0: /* success! */
3865 update_stateid(&lock_stp->st_stateid); 3900 update_stateid(&lock_stp->st_stateid);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc9..c6766af00d98 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
317 READ_BUF(dummy32); 317 READ_BUF(dummy32);
318 len += (XDR_QUADLEN(dummy32) << 2); 318 len += (XDR_QUADLEN(dummy32) << 2);
319 READMEM(buf, dummy32); 319 READMEM(buf, dummy32);
320 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 320 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
321 goto out_nfserr; 321 return status;
322 iattr->ia_valid |= ATTR_UID; 322 iattr->ia_valid |= ATTR_UID;
323 } 323 }
324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { 324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
328 READ_BUF(dummy32); 328 READ_BUF(dummy32);
329 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
330 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
331 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 331 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
332 goto out_nfserr; 332 return status;
333 iattr->ia_valid |= ATTR_GID; 333 iattr->ia_valid |= ATTR_GID;
334 } 334 }
335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { 335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
@@ -1215,8 +1215,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1215 READ_BUF(4); 1215 READ_BUF(4);
1216 READ32(dummy); 1216 READ32(dummy);
1217 READ_BUF(dummy * 4); 1217 READ_BUF(dummy * 4);
1218 for (i = 0; i < dummy; ++i)
1219 READ32(dummy);
1220 break; 1218 break;
1221 case RPC_AUTH_GSS: 1219 case RPC_AUTH_GSS:
1222 dprintk("RPC_AUTH_GSS callback secflavor " 1220 dprintk("RPC_AUTH_GSS callback secflavor "
@@ -1232,7 +1230,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1232 READ_BUF(4); 1230 READ_BUF(4);
1233 READ32(dummy); 1231 READ32(dummy);
1234 READ_BUF(dummy); 1232 READ_BUF(dummy);
1235 p += XDR_QUADLEN(dummy);
1236 break; 1233 break;
1237 default: 1234 default:
1238 dprintk("Illegal callback secflavor\n"); 1235 dprintk("Illegal callback secflavor\n");
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 33b3e2b06779..1f5eae40f34e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,13 +12,14 @@
12#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h>
15 16
16#include "idmap.h" 17#include "idmap.h"
17#include "nfsd.h" 18#include "nfsd.h"
18#include "cache.h" 19#include "cache.h"
19 20
20/* 21/*
21 * We have a single directory with 9 nodes in it. 22 * We have a single directory with several nodes in it.
22 */ 23 */
23enum { 24enum {
24 NFSD_Root = 1, 25 NFSD_Root = 1,
@@ -42,6 +43,7 @@ enum {
42 NFSD_Versions, 43 NFSD_Versions,
43 NFSD_Ports, 44 NFSD_Ports,
44 NFSD_MaxBlkSize, 45 NFSD_MaxBlkSize,
46 NFSD_SupportedEnctypes,
45 /* 47 /*
46 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 48 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
47 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops 49 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,6 +189,34 @@ static struct file_operations export_features_operations = {
187 .release = single_release, 189 .release = single_release,
188}; 190};
189 191
192#ifdef CONFIG_SUNRPC_GSS
193static int supported_enctypes_show(struct seq_file *m, void *v)
194{
195 struct gss_api_mech *k5mech;
196
197 k5mech = gss_mech_get_by_name("krb5");
198 if (k5mech == NULL)
199 goto out;
200 if (k5mech->gm_upcall_enctypes != NULL)
201 seq_printf(m, k5mech->gm_upcall_enctypes);
202 gss_mech_put(k5mech);
203out:
204 return 0;
205}
206
207static int supported_enctypes_open(struct inode *inode, struct file *file)
208{
209 return single_open(file, supported_enctypes_show, NULL);
210}
211
212static struct file_operations supported_enctypes_ops = {
213 .open = supported_enctypes_open,
214 .read = seq_read,
215 .llseek = seq_lseek,
216 .release = single_release,
217};
218#endif /* CONFIG_SUNRPC_GSS */
219
190extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 220extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
191extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 221extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
192 222
@@ -1397,6 +1427,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1397 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1427 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1398 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1428 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1399 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1429 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1430#ifdef CONFIG_SUNRPC_GSS
1431 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1432#endif /* CONFIG_SUNRPC_GSS */
1400#ifdef CONFIG_NFSD_V4 1433#ifdef CONFIG_NFSD_V4
1401 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1434 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1402 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1435 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7bf..6bd2f3c21f2b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
83 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
87 struct file_lock *dl_flock;
88 u32 dl_type; 86 u32 dl_type;
89 time_t dl_time; 87 time_t dl_time;
90/* For recall: */ 88/* For recall: */
@@ -369,16 +367,15 @@ struct nfs4_file {
369 struct list_head fi_delegations; 367 struct list_head fi_delegations;
370 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
371 struct file * fi_fds[3]; 369 struct file * fi_fds[3];
372 /* One each for O_RDONLY, O_WRONLY: */
373 atomic_t fi_access[2];
374 /* 370 /*
375 * Each open stateid contributes 1 to either fi_readers or 371 * Each open or lock stateid contributes 1 to either
376 * fi_writers, or both, depending on the open mode. A 372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
377 * delegation also takes an fi_readers reference. Lock 373 * on open or lock mode:
378 * stateid's take none.
379 */ 374 */
380 atomic_t fi_readers; 375 atomic_t fi_access[2];
381 atomic_t fi_writers; 376 struct file *fi_deleg_file;
377 struct file_lock *fi_lease;
378 atomic_t fi_delegees;
382 struct inode *fi_inode; 379 struct inode *fi_inode;
383 u32 fi_id; /* used with stateowner->so_id 380 u32 fi_id; /* used with stateowner->so_id
384 * for stateid_hashtbl hash */ 381 * for stateid_hashtbl hash */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188d..2e1cebde90df 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
87 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
88 int err = 0; 88 int err = 0;
89 89
90 err = follow_down(&path, false); 90 err = follow_down(&path);
91 if (err < 0) 91 if (err < 0)
92 goto out; 92 goto out;
93 93
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
808 if (ra->p_count == 0) 808 if (ra->p_count == 0)
809 frap = rap; 809 frap = rap;
810 } 810 }
811 depth = nfsdstats.ra_size*11/10; 811 depth = nfsdstats.ra_size;
812 if (!frap) { 812 if (!frap) {
813 spin_unlock(&rab->pb_lock); 813 spin_unlock(&rab->pb_lock);
814 return NULL; 814 return NULL;
@@ -1744,6 +1744,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1744 host_err = nfsd_break_lease(odentry->d_inode); 1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err) 1745 if (host_err)
1746 goto out_drop_write; 1746 goto out_drop_write;
1747 if (ndentry->d_inode) {
1748 host_err = nfsd_break_lease(ndentry->d_inode);
1749 if (host_err)
1750 goto out_drop_write;
1751 }
1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1752 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1748 if (!host_err) { 1753 if (!host_err) {
1749 host_err = commit_metadata(tfhp); 1754 host_err = commit_metadata(tfhp);
@@ -1812,22 +1817,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1812 1817
1813 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); 1818 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1814 if (host_err) 1819 if (host_err)
1815 goto out_nfserr; 1820 goto out_put;
1816 1821
1817 host_err = nfsd_break_lease(rdentry->d_inode); 1822 host_err = nfsd_break_lease(rdentry->d_inode);
1818 if (host_err) 1823 if (host_err)
1819 goto out_put; 1824 goto out_drop_write;
1820 if (type != S_IFDIR) 1825 if (type != S_IFDIR)
1821 host_err = vfs_unlink(dirp, rdentry); 1826 host_err = vfs_unlink(dirp, rdentry);
1822 else 1827 else
1823 host_err = vfs_rmdir(dirp, rdentry); 1828 host_err = vfs_rmdir(dirp, rdentry);
1824out_put:
1825 dput(rdentry);
1826
1827 if (!host_err) 1829 if (!host_err)
1828 host_err = commit_metadata(fhp); 1830 host_err = commit_metadata(fhp);
1829 1831out_drop_write:
1830 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1832 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1833out_put:
1834 dput(rdentry);
1835
1831out_nfserr: 1836out_nfserr:
1832 err = nfserrno(host_err); 1837 err = nfserrno(host_err);
1833out: 1838out:
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d7fd696e595c..0a0a66d98cce 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -521,8 +521,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
521 group_offset, bitmap)) 521 group_offset, bitmap))
522 printk(KERN_WARNING "%s: entry number %llu already freed\n", 522 printk(KERN_WARNING "%s: entry number %llu already freed\n",
523 __func__, (unsigned long long)req->pr_entry_nr); 523 __func__, (unsigned long long)req->pr_entry_nr);
524 524 else
525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
526 526
527 kunmap(req->pr_bitmap_bh->b_page); 527 kunmap(req->pr_bitmap_bh->b_page);
528 kunmap(req->pr_desc_bh->b_page); 528 kunmap(req->pr_desc_bh->b_page);
@@ -558,8 +558,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
558 group_offset, bitmap)) 558 group_offset, bitmap))
559 printk(KERN_WARNING "%s: entry number %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
560 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
561 561 else
562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
563 563
564 kunmap(req->pr_bitmap_bh->b_page); 564 kunmap(req->pr_bitmap_bh->b_page);
565 kunmap(req->pr_desc_bh->b_page); 565 kunmap(req->pr_desc_bh->b_page);
@@ -665,7 +665,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
665 for (j = i, n = 0; 665 for (j = i, n = 0;
666 (j < nitems) && nilfs_palloc_group_is_in(inode, group, 666 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
667 entry_nrs[j]); 667 entry_nrs[j]);
668 j++, n++) { 668 j++) {
669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset); 669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
670 if (!nilfs_clear_bit_atomic( 670 if (!nilfs_clear_bit_atomic(
671 nilfs_mdt_bgl_lock(inode, group), 671 nilfs_mdt_bgl_lock(inode, group),
@@ -674,6 +674,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
674 "%s: entry number %llu already freed\n", 674 "%s: entry number %llu already freed\n",
675 __func__, 675 __func__,
676 (unsigned long long)entry_nrs[j]); 676 (unsigned long long)entry_nrs[j]);
677 } else {
678 n++;
677 } 679 }
678 } 680 }
679 nilfs_palloc_group_desc_add_entries(inode, group, desc, n); 681 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 9af34a7e6e13..f5fde36b9e28 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -74,7 +74,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
74 74
75#define nilfs_set_bit_atomic ext2_set_bit_atomic 75#define nilfs_set_bit_atomic ext2_set_bit_atomic
76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic 76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
77#define nilfs_find_next_zero_bit ext2_find_next_zero_bit 77#define nilfs_find_next_zero_bit find_next_zero_bit_le
78 78
79/* 79/*
80 * persistent object allocator cache 80 * persistent object allocator cache
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3ee67c67cc52..4723f04e9b12 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h"
29#include "btree.h" 28#include "btree.h"
30#include "direct.h" 29#include "direct.h"
31#include "btnode.h" 30#include "btnode.h"
@@ -425,17 +424,6 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
425/* 424/*
426 * Internal use only 425 * Internal use only
427 */ 426 */
428
429void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
430{
431 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
432}
433
434void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
435{
436 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
437}
438
439__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 427__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
440 const struct buffer_head *bh) 428 const struct buffer_head *bh)
441{ 429{
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bde1c0aa2e15..40d9f453d31c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -240,9 +240,6 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
242 242
243void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
244void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
245
246 243
247/* Assume that bmap semaphore is locked. */ 244/* Assume that bmap semaphore is locked. */
248static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) 245static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..609cd223eea8 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,20 +34,10 @@
34#include "page.h" 34#include "page.h"
35#include "btnode.h" 35#include "btnode.h"
36 36
37
38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{
40 nilfs_mapping_init_once(btnc);
41}
42
43static const struct address_space_operations def_btnode_aops = {
44 .sync_page = block_sync_page,
45};
46
47void nilfs_btnode_cache_init(struct address_space *btnc, 37void nilfs_btnode_cache_init(struct address_space *btnc,
48 struct backing_dev_info *bdi) 38 struct backing_dev_info *bdi)
49{ 39{
50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops); 40 nilfs_mapping_init(btnc, bdi);
51} 41}
52 42
53void nilfs_btnode_cache_clear(struct address_space *btnc) 43void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
37 struct buffer_head *newbh; 37 struct buffer_head *newbh;
38}; 38};
39 39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); 40void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 41void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 42struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 300c2bc00c3f..d451ae0e0bf3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1174,7 +1174,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1174 if (ret < 0) 1174 if (ret < 0)
1175 goto out; 1175 goto out;
1176 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1177 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1178 1178
1179 out: 1179 out:
1180 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
@@ -1511,7 +1511,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1511 if (ret < 0) 1511 if (ret < 0)
1512 goto out; 1512 goto out;
1513 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); 1514 nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
1515 1515
1516out: 1516out:
1517 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
@@ -1776,7 +1776,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1776 return ret; 1776 return ret;
1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1778 di, ni, bh); 1778 di, ni, bh);
1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1779 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1780 return 0; 1780 return 0;
1781} 1781}
1782 1782
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9d45773b79e6..3a1923943b14 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -440,7 +440,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
440 nilfs_commit_chunk(page, mapping, from, to); 440 nilfs_commit_chunk(page, mapping, from, to);
441 nilfs_put_page(page); 441 nilfs_put_page(page);
442 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 442 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
443/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
444} 443}
445 444
446/* 445/*
@@ -531,7 +530,6 @@ got_it:
531 nilfs_set_de_type(de, inode); 530 nilfs_set_de_type(de, inode);
532 nilfs_commit_chunk(page, page->mapping, from, to); 531 nilfs_commit_chunk(page, page->mapping, from, to);
533 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 532 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
534/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
535 nilfs_mark_inode_dirty(dir); 533 nilfs_mark_inode_dirty(dir);
536 /* OFFSET_CACHE */ 534 /* OFFSET_CACHE */
537out_put: 535out_put:
@@ -579,7 +577,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
579 dir->inode = 0; 577 dir->inode = 0;
580 nilfs_commit_chunk(page, mapping, from, to); 578 nilfs_commit_chunk(page, mapping, from, to);
581 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 579 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
582/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
583out: 580out:
584 nilfs_put_page(page); 581 nilfs_put_page(page);
585 return err; 582 return err;
@@ -684,7 +681,7 @@ const struct file_operations nilfs_dir_operations = {
684 .readdir = nilfs_readdir, 681 .readdir = nilfs_readdir,
685 .unlocked_ioctl = nilfs_ioctl, 682 .unlocked_ioctl = nilfs_ioctl,
686#ifdef CONFIG_COMPAT 683#ifdef CONFIG_COMPAT
687 .compat_ioctl = nilfs_ioctl, 684 .compat_ioctl = nilfs_compat_ioctl,
688#endif /* CONFIG_COMPAT */ 685#endif /* CONFIG_COMPAT */
689 .fsync = nilfs_sync_file, 686 .fsync = nilfs_sync_file,
690 687
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 324d80c57518..82f4865e86dd 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -146,7 +146,7 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
146 if (NILFS_BMAP_USE_VBN(bmap)) 146 if (NILFS_BMAP_USE_VBN(bmap))
147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); 147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
148 148
149 nilfs_bmap_add_blocks(bmap, 1); 149 nilfs_inode_add_blocks(bmap->b_inode, 1);
150 } 150 }
151 return ret; 151 return ret;
152} 152}
@@ -168,7 +168,7 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
168 if (!ret) { 168 if (!ret) {
169 nilfs_bmap_commit_end_ptr(bmap, &req, dat); 169 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); 170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
171 nilfs_bmap_sub_blocks(bmap, 1); 171 nilfs_inode_sub_blocks(bmap->b_inode, 1);
172 } 172 }
173 return ret; 173 return ret;
174} 174}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2f560c9fb808..93589fccdd97 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -59,7 +59,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
59 struct nilfs_transaction_info ti; 59 struct nilfs_transaction_info ti;
60 int ret; 60 int ret;
61 61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) 62 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */ 63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64 64
65 lock_page(page); 65 lock_page(page);
@@ -142,7 +142,7 @@ const struct file_operations nilfs_file_operations = {
142 .aio_write = generic_file_aio_write, 142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl, 143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT 144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl, 145 .compat_ioctl = nilfs_compat_ioctl,
146#endif /* CONFIG_COMPAT */ 146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap, 147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open, 148 .open = generic_file_open,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
49#include "ifile.h" 49#include "ifile.h"
50 50
51static const struct address_space_operations def_gcinode_aops = { 51static const struct address_space_operations def_gcinode_aops = {
52 .sync_page = block_sync_page,
53}; 52};
54 53
55/* 54/*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b8..c0aa27490c02 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -41,6 +41,24 @@ struct nilfs_iget_args {
41 int for_gc; 41 int for_gc;
42}; 42};
43 43
44void nilfs_inode_add_blocks(struct inode *inode, int n)
45{
46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root)
50 atomic_add(n, &root->blocks_count);
51}
52
53void nilfs_inode_sub_blocks(struct inode *inode, int n)
54{
55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root)
59 atomic_sub(n, &root->blocks_count);
60}
61
44/** 62/**
45 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
46 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
@@ -262,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
262const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
263 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
264 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
265 .sync_page = block_sync_page,
266 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
267 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
268 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
@@ -277,7 +294,7 @@ const struct address_space_operations nilfs_aops = {
277struct inode *nilfs_new_inode(struct inode *dir, int mode) 294struct inode *nilfs_new_inode(struct inode *dir, int mode)
278{ 295{
279 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
280 struct nilfs_sb_info *sbi = NILFS_SB(sb); 297 struct the_nilfs *nilfs = sb->s_fs_info;
281 struct inode *inode; 298 struct inode *inode;
282 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
283 struct nilfs_root *root; 300 struct nilfs_root *root;
@@ -315,19 +332,16 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
315 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
316 } 333 }
317 334
318 ii->i_flags = NILFS_I(dir)->i_flags; 335 ii->i_flags = nilfs_mask_flags(
319 if (S_ISLNK(mode)) 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
320 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
321 if (!S_ISDIR(mode))
322 ii->i_flags &= ~NILFS_DIRSYNC_FL;
323 337
324 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
325 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
326 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
327 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
328 spin_lock(&sbi->s_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
329 inode->i_generation = sbi->s_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
330 spin_unlock(&sbi->s_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
331 insert_inode_hash(inode); 345 insert_inode_hash(inode);
332 346
333 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
@@ -359,17 +373,15 @@ void nilfs_set_inode_flags(struct inode *inode)
359 373
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC); 375 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
363 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
365 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE 382 if (flags & FS_NOATIME_FL)
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
@@ -420,7 +432,7 @@ static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
421 struct inode *inode) 433 struct inode *inode)
422{ 434{
423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 435 struct the_nilfs *nilfs = sb->s_fs_info;
424 struct buffer_head *bh; 436 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
426 int err; 438 int err;
@@ -707,6 +719,7 @@ void nilfs_evict_inode(struct inode *inode)
707 struct nilfs_transaction_info ti; 719 struct nilfs_transaction_info ti;
708 struct super_block *sb = inode->i_sb; 720 struct super_block *sb = inode->i_sb;
709 struct nilfs_inode_info *ii = NILFS_I(inode); 721 struct nilfs_inode_info *ii = NILFS_I(inode);
722 int ret;
710 723
711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 724 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
712 if (inode->i_data.nrpages) 725 if (inode->i_data.nrpages)
@@ -725,8 +738,9 @@ void nilfs_evict_inode(struct inode *inode)
725 nilfs_mark_inode_dirty(inode); 738 nilfs_mark_inode_dirty(inode);
726 end_writeback(inode); 739 end_writeback(inode);
727 740
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 741 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count); 742 if (!ret)
743 atomic_dec(&ii->i_root->inodes_count);
730 744
731 nilfs_clear_inode(inode); 745 nilfs_clear_inode(inode);
732 746
@@ -792,18 +806,18 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
792 806
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 807int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
794{ 808{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 809 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
796 struct nilfs_inode_info *ii = NILFS_I(inode); 810 struct nilfs_inode_info *ii = NILFS_I(inode);
797 int err; 811 int err;
798 812
799 spin_lock(&sbi->s_inode_lock); 813 spin_lock(&nilfs->ns_inode_lock);
800 if (ii->i_bh == NULL) { 814 if (ii->i_bh == NULL) {
801 spin_unlock(&sbi->s_inode_lock); 815 spin_unlock(&nilfs->ns_inode_lock);
802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 816 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
803 inode->i_ino, pbh); 817 inode->i_ino, pbh);
804 if (unlikely(err)) 818 if (unlikely(err))
805 return err; 819 return err;
806 spin_lock(&sbi->s_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
807 if (ii->i_bh == NULL) 821 if (ii->i_bh == NULL)
808 ii->i_bh = *pbh; 822 ii->i_bh = *pbh;
809 else { 823 else {
@@ -814,36 +828,36 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
814 *pbh = ii->i_bh; 828 *pbh = ii->i_bh;
815 829
816 get_bh(*pbh); 830 get_bh(*pbh);
817 spin_unlock(&sbi->s_inode_lock); 831 spin_unlock(&nilfs->ns_inode_lock);
818 return 0; 832 return 0;
819} 833}
820 834
821int nilfs_inode_dirty(struct inode *inode) 835int nilfs_inode_dirty(struct inode *inode)
822{ 836{
823 struct nilfs_inode_info *ii = NILFS_I(inode); 837 struct nilfs_inode_info *ii = NILFS_I(inode);
824 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 838 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
825 int ret = 0; 839 int ret = 0;
826 840
827 if (!list_empty(&ii->i_dirty)) { 841 if (!list_empty(&ii->i_dirty)) {
828 spin_lock(&sbi->s_inode_lock); 842 spin_lock(&nilfs->ns_inode_lock);
829 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 843 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
830 test_bit(NILFS_I_BUSY, &ii->i_state); 844 test_bit(NILFS_I_BUSY, &ii->i_state);
831 spin_unlock(&sbi->s_inode_lock); 845 spin_unlock(&nilfs->ns_inode_lock);
832 } 846 }
833 return ret; 847 return ret;
834} 848}
835 849
836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 850int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
837{ 851{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
839 struct nilfs_inode_info *ii = NILFS_I(inode); 852 struct nilfs_inode_info *ii = NILFS_I(inode);
853 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
840 854
841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 855 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
842 856
843 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 857 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
844 return 0; 858 return 0;
845 859
846 spin_lock(&sbi->s_inode_lock); 860 spin_lock(&nilfs->ns_inode_lock);
847 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 861 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
848 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 862 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
849 /* Because this routine may race with nilfs_dispose_list(), 863 /* Because this routine may race with nilfs_dispose_list(),
@@ -851,18 +865,18 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
851 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 865 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
852 /* This will happen when somebody is freeing 866 /* This will happen when somebody is freeing
853 this inode. */ 867 this inode. */
854 nilfs_warning(sbi->s_super, __func__, 868 nilfs_warning(inode->i_sb, __func__,
855 "cannot get inode (ino=%lu)\n", 869 "cannot get inode (ino=%lu)\n",
856 inode->i_ino); 870 inode->i_ino);
857 spin_unlock(&sbi->s_inode_lock); 871 spin_unlock(&nilfs->ns_inode_lock);
858 return -EINVAL; /* NILFS_I_DIRTY may remain for 872 return -EINVAL; /* NILFS_I_DIRTY may remain for
859 freeing inode */ 873 freeing inode */
860 } 874 }
861 list_del(&ii->i_dirty); 875 list_del(&ii->i_dirty);
862 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); 876 list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
863 set_bit(NILFS_I_QUEUED, &ii->i_state); 877 set_bit(NILFS_I_QUEUED, &ii->i_state);
864 } 878 }
865 spin_unlock(&sbi->s_inode_lock); 879 spin_unlock(&nilfs->ns_inode_lock);
866 return 0; 880 return 0;
867} 881}
868 882
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 496738963fdb..f2469ba6246b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,7 +26,9 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/compat.h> /* compat_ptr() */
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ 30#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
31#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h> 32#include <linux/nilfs2_fs.h>
31#include "nilfs.h" 33#include "nilfs.h"
32#include "segment.h" 34#include "segment.h"
@@ -97,11 +99,74 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
97 return ret; 99 return ret;
98} 100}
99 101
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
105
106 return put_user(flags, (int __user *)argp);
107}
108
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp)
111{
112 struct nilfs_transaction_info ti;
113 unsigned int flags, oldflags;
114 int ret;
115
116 if (!inode_owner_or_capable(inode))
117 return -EACCES;
118
119 if (get_user(flags, (int __user *)argp))
120 return -EFAULT;
121
122 ret = mnt_want_write(filp->f_path.mnt);
123 if (ret)
124 return ret;
125
126 flags = nilfs_mask_flags(inode->i_mode, flags);
127
128 mutex_lock(&inode->i_mutex);
129
130 oldflags = NILFS_I(inode)->i_flags;
131
132 /*
133 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
134 * relevant capability.
135 */
136 ret = -EPERM;
137 if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
138 !capable(CAP_LINUX_IMMUTABLE))
139 goto out;
140
141 ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 if (ret)
143 goto out;
144
145 NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
146 (flags & FS_FL_USER_MODIFIABLE);
147
148 nilfs_set_inode_flags(inode);
149 inode->i_ctime = CURRENT_TIME;
150 if (IS_SYNC(inode))
151 nilfs_set_transaction_flag(NILFS_TI_SYNC);
152
153 nilfs_mark_inode_dirty(inode);
154 ret = nilfs_transaction_commit(inode->i_sb);
155out:
156 mutex_unlock(&inode->i_mutex);
157 mnt_drop_write(filp->f_path.mnt);
158 return ret;
159}
160
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{
163 return put_user(inode->i_generation, (int __user *)argp);
164}
165
100static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
101 unsigned int cmd, void __user *argp) 167 unsigned int cmd, void __user *argp)
102{ 168{
103 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 169 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
104 struct inode *cpfile = nilfs->ns_cpfile;
105 struct nilfs_transaction_info ti; 170 struct nilfs_transaction_info ti;
106 struct nilfs_cpmode cpmode; 171 struct nilfs_cpmode cpmode;
107 int ret; 172 int ret;
@@ -121,7 +186,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
121 186
122 nilfs_transaction_begin(inode->i_sb, &ti, 0); 187 nilfs_transaction_begin(inode->i_sb, &ti, 0);
123 ret = nilfs_cpfile_change_cpmode( 188 ret = nilfs_cpfile_change_cpmode(
124 cpfile, cpmode.cm_cno, cpmode.cm_mode); 189 nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
125 if (unlikely(ret < 0)) 190 if (unlikely(ret < 0))
126 nilfs_transaction_abort(inode->i_sb); 191 nilfs_transaction_abort(inode->i_sb);
127 else 192 else
@@ -137,7 +202,7 @@ static int
137nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
138 unsigned int cmd, void __user *argp) 203 unsigned int cmd, void __user *argp)
139{ 204{
140 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 205 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
141 struct nilfs_transaction_info ti; 206 struct nilfs_transaction_info ti;
142 __u64 cno; 207 __u64 cno;
143 int ret; 208 int ret;
@@ -154,7 +219,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
154 goto out; 219 goto out;
155 220
156 nilfs_transaction_begin(inode->i_sb, &ti, 0); 221 nilfs_transaction_begin(inode->i_sb, &ti, 0);
157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 222 ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
158 if (unlikely(ret < 0)) 223 if (unlikely(ret < 0))
159 nilfs_transaction_abort(inode->i_sb); 224 nilfs_transaction_abort(inode->i_sb);
160 else 225 else
@@ -180,7 +245,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
181 unsigned int cmd, void __user *argp) 246 unsigned int cmd, void __user *argp)
182{ 247{
183 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 248 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
184 struct nilfs_cpstat cpstat; 249 struct nilfs_cpstat cpstat;
185 int ret; 250 int ret;
186 251
@@ -211,7 +276,7 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
211static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
212 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
213{ 278{
214 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 279 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
215 struct nilfs_sustat sustat; 280 struct nilfs_sustat sustat;
216 int ret; 281 int ret;
217 282
@@ -267,7 +332,7 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
267static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
268 unsigned int cmd, void __user *argp) 333 unsigned int cmd, void __user *argp)
269{ 334{
270 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 335 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
271 struct nilfs_argv argv; 336 struct nilfs_argv argv;
272 int ret; 337 int ret;
273 338
@@ -336,7 +401,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
336 struct nilfs_argv *argv, void *buf) 401 struct nilfs_argv *argv, void *buf)
337{ 402{
338 size_t nmembs = argv->v_nmembs; 403 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 404 struct the_nilfs *nilfs = sb->s_fs_info;
340 struct inode *inode; 405 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 406 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 407 struct buffer_head *bh, *n;
@@ -550,7 +615,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
550 ret = PTR_ERR(kbufs[4]); 615 ret = PTR_ERR(kbufs[4]);
551 goto out; 616 goto out;
552 } 617 }
553 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 618 nilfs = inode->i_sb->s_fs_info;
554 619
555 for (n = 0; n < 4; n++) { 620 for (n = 0; n < 4; n++) {
556 ret = -EINVAL; 621 ret = -EINVAL;
@@ -623,7 +688,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
623 return ret; 688 return ret;
624 689
625 if (argp != NULL) { 690 if (argp != NULL) {
626 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 691 nilfs = inode->i_sb->s_fs_info;
627 down_read(&nilfs->ns_segctor_sem); 692 down_read(&nilfs->ns_segctor_sem);
628 cno = nilfs->ns_cno - 1; 693 cno = nilfs->ns_cno - 1;
629 up_read(&nilfs->ns_segctor_sem); 694 up_read(&nilfs->ns_segctor_sem);
@@ -641,7 +706,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
641 void *, size_t, size_t)) 706 void *, size_t, size_t))
642 707
643{ 708{
644 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 709 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
645 struct nilfs_argv argv; 710 struct nilfs_argv argv;
646 int ret; 711 int ret;
647 712
@@ -666,6 +731,12 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
666 void __user *argp = (void __user *)arg; 731 void __user *argp = (void __user *)arg;
667 732
668 switch (cmd) { 733 switch (cmd) {
734 case FS_IOC_GETFLAGS:
735 return nilfs_ioctl_getflags(inode, argp);
736 case FS_IOC_SETFLAGS:
737 return nilfs_ioctl_setflags(inode, filp, argp);
738 case FS_IOC_GETVERSION:
739 return nilfs_ioctl_getversion(inode, argp);
669 case NILFS_IOCTL_CHANGE_CPMODE: 740 case NILFS_IOCTL_CHANGE_CPMODE:
670 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); 741 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
671 case NILFS_IOCTL_DELETE_CHECKPOINT: 742 case NILFS_IOCTL_DELETE_CHECKPOINT:
@@ -696,3 +767,23 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
696 return -ENOTTY; 767 return -ENOTTY;
697 } 768 }
698} 769}
770
771#ifdef CONFIG_COMPAT
772long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
773{
774 switch (cmd) {
775 case FS_IOC32_GETFLAGS:
776 cmd = FS_IOC_GETFLAGS;
777 break;
778 case FS_IOC32_SETFLAGS:
779 cmd = FS_IOC_SETFLAGS;
780 break;
781 case FS_IOC32_GETVERSION:
782 cmd = FS_IOC_GETVERSION;
783 break;
784 default:
785 return -ENOIOCTLCMD;
786 }
787 return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
788}
789#endif
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a649b05f7069 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
399 399
400static const struct address_space_operations def_mdt_aops = { 400static const struct address_space_operations def_mdt_aops = {
401 .writepage = nilfs_mdt_write_page, 401 .writepage = nilfs_mdt_write_page,
402 .sync_page = block_sync_page,
403}; 402};
404 403
405static const struct inode_operations def_mdt_iops; 404static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 437 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
439} 438}
440 439
441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/** 440/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file 441 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file 442 * @inode: inode of the metadata file
@@ -454,10 +449,10 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
454 struct backing_dev_info *bdi = inode->i_sb->s_bdi; 449 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
455 450
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 453 nilfs_mapping_init(&shadow->frozen_data, bdi);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
461 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
462 return 0; 457 return 0;
463} 458}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index b13734bf3521..ed68563ec708 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -66,7 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
66 66
67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
68{ 68{
69 return NILFS_SB(inode->i_sb)->s_nilfs; 69 return inode->i_sb->s_fs_info;
70} 70}
71 71
72/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); 397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
398 if (!new_de) 398 if (!new_de)
399 goto out_dir; 399 goto out_dir;
400 inc_nlink(old_inode);
401 nilfs_set_link(new_dir, new_de, new_page, old_inode); 400 nilfs_set_link(new_dir, new_de, new_page, old_inode);
402 nilfs_mark_inode_dirty(new_dir); 401 nilfs_mark_inode_dirty(new_dir);
403 new_inode->i_ctime = CURRENT_TIME; 402 new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
411 if (new_dir->i_nlink >= NILFS_LINK_MAX) 410 if (new_dir->i_nlink >= NILFS_LINK_MAX)
412 goto out_dir; 411 goto out_dir;
413 } 412 }
414 inc_nlink(old_inode);
415 err = nilfs_add_link(new_dentry, old_inode); 413 err = nilfs_add_link(new_dentry, old_inode);
416 if (err) { 414 if (err)
417 drop_nlink(old_inode);
418 nilfs_mark_inode_dirty(old_inode);
419 goto out_dir; 415 goto out_dir;
420 }
421 if (dir_de) { 416 if (dir_de) {
422 inc_nlink(new_dir); 417 inc_nlink(new_dir);
423 nilfs_mark_inode_dirty(new_dir); 418 nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
431 old_inode->i_ctime = CURRENT_TIME; 426 old_inode->i_ctime = CURRENT_TIME;
432 427
433 nilfs_delete_entry(old_de, old_page); 428 nilfs_delete_entry(old_de, old_page);
434 drop_nlink(old_inode);
435 429
436 if (dir_de) { 430 if (dir_de) {
437 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
@@ -488,7 +482,7 @@ static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) 482 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE); 483 return ERR_PTR(-ESTALE);
490 484
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 485 root = nilfs_lookup_root(sb->s_fs_info, cno);
492 if (!root) 486 if (!root)
493 return ERR_PTR(-ESTALE); 487 return ERR_PTR(-ESTALE);
494 488
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 777e8fd04304..856e8e4e0b74 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -30,7 +30,6 @@
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h> 31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h" 32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h" 33#include "bmap.h"
35 34
36/* 35/*
@@ -122,7 +121,7 @@ enum {
122#define NILFS_SYS_INO_BITS \ 121#define NILFS_SYS_INO_BITS \
123 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) 122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
124 123
125#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) 124#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
126 125
127#define NILFS_MDT_INODE(sb, ino) \ 126#define NILFS_MDT_INODE(sb, ino) \
128 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) 127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
@@ -212,6 +211,23 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
212 211
213#define NILFS_ATIME_DISABLE 212#define NILFS_ATIME_DISABLE
214 213
214/* Flags that should be inherited by new inodes from their parent. */
215#define NILFS_FL_INHERITED \
216 (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \
217 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
218 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
219
220/* Mask out flags that are inappropriate for the given type of inode. */
221static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
222{
223 if (S_ISDIR(mode))
224 return flags;
225 else if (S_ISREG(mode))
226 return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
227 else
228 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
229}
230
215/* dir.c */ 231/* dir.c */
216extern int nilfs_add_link(struct dentry *, struct inode *); 232extern int nilfs_add_link(struct dentry *, struct inode *);
217extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); 233extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
@@ -229,10 +245,13 @@ extern int nilfs_sync_file(struct file *, int);
229 245
230/* ioctl.c */ 246/* ioctl.c */
231long nilfs_ioctl(struct file *, unsigned int, unsigned long); 247long nilfs_ioctl(struct file *, unsigned int, unsigned long);
248long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
232int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, 249int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
233 void **); 250 void **);
234 251
235/* inode.c */ 252/* inode.c */
253void nilfs_inode_add_blocks(struct inode *inode, int n);
254void nilfs_inode_sub_blocks(struct inode *inode, int n);
236extern struct inode *nilfs_new_inode(struct inode *, int); 255extern struct inode *nilfs_new_inode(struct inode *, int);
237extern void nilfs_free_inode(struct inode *); 256extern void nilfs_free_inode(struct inode *);
238extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 257extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -275,11 +294,11 @@ extern int nilfs_check_feature_compatibility(struct super_block *,
275 struct nilfs_super_block *); 294 struct nilfs_super_block *);
276extern void nilfs_set_log_cursor(struct nilfs_super_block *, 295extern void nilfs_set_log_cursor(struct nilfs_super_block *,
277 struct the_nilfs *); 296 struct the_nilfs *);
278extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, 297struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
279 int flip); 298 int flip);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int); 299int nilfs_commit_super(struct super_block *sb, int flag);
281extern int nilfs_cleanup_super(struct nilfs_sb_info *); 300int nilfs_cleanup_super(struct super_block *sb);
282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 301int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
283 struct nilfs_root **root); 302 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); 303int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
285 304
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..4d2a1ee0eb47 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,29 +492,15 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi)
510 const struct address_space_operations *aops)
511{ 497{
512 mapping->host = NULL; 498 mapping->host = NULL;
513 mapping->flags = 0; 499 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS); 500 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL; 501 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi; 502 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops; 503 mapping->a_ops = NULL;
518} 504}
519 505
520/* 506/*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..f06b79ad7493 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,10 +61,8 @@ void nilfs_free_private_page(struct page *);
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping, 64void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 65 struct backing_dev_info *bdi);
67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 66unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 67unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk, 68 sector_t start_blk,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 3dfcd3b7d389..ba4a64518f38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -425,7 +425,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
425} 425}
426 426
427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, 427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
428 struct nilfs_sb_info *sbi, 428 struct super_block *sb,
429 struct nilfs_recovery_info *ri) 429 struct nilfs_recovery_info *ri)
430{ 430{
431 struct list_head *head = &ri->ri_used_segments; 431 struct list_head *head = &ri->ri_used_segments;
@@ -501,7 +501,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
501} 501}
502 502
503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
504 struct nilfs_sb_info *sbi, 504 struct super_block *sb,
505 struct nilfs_root *root, 505 struct nilfs_root *root,
506 struct list_head *head, 506 struct list_head *head,
507 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
@@ -514,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
514 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
515 515
516 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
517 inode = nilfs_iget(sbi->s_super, root, rb->ino); 517 inode = nilfs_iget(sb, root, rb->ino);
518 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
519 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
520 inode = NULL; 520 inode = NULL;
@@ -572,11 +572,11 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
572 * nilfs_do_roll_forward - salvage logical segments newer than the latest 572 * nilfs_do_roll_forward - salvage logical segments newer than the latest
573 * checkpoint 573 * checkpoint
574 * @nilfs: nilfs object 574 * @nilfs: nilfs object
575 * @sbi: nilfs_sb_info 575 * @sb: super block instance
576 * @ri: pointer to a nilfs_recovery_info 576 * @ri: pointer to a nilfs_recovery_info
577 */ 577 */
578static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
579 struct nilfs_sb_info *sbi, 579 struct super_block *sb,
580 struct nilfs_root *root, 580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
@@ -648,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
648 goto failed; 648 goto failed;
649 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
650 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
651 nilfs, sbi, root, &dsync_blocks, 651 nilfs, sb, root, &dsync_blocks,
652 &nsalvaged_blocks); 652 &nsalvaged_blocks);
653 if (unlikely(err)) 653 if (unlikely(err))
654 goto failed; 654 goto failed;
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
681 681
682 if (nsalvaged_blocks) { 682 if (nsalvaged_blocks) {
683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", 683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
684 sbi->s_super->s_id, nsalvaged_blocks); 684 sb->s_id, nsalvaged_blocks);
685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; 685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
686 } 686 }
687 out: 687 out:
@@ -695,7 +695,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
695 printk(KERN_ERR 695 printk(KERN_ERR
696 "NILFS (device %s): Error roll-forwarding " 696 "NILFS (device %s): Error roll-forwarding "
697 "(err=%d, pseg block=%llu). ", 697 "(err=%d, pseg block=%llu). ",
698 sbi->s_super->s_id, err, (unsigned long long)pseg_start); 698 sb->s_id, err, (unsigned long long)pseg_start);
699 goto out; 699 goto out;
700} 700}
701 701
@@ -724,7 +724,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
724/** 724/**
725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint 725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
726 * @nilfs: nilfs object 726 * @nilfs: nilfs object
727 * @sbi: nilfs_sb_info 727 * @sb: super block instance
728 * @ri: pointer to a nilfs_recovery_info struct to store search results. 728 * @ri: pointer to a nilfs_recovery_info struct to store search results.
729 * 729 *
730 * Return Value: On success, 0 is returned. On error, one of the following 730 * Return Value: On success, 0 is returned. On error, one of the following
@@ -741,7 +741,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
741 * %-ENOMEM - Insufficient memory available. 741 * %-ENOMEM - Insufficient memory available.
742 */ 742 */
743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, 743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
744 struct nilfs_sb_info *sbi, 744 struct super_block *sb,
745 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
746{ 746{
747 struct nilfs_root *root; 747 struct nilfs_root *root;
@@ -750,32 +750,32 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
751 return 0; 751 return 0;
752 752
753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); 753 err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
754 if (unlikely(err)) { 754 if (unlikely(err)) {
755 printk(KERN_ERR 755 printk(KERN_ERR
756 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
757 return err; 757 return err;
758 } 758 }
759 759
760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri); 760 err = nilfs_do_roll_forward(nilfs, sb, root, ri);
761 if (unlikely(err)) 761 if (unlikely(err))
762 goto failed; 762 goto failed;
763 763
764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { 764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
765 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri); 765 err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
766 if (unlikely(err)) { 766 if (unlikely(err)) {
767 printk(KERN_ERR "NILFS: Error preparing segments for " 767 printk(KERN_ERR "NILFS: Error preparing segments for "
768 "recovery.\n"); 768 "recovery.\n");
769 goto failed; 769 goto failed;
770 } 770 }
771 771
772 err = nilfs_attach_segment_constructor(sbi, root); 772 err = nilfs_attach_log_writer(sb, root);
773 if (unlikely(err)) 773 if (unlikely(err))
774 goto failed; 774 goto failed;
775 775
776 set_nilfs_discontinued(nilfs); 776 set_nilfs_discontinued(nilfs);
777 err = nilfs_construct_segment(sbi->s_super); 777 err = nilfs_construct_segment(sb);
778 nilfs_detach_segment_constructor(sbi); 778 nilfs_detach_log_writer(sb);
779 779
780 if (unlikely(err)) { 780 if (unlikely(err)) {
781 printk(KERN_ERR "NILFS: Oops! recovery failed. " 781 printk(KERN_ERR "NILFS: Oops! recovery failed. "
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
deleted file mode 100644
index 7a17715f215f..000000000000
--- a/fs/nilfs2/sb.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30struct the_nilfs;
31struct nilfs_sc_info;
32
33/*
34 * NILFS super-block data in memory
35 */
36struct nilfs_sb_info {
37 /* Mount options */
38 unsigned long s_mount_opt;
39 uid_t s_resuid;
40 gid_t s_resgid;
41
42 unsigned long s_interval; /* construction interval */
43 unsigned long s_watermark; /* threshold of data amount
44 for the segment construction */
45
46 /* Fundamental members */
47 struct super_block *s_super; /* reverse pointer to super_block */
48 struct the_nilfs *s_nilfs;
49
50 /* Segment constructor */
51 struct list_head s_dirty_files; /* dirty files list */
52 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
54 It covers s_dirty_files list */
55
56 /* Inode allocator */
57 spinlock_t s_next_gen_lock;
58 u32 s_next_generation;
59};
60
61static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
62{
63 return sb->s_fs_info;
64}
65
66static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
67{
68 return sbi->s_sc_info;
69}
70
71/*
72 * Bit operations for the mount option
73 */
74#define nilfs_clear_opt(sbi, opt) \
75 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
76#define nilfs_set_opt(sbi, opt) \
77 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
78#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
79#define nilfs_write_opt(sbi, mask, opt) \
80 do { (sbi)->s_mount_opt = \
81 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
82 NILFS_MOUNT_##opt); \
83 } while (0)
84
85#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
509 * Last BIO is always sent through the following 509 * Last BIO is always sent through the following
510 * submission. 510 * submission.
511 */ 511 */
512 rw |= REQ_SYNC | REQ_UNPLUG; 512 rw |= REQ_SYNC;
513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); 513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
514 } 514 }
515 515
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..afe4f2183454 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -104,8 +104,7 @@ struct nilfs_sc_operations {
104static void nilfs_segctor_start_timer(struct nilfs_sc_info *); 104static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); 105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); 106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
107static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, 107static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
108 int);
109 108
110#define nilfs_cnt32_gt(a, b) \ 109#define nilfs_cnt32_gt(a, b) \
111 (typecheck(__u32, a) && typecheck(__u32, b) && \ 110 (typecheck(__u32, a) && typecheck(__u32, b) && \
@@ -182,7 +181,6 @@ int nilfs_transaction_begin(struct super_block *sb,
182 struct nilfs_transaction_info *ti, 181 struct nilfs_transaction_info *ti,
183 int vacancy_check) 182 int vacancy_check)
184{ 183{
185 struct nilfs_sb_info *sbi;
186 struct the_nilfs *nilfs; 184 struct the_nilfs *nilfs;
187 int ret = nilfs_prepare_segment_lock(ti); 185 int ret = nilfs_prepare_segment_lock(ti);
188 186
@@ -193,8 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
193 191
194 vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195 193
196 sbi = NILFS_SB(sb); 194 nilfs = sb->s_fs_info;
197 nilfs = sbi->s_nilfs;
198 down_read(&nilfs->ns_segctor_sem); 195 down_read(&nilfs->ns_segctor_sem);
199 if (vacancy_check && nilfs_near_disk_full(nilfs)) { 196 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
200 up_read(&nilfs->ns_segctor_sem); 197 up_read(&nilfs->ns_segctor_sem);
@@ -225,8 +222,7 @@ int nilfs_transaction_begin(struct super_block *sb,
225int nilfs_transaction_commit(struct super_block *sb) 222int nilfs_transaction_commit(struct super_block *sb)
226{ 223{
227 struct nilfs_transaction_info *ti = current->journal_info; 224 struct nilfs_transaction_info *ti = current->journal_info;
228 struct nilfs_sb_info *sbi; 225 struct the_nilfs *nilfs = sb->s_fs_info;
229 struct nilfs_sc_info *sci;
230 int err = 0; 226 int err = 0;
231 227
232 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 228 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -235,16 +231,15 @@ int nilfs_transaction_commit(struct super_block *sb)
235 ti->ti_count--; 231 ti->ti_count--;
236 return 0; 232 return 0;
237 } 233 }
238 sbi = NILFS_SB(sb); 234 if (nilfs->ns_writer) {
239 sci = NILFS_SC(sbi); 235 struct nilfs_sc_info *sci = nilfs->ns_writer;
240 if (sci != NULL) { 236
241 if (ti->ti_flags & NILFS_TI_COMMIT) 237 if (ti->ti_flags & NILFS_TI_COMMIT)
242 nilfs_segctor_start_timer(sci); 238 nilfs_segctor_start_timer(sci);
243 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > 239 if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
244 sci->sc_watermark)
245 nilfs_segctor_do_flush(sci, 0); 240 nilfs_segctor_do_flush(sci, 0);
246 } 241 }
247 up_read(&sbi->s_nilfs->ns_segctor_sem); 242 up_read(&nilfs->ns_segctor_sem);
248 current->journal_info = ti->ti_save; 243 current->journal_info = ti->ti_save;
249 244
250 if (ti->ti_flags & NILFS_TI_SYNC) 245 if (ti->ti_flags & NILFS_TI_SYNC)
@@ -257,13 +252,14 @@ int nilfs_transaction_commit(struct super_block *sb)
257void nilfs_transaction_abort(struct super_block *sb) 252void nilfs_transaction_abort(struct super_block *sb)
258{ 253{
259 struct nilfs_transaction_info *ti = current->journal_info; 254 struct nilfs_transaction_info *ti = current->journal_info;
255 struct the_nilfs *nilfs = sb->s_fs_info;
260 256
261 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 257 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
262 if (ti->ti_count > 0) { 258 if (ti->ti_count > 0) {
263 ti->ti_count--; 259 ti->ti_count--;
264 return; 260 return;
265 } 261 }
266 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); 262 up_read(&nilfs->ns_segctor_sem);
267 263
268 current->journal_info = ti->ti_save; 264 current->journal_info = ti->ti_save;
269 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
@@ -272,9 +268,8 @@ void nilfs_transaction_abort(struct super_block *sb)
272 268
273void nilfs_relax_pressure_in_lock(struct super_block *sb) 269void nilfs_relax_pressure_in_lock(struct super_block *sb)
274{ 270{
275 struct nilfs_sb_info *sbi = NILFS_SB(sb); 271 struct the_nilfs *nilfs = sb->s_fs_info;
276 struct nilfs_sc_info *sci = NILFS_SC(sbi); 272 struct nilfs_sc_info *sci = nilfs->ns_writer;
277 struct the_nilfs *nilfs = sbi->s_nilfs;
278 273
279 if (!sci || !sci->sc_flush_request) 274 if (!sci || !sci->sc_flush_request)
280 return; 275 return;
@@ -294,11 +289,13 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
294 downgrade_write(&nilfs->ns_segctor_sem); 289 downgrade_write(&nilfs->ns_segctor_sem);
295} 290}
296 291
297static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, 292static void nilfs_transaction_lock(struct super_block *sb,
298 struct nilfs_transaction_info *ti, 293 struct nilfs_transaction_info *ti,
299 int gcflag) 294 int gcflag)
300{ 295{
301 struct nilfs_transaction_info *cur_ti = current->journal_info; 296 struct nilfs_transaction_info *cur_ti = current->journal_info;
297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct nilfs_sc_info *sci = nilfs->ns_writer;
302 299
303 WARN_ON(cur_ti); 300 WARN_ON(cur_ti);
304 ti->ti_flags = NILFS_TI_WRITER; 301 ti->ti_flags = NILFS_TI_WRITER;
@@ -309,30 +306,31 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
309 current->journal_info = ti; 306 current->journal_info = ti;
310 307
311 for (;;) { 308 for (;;) {
312 down_write(&sbi->s_nilfs->ns_segctor_sem); 309 down_write(&nilfs->ns_segctor_sem);
313 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) 310 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
314 break; 311 break;
315 312
316 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); 313 nilfs_segctor_do_immediate_flush(sci);
317 314
318 up_write(&sbi->s_nilfs->ns_segctor_sem); 315 up_write(&nilfs->ns_segctor_sem);
319 yield(); 316 yield();
320 } 317 }
321 if (gcflag) 318 if (gcflag)
322 ti->ti_flags |= NILFS_TI_GC; 319 ti->ti_flags |= NILFS_TI_GC;
323} 320}
324 321
325static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) 322static void nilfs_transaction_unlock(struct super_block *sb)
326{ 323{
327 struct nilfs_transaction_info *ti = current->journal_info; 324 struct nilfs_transaction_info *ti = current->journal_info;
325 struct the_nilfs *nilfs = sb->s_fs_info;
328 326
329 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 327 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
330 BUG_ON(ti->ti_count > 0); 328 BUG_ON(ti->ti_count > 0);
331 329
332 up_write(&sbi->s_nilfs->ns_segctor_sem); 330 up_write(&nilfs->ns_segctor_sem);
333 current->journal_info = ti->ti_save; 331 current->journal_info = ti->ti_save;
334 if (!list_empty(&ti->ti_garbage)) 332 if (!list_empty(&ti->ti_garbage))
335 nilfs_dispose_list(sbi, &ti->ti_garbage, 0); 333 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
336} 334}
337 335
338static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -430,7 +428,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
430 nilfs_segctor_map_segsum_entry( 428 nilfs_segctor_map_segsum_entry(
431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); 429 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
432 430
433 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) 431 if (NILFS_I(inode)->i_root &&
432 !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
434 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 433 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
435 /* skip finfo */ 434 /* skip finfo */
436} 435}
@@ -713,7 +712,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
713 } 712 }
714} 713}
715 714
716static void nilfs_dispose_list(struct nilfs_sb_info *sbi, 715static void nilfs_dispose_list(struct the_nilfs *nilfs,
717 struct list_head *head, int force) 716 struct list_head *head, int force)
718{ 717{
719 struct nilfs_inode_info *ii, *n; 718 struct nilfs_inode_info *ii, *n;
@@ -721,7 +720,7 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
721 unsigned nv = 0; 720 unsigned nv = 0;
722 721
723 while (!list_empty(head)) { 722 while (!list_empty(head)) {
724 spin_lock(&sbi->s_inode_lock); 723 spin_lock(&nilfs->ns_inode_lock);
725 list_for_each_entry_safe(ii, n, head, i_dirty) { 724 list_for_each_entry_safe(ii, n, head, i_dirty) {
726 list_del_init(&ii->i_dirty); 725 list_del_init(&ii->i_dirty);
727 if (force) { 726 if (force) {
@@ -732,14 +731,14 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
732 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { 731 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
733 set_bit(NILFS_I_QUEUED, &ii->i_state); 732 set_bit(NILFS_I_QUEUED, &ii->i_state);
734 list_add_tail(&ii->i_dirty, 733 list_add_tail(&ii->i_dirty,
735 &sbi->s_dirty_files); 734 &nilfs->ns_dirty_files);
736 continue; 735 continue;
737 } 736 }
738 ivec[nv++] = ii; 737 ivec[nv++] = ii;
739 if (nv == SC_N_INODEVEC) 738 if (nv == SC_N_INODEVEC)
740 break; 739 break;
741 } 740 }
742 spin_unlock(&sbi->s_inode_lock); 741 spin_unlock(&nilfs->ns_inode_lock);
743 742
744 for (pii = ivec; nv > 0; pii++, nv--) 743 for (pii = ivec; nv > 0; pii++, nv--)
745 iput(&(*pii)->vfs_inode); 744 iput(&(*pii)->vfs_inode);
@@ -772,24 +771,23 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
772 771
773static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) 772static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
774{ 773{
775 struct nilfs_sb_info *sbi = sci->sc_sbi; 774 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
776 int ret = 0; 775 int ret = 0;
777 776
778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root)) 777 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 778 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
780 779
781 spin_lock(&sbi->s_inode_lock); 780 spin_lock(&nilfs->ns_inode_lock);
782 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) 781 if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
783 ret++; 782 ret++;
784 783
785 spin_unlock(&sbi->s_inode_lock); 784 spin_unlock(&nilfs->ns_inode_lock);
786 return ret; 785 return ret;
787} 786}
788 787
789static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) 788static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
790{ 789{
791 struct nilfs_sb_info *sbi = sci->sc_sbi; 790 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
792 struct the_nilfs *nilfs = sbi->s_nilfs;
793 791
794 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 792 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 793 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -799,7 +797,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
799 797
800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 798static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
801{ 799{
802 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 800 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
803 struct buffer_head *bh_cp; 801 struct buffer_head *bh_cp;
804 struct nilfs_checkpoint *raw_cp; 802 struct nilfs_checkpoint *raw_cp;
805 int err; 803 int err;
@@ -823,8 +821,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
823 821
824static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) 822static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
825{ 823{
826 struct nilfs_sb_info *sbi = sci->sc_sbi; 824 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
827 struct the_nilfs *nilfs = sbi->s_nilfs;
828 struct buffer_head *bh_cp; 825 struct buffer_head *bh_cp;
829 struct nilfs_checkpoint *raw_cp; 826 struct nilfs_checkpoint *raw_cp;
830 int err; 827 int err;
@@ -1048,8 +1045,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1048 1045
1049static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) 1046static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1050{ 1047{
1051 struct nilfs_sb_info *sbi = sci->sc_sbi; 1048 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1052 struct the_nilfs *nilfs = sbi->s_nilfs;
1053 struct list_head *head; 1049 struct list_head *head;
1054 struct nilfs_inode_info *ii; 1050 struct nilfs_inode_info *ii;
1055 size_t ndone; 1051 size_t ndone;
@@ -1858,7 +1854,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1858{ 1854{
1859 struct nilfs_segment_buffer *segbuf; 1855 struct nilfs_segment_buffer *segbuf;
1860 struct page *bd_page = NULL, *fs_page = NULL; 1856 struct page *bd_page = NULL, *fs_page = NULL;
1861 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1857 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1862 int update_sr = false; 1858 int update_sr = false;
1863 1859
1864 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1860 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -1962,30 +1958,30 @@ static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
1962 return ret; 1958 return ret;
1963} 1959}
1964 1960
1965static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 1961static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1966 struct nilfs_sb_info *sbi) 1962 struct the_nilfs *nilfs)
1967{ 1963{
1968 struct nilfs_inode_info *ii, *n; 1964 struct nilfs_inode_info *ii, *n;
1969 struct inode *ifile = sci->sc_root->ifile; 1965 struct inode *ifile = sci->sc_root->ifile;
1970 1966
1971 spin_lock(&sbi->s_inode_lock); 1967 spin_lock(&nilfs->ns_inode_lock);
1972 retry: 1968 retry:
1973 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { 1969 list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
1974 if (!ii->i_bh) { 1970 if (!ii->i_bh) {
1975 struct buffer_head *ibh; 1971 struct buffer_head *ibh;
1976 int err; 1972 int err;
1977 1973
1978 spin_unlock(&sbi->s_inode_lock); 1974 spin_unlock(&nilfs->ns_inode_lock);
1979 err = nilfs_ifile_get_inode_block( 1975 err = nilfs_ifile_get_inode_block(
1980 ifile, ii->vfs_inode.i_ino, &ibh); 1976 ifile, ii->vfs_inode.i_ino, &ibh);
1981 if (unlikely(err)) { 1977 if (unlikely(err)) {
1982 nilfs_warning(sbi->s_super, __func__, 1978 nilfs_warning(sci->sc_super, __func__,
1983 "failed to get inode block.\n"); 1979 "failed to get inode block.\n");
1984 return err; 1980 return err;
1985 } 1981 }
1986 nilfs_mdt_mark_buffer_dirty(ibh); 1982 nilfs_mdt_mark_buffer_dirty(ibh);
1987 nilfs_mdt_mark_dirty(ifile); 1983 nilfs_mdt_mark_dirty(ifile);
1988 spin_lock(&sbi->s_inode_lock); 1984 spin_lock(&nilfs->ns_inode_lock);
1989 if (likely(!ii->i_bh)) 1985 if (likely(!ii->i_bh))
1990 ii->i_bh = ibh; 1986 ii->i_bh = ibh;
1991 else 1987 else
@@ -1998,18 +1994,18 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1998 list_del(&ii->i_dirty); 1994 list_del(&ii->i_dirty);
1999 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); 1995 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2000 } 1996 }
2001 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&nilfs->ns_inode_lock);
2002 1998
2003 return 0; 1999 return 0;
2004} 2000}
2005 2001
2006static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, 2002static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2007 struct nilfs_sb_info *sbi) 2003 struct the_nilfs *nilfs)
2008{ 2004{
2009 struct nilfs_transaction_info *ti = current->journal_info; 2005 struct nilfs_transaction_info *ti = current->journal_info;
2010 struct nilfs_inode_info *ii, *n; 2006 struct nilfs_inode_info *ii, *n;
2011 2007
2012 spin_lock(&sbi->s_inode_lock); 2008 spin_lock(&nilfs->ns_inode_lock);
2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2009 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2010 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2015 test_bit(NILFS_I_DIRTY, &ii->i_state)) 2011 test_bit(NILFS_I_DIRTY, &ii->i_state))
@@ -2021,7 +2017,7 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021 list_del(&ii->i_dirty); 2017 list_del(&ii->i_dirty);
2022 list_add_tail(&ii->i_dirty, &ti->ti_garbage); 2018 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2023 } 2019 }
2024 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&nilfs->ns_inode_lock);
2025} 2021}
2026 2022
2027/* 2023/*
@@ -2029,15 +2025,14 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2029 */ 2025 */
2030static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) 2026static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2031{ 2027{
2032 struct nilfs_sb_info *sbi = sci->sc_sbi; 2028 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2033 struct the_nilfs *nilfs = sbi->s_nilfs;
2034 struct page *failed_page; 2029 struct page *failed_page;
2035 int err; 2030 int err;
2036 2031
2037 sci->sc_stage.scnt = NILFS_ST_INIT; 2032 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno; 2033 sci->sc_cno = nilfs->ns_cno;
2039 2034
2040 err = nilfs_segctor_check_in_files(sci, sbi); 2035 err = nilfs_segctor_collect_dirty_files(sci, nilfs);
2041 if (unlikely(err)) 2036 if (unlikely(err))
2042 goto out; 2037 goto out;
2043 2038
@@ -2115,7 +2110,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2110 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2116 2111
2117 out: 2112 out:
2118 nilfs_segctor_check_out_files(sci, sbi); 2113 nilfs_segctor_drop_written_files(sci, nilfs);
2119 return err; 2114 return err;
2120 2115
2121 failed_to_write: 2116 failed_to_write:
@@ -2168,8 +2163,8 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2168 */ 2163 */
2169void nilfs_flush_segment(struct super_block *sb, ino_t ino) 2164void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2170{ 2165{
2171 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2166 struct the_nilfs *nilfs = sb->s_fs_info;
2172 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2167 struct nilfs_sc_info *sci = nilfs->ns_writer;
2173 2168
2174 if (!sci || nilfs_doing_construction()) 2169 if (!sci || nilfs_doing_construction())
2175 return; 2170 return;
@@ -2258,8 +2253,8 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2258 */ 2253 */
2259int nilfs_construct_segment(struct super_block *sb) 2254int nilfs_construct_segment(struct super_block *sb)
2260{ 2255{
2261 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2256 struct the_nilfs *nilfs = sb->s_fs_info;
2262 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2257 struct nilfs_sc_info *sci = nilfs->ns_writer;
2263 struct nilfs_transaction_info *ti; 2258 struct nilfs_transaction_info *ti;
2264 int err; 2259 int err;
2265 2260
@@ -2296,8 +2291,8 @@ int nilfs_construct_segment(struct super_block *sb)
2296int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, 2291int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2297 loff_t start, loff_t end) 2292 loff_t start, loff_t end)
2298{ 2293{
2299 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2294 struct the_nilfs *nilfs = sb->s_fs_info;
2300 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2295 struct nilfs_sc_info *sci = nilfs->ns_writer;
2301 struct nilfs_inode_info *ii; 2296 struct nilfs_inode_info *ii;
2302 struct nilfs_transaction_info ti; 2297 struct nilfs_transaction_info ti;
2303 int err = 0; 2298 int err = 0;
@@ -2305,33 +2300,33 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2305 if (!sci) 2300 if (!sci)
2306 return -EROFS; 2301 return -EROFS;
2307 2302
2308 nilfs_transaction_lock(sbi, &ti, 0); 2303 nilfs_transaction_lock(sb, &ti, 0);
2309 2304
2310 ii = NILFS_I(inode); 2305 ii = NILFS_I(inode);
2311 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || 2306 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2312 nilfs_test_opt(sbi, STRICT_ORDER) || 2307 nilfs_test_opt(nilfs, STRICT_ORDER) ||
2313 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || 2308 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2314 nilfs_discontinued(sbi->s_nilfs)) { 2309 nilfs_discontinued(nilfs)) {
2315 nilfs_transaction_unlock(sbi); 2310 nilfs_transaction_unlock(sb);
2316 err = nilfs_segctor_sync(sci); 2311 err = nilfs_segctor_sync(sci);
2317 return err; 2312 return err;
2318 } 2313 }
2319 2314
2320 spin_lock(&sbi->s_inode_lock); 2315 spin_lock(&nilfs->ns_inode_lock);
2321 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 2316 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2322 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 2317 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2323 spin_unlock(&sbi->s_inode_lock); 2318 spin_unlock(&nilfs->ns_inode_lock);
2324 nilfs_transaction_unlock(sbi); 2319 nilfs_transaction_unlock(sb);
2325 return 0; 2320 return 0;
2326 } 2321 }
2327 spin_unlock(&sbi->s_inode_lock); 2322 spin_unlock(&nilfs->ns_inode_lock);
2328 sci->sc_dsync_inode = ii; 2323 sci->sc_dsync_inode = ii;
2329 sci->sc_dsync_start = start; 2324 sci->sc_dsync_start = start;
2330 sci->sc_dsync_end = end; 2325 sci->sc_dsync_end = end;
2331 2326
2332 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); 2327 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2333 2328
2334 nilfs_transaction_unlock(sbi); 2329 nilfs_transaction_unlock(sb);
2335 return err; 2330 return err;
2336} 2331}
2337 2332
@@ -2387,8 +2382,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2387 */ 2382 */
2388static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) 2383static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2389{ 2384{
2390 struct nilfs_sb_info *sbi = sci->sc_sbi; 2385 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2391 struct the_nilfs *nilfs = sbi->s_nilfs;
2392 struct nilfs_super_block **sbp; 2386 struct nilfs_super_block **sbp;
2393 int err = 0; 2387 int err = 0;
2394 2388
@@ -2406,11 +2400,12 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2406 nilfs_discontinued(nilfs)) { 2400 nilfs_discontinued(nilfs)) {
2407 down_write(&nilfs->ns_sem); 2401 down_write(&nilfs->ns_sem);
2408 err = -EIO; 2402 err = -EIO;
2409 sbp = nilfs_prepare_super(sbi, 2403 sbp = nilfs_prepare_super(sci->sc_super,
2410 nilfs_sb_will_flip(nilfs)); 2404 nilfs_sb_will_flip(nilfs));
2411 if (likely(sbp)) { 2405 if (likely(sbp)) {
2412 nilfs_set_log_cursor(sbp[0], nilfs); 2406 nilfs_set_log_cursor(sbp[0], nilfs);
2413 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); 2407 err = nilfs_commit_super(sci->sc_super,
2408 NILFS_SB_COMMIT);
2414 } 2409 }
2415 up_write(&nilfs->ns_sem); 2410 up_write(&nilfs->ns_sem);
2416 } 2411 }
@@ -2442,16 +2437,15 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2442int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, 2437int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2443 void **kbufs) 2438 void **kbufs)
2444{ 2439{
2445 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2440 struct the_nilfs *nilfs = sb->s_fs_info;
2446 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2441 struct nilfs_sc_info *sci = nilfs->ns_writer;
2447 struct the_nilfs *nilfs = sbi->s_nilfs;
2448 struct nilfs_transaction_info ti; 2442 struct nilfs_transaction_info ti;
2449 int err; 2443 int err;
2450 2444
2451 if (unlikely(!sci)) 2445 if (unlikely(!sci))
2452 return -EROFS; 2446 return -EROFS;
2453 2447
2454 nilfs_transaction_lock(sbi, &ti, 1); 2448 nilfs_transaction_lock(sb, &ti, 1);
2455 2449
2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); 2450 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2457 if (unlikely(err)) 2451 if (unlikely(err))
@@ -2479,14 +2473,14 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2479 set_current_state(TASK_INTERRUPTIBLE); 2473 set_current_state(TASK_INTERRUPTIBLE);
2480 schedule_timeout(sci->sc_interval); 2474 schedule_timeout(sci->sc_interval);
2481 } 2475 }
2482 if (nilfs_test_opt(sbi, DISCARD)) { 2476 if (nilfs_test_opt(nilfs, DISCARD)) {
2483 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, 2477 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2484 sci->sc_nfreesegs); 2478 sci->sc_nfreesegs);
2485 if (ret) { 2479 if (ret) {
2486 printk(KERN_WARNING 2480 printk(KERN_WARNING
2487 "NILFS warning: error %d on discard request, " 2481 "NILFS warning: error %d on discard request, "
2488 "turning discards off for the device\n", ret); 2482 "turning discards off for the device\n", ret);
2489 nilfs_clear_opt(sbi, DISCARD); 2483 nilfs_clear_opt(nilfs, DISCARD);
2490 } 2484 }
2491 } 2485 }
2492 2486
@@ -2494,16 +2488,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2494 sci->sc_freesegs = NULL; 2488 sci->sc_freesegs = NULL;
2495 sci->sc_nfreesegs = 0; 2489 sci->sc_nfreesegs = 0;
2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat); 2490 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2497 nilfs_transaction_unlock(sbi); 2491 nilfs_transaction_unlock(sb);
2498 return err; 2492 return err;
2499} 2493}
2500 2494
2501static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) 2495static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2502{ 2496{
2503 struct nilfs_sb_info *sbi = sci->sc_sbi;
2504 struct nilfs_transaction_info ti; 2497 struct nilfs_transaction_info ti;
2505 2498
2506 nilfs_transaction_lock(sbi, &ti, 0); 2499 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2507 nilfs_segctor_construct(sci, mode); 2500 nilfs_segctor_construct(sci, mode);
2508 2501
2509 /* 2502 /*
@@ -2514,7 +2507,7 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2514 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) 2507 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2515 nilfs_segctor_start_timer(sci); 2508 nilfs_segctor_start_timer(sci);
2516 2509
2517 nilfs_transaction_unlock(sbi); 2510 nilfs_transaction_unlock(sci->sc_super);
2518} 2511}
2519 2512
2520static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) 2513static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
@@ -2560,7 +2553,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2560static int nilfs_segctor_thread(void *arg) 2553static int nilfs_segctor_thread(void *arg)
2561{ 2554{
2562 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2555 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2563 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2556 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2564 int timeout = 0; 2557 int timeout = 0;
2565 2558
2566 sci->sc_timer.data = (unsigned long)current; 2559 sci->sc_timer.data = (unsigned long)current;
@@ -2671,17 +2664,17 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2671/* 2664/*
2672 * Setup & clean-up functions 2665 * Setup & clean-up functions
2673 */ 2666 */
2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi, 2667static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2675 struct nilfs_root *root) 2668 struct nilfs_root *root)
2676{ 2669{
2670 struct the_nilfs *nilfs = sb->s_fs_info;
2677 struct nilfs_sc_info *sci; 2671 struct nilfs_sc_info *sci;
2678 2672
2679 sci = kzalloc(sizeof(*sci), GFP_KERNEL); 2673 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2680 if (!sci) 2674 if (!sci)
2681 return NULL; 2675 return NULL;
2682 2676
2683 sci->sc_sbi = sbi; 2677 sci->sc_super = sb;
2684 sci->sc_super = sbi->s_super;
2685 2678
2686 nilfs_get_root(root); 2679 nilfs_get_root(root);
2687 sci->sc_root = root; 2680 sci->sc_root = root;
@@ -2701,10 +2694,10 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2701 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2694 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2702 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; 2695 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2703 2696
2704 if (sbi->s_interval) 2697 if (nilfs->ns_interval)
2705 sci->sc_interval = sbi->s_interval; 2698 sci->sc_interval = nilfs->ns_interval;
2706 if (sbi->s_watermark) 2699 if (nilfs->ns_watermark)
2707 sci->sc_watermark = sbi->s_watermark; 2700 sci->sc_watermark = nilfs->ns_watermark;
2708 return sci; 2701 return sci;
2709} 2702}
2710 2703
@@ -2715,12 +2708,11 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2715 /* The segctord thread was stopped and its timer was removed. 2708 /* The segctord thread was stopped and its timer was removed.
2716 But some tasks remain. */ 2709 But some tasks remain. */
2717 do { 2710 do {
2718 struct nilfs_sb_info *sbi = sci->sc_sbi;
2719 struct nilfs_transaction_info ti; 2711 struct nilfs_transaction_info ti;
2720 2712
2721 nilfs_transaction_lock(sbi, &ti, 0); 2713 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2722 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2714 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2723 nilfs_transaction_unlock(sbi); 2715 nilfs_transaction_unlock(sci->sc_super);
2724 2716
2725 } while (ret && retrycount-- > 0); 2717 } while (ret && retrycount-- > 0);
2726} 2718}
@@ -2735,10 +2727,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2735 */ 2727 */
2736static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) 2728static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2737{ 2729{
2738 struct nilfs_sb_info *sbi = sci->sc_sbi; 2730 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2739 int flag; 2731 int flag;
2740 2732
2741 up_write(&sbi->s_nilfs->ns_segctor_sem); 2733 up_write(&nilfs->ns_segctor_sem);
2742 2734
2743 spin_lock(&sci->sc_state_lock); 2735 spin_lock(&sci->sc_state_lock);
2744 nilfs_segctor_kill_thread(sci); 2736 nilfs_segctor_kill_thread(sci);
@@ -2752,9 +2744,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2752 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2744 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2753 2745
2754 if (!list_empty(&sci->sc_dirty_files)) { 2746 if (!list_empty(&sci->sc_dirty_files)) {
2755 nilfs_warning(sbi->s_super, __func__, 2747 nilfs_warning(sci->sc_super, __func__,
2756 "dirty file(s) after the final construction\n"); 2748 "dirty file(s) after the final construction\n");
2757 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); 2749 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2758 } 2750 }
2759 2751
2760 WARN_ON(!list_empty(&sci->sc_segbufs)); 2752 WARN_ON(!list_empty(&sci->sc_segbufs));
@@ -2762,79 +2754,78 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2762 2754
2763 nilfs_put_root(sci->sc_root); 2755 nilfs_put_root(sci->sc_root);
2764 2756
2765 down_write(&sbi->s_nilfs->ns_segctor_sem); 2757 down_write(&nilfs->ns_segctor_sem);
2766 2758
2767 del_timer_sync(&sci->sc_timer); 2759 del_timer_sync(&sci->sc_timer);
2768 kfree(sci); 2760 kfree(sci);
2769} 2761}
2770 2762
2771/** 2763/**
2772 * nilfs_attach_segment_constructor - attach a segment constructor 2764 * nilfs_attach_log_writer - attach log writer
2773 * @sbi: nilfs_sb_info 2765 * @sb: super block instance
2774 * @root: root object of the current filesystem tree 2766 * @root: root object of the current filesystem tree
2775 * 2767 *
2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2768 * This allocates a log writer object, initializes it, and starts the
2777 * initializes it, and starts the segment constructor. 2769 * log writer.
2778 * 2770 *
2779 * Return Value: On success, 0 is returned. On error, one of the following 2771 * Return Value: On success, 0 is returned. On error, one of the following
2780 * negative error code is returned. 2772 * negative error code is returned.
2781 * 2773 *
2782 * %-ENOMEM - Insufficient memory available. 2774 * %-ENOMEM - Insufficient memory available.
2783 */ 2775 */
2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 2776int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
2785 struct nilfs_root *root)
2786{ 2777{
2778 struct the_nilfs *nilfs = sb->s_fs_info;
2787 int err; 2779 int err;
2788 2780
2789 if (NILFS_SC(sbi)) { 2781 if (nilfs->ns_writer) {
2790 /* 2782 /*
2791 * This happens if the filesystem was remounted 2783 * This happens if the filesystem was remounted
2792 * read/write after nilfs_error degenerated it into a 2784 * read/write after nilfs_error degenerated it into a
2793 * read-only mount. 2785 * read-only mount.
2794 */ 2786 */
2795 nilfs_detach_segment_constructor(sbi); 2787 nilfs_detach_log_writer(sb);
2796 } 2788 }
2797 2789
2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root); 2790 nilfs->ns_writer = nilfs_segctor_new(sb, root);
2799 if (!sbi->s_sc_info) 2791 if (!nilfs->ns_writer)
2800 return -ENOMEM; 2792 return -ENOMEM;
2801 2793
2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2794 err = nilfs_segctor_start_thread(nilfs->ns_writer);
2803 if (err) { 2795 if (err) {
2804 kfree(sbi->s_sc_info); 2796 kfree(nilfs->ns_writer);
2805 sbi->s_sc_info = NULL; 2797 nilfs->ns_writer = NULL;
2806 } 2798 }
2807 return err; 2799 return err;
2808} 2800}
2809 2801
2810/** 2802/**
2811 * nilfs_detach_segment_constructor - destroy the segment constructor 2803 * nilfs_detach_log_writer - destroy log writer
2812 * @sbi: nilfs_sb_info 2804 * @sb: super block instance
2813 * 2805 *
2814 * nilfs_detach_segment_constructor() kills the segment constructor daemon, 2806 * This kills log writer daemon, frees the log writer object, and
2815 * frees the struct nilfs_sc_info, and destroy the dirty file list. 2807 * destroys list of dirty files.
2816 */ 2808 */
2817void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) 2809void nilfs_detach_log_writer(struct super_block *sb)
2818{ 2810{
2819 struct the_nilfs *nilfs = sbi->s_nilfs; 2811 struct the_nilfs *nilfs = sb->s_fs_info;
2820 LIST_HEAD(garbage_list); 2812 LIST_HEAD(garbage_list);
2821 2813
2822 down_write(&nilfs->ns_segctor_sem); 2814 down_write(&nilfs->ns_segctor_sem);
2823 if (NILFS_SC(sbi)) { 2815 if (nilfs->ns_writer) {
2824 nilfs_segctor_destroy(NILFS_SC(sbi)); 2816 nilfs_segctor_destroy(nilfs->ns_writer);
2825 sbi->s_sc_info = NULL; 2817 nilfs->ns_writer = NULL;
2826 } 2818 }
2827 2819
2828 /* Force to free the list of dirty files */ 2820 /* Force to free the list of dirty files */
2829 spin_lock(&sbi->s_inode_lock); 2821 spin_lock(&nilfs->ns_inode_lock);
2830 if (!list_empty(&sbi->s_dirty_files)) { 2822 if (!list_empty(&nilfs->ns_dirty_files)) {
2831 list_splice_init(&sbi->s_dirty_files, &garbage_list); 2823 list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
2832 nilfs_warning(sbi->s_super, __func__, 2824 nilfs_warning(sb, __func__,
2833 "Non empty dirty list after the last " 2825 "Hit dirty file after stopped log writer\n");
2834 "segment construction\n"); 2826 }
2835 } 2827 spin_unlock(&nilfs->ns_inode_lock);
2836 spin_unlock(&sbi->s_inode_lock);
2837 up_write(&nilfs->ns_segctor_sem); 2828 up_write(&nilfs->ns_segctor_sem);
2838 2829
2839 nilfs_dispose_list(sbi, &garbage_list, 1); 2830 nilfs_dispose_list(nilfs, &garbage_list, 1);
2840} 2831}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index cd8056e7cbed..6c02a86745fb 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -27,7 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "nilfs.h"
31 31
32struct nilfs_root; 32struct nilfs_root;
33 33
@@ -88,7 +88,6 @@ struct nilfs_segsum_pointer {
88/** 88/**
89 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
90 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree 91 * @sc_root: root object of the current filesystem tree
93 * @sc_nblk_inc: Block count of current generation 92 * @sc_nblk_inc: Block count of current generation
94 * @sc_dirty_files: List of files to be written 93 * @sc_dirty_files: List of files to be written
@@ -131,7 +130,6 @@ struct nilfs_segsum_pointer {
131 */ 130 */
132struct nilfs_sc_info { 131struct nilfs_sc_info {
133 struct super_block *sc_super; 132 struct super_block *sc_super;
134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root; 133 struct nilfs_root *sc_root;
136 134
137 unsigned long sc_nblk_inc; 135 unsigned long sc_nblk_inc;
@@ -235,18 +233,16 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 233extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
236 void **); 234 void **);
237 235
238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 236int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
239 struct nilfs_root *root); 237void nilfs_detach_log_writer(struct super_block *sb);
240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
241 238
242/* recovery.c */ 239/* recovery.c */
243extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, 240extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
244 struct buffer_head **, int); 241 struct buffer_head **, int);
245extern int nilfs_search_super_root(struct the_nilfs *, 242extern int nilfs_search_super_root(struct the_nilfs *,
246 struct nilfs_recovery_info *); 243 struct nilfs_recovery_info *);
247extern int nilfs_salvage_orphan_logs(struct the_nilfs *, 244int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
248 struct nilfs_sb_info *, 245 struct nilfs_recovery_info *ri);
249 struct nilfs_recovery_info *);
250extern void nilfs_dispose_segment_list(struct list_head *); 246extern void nilfs_dispose_segment_list(struct list_head *);
251 247
252#endif /* _NILFS_SEGMENT_H */ 248#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0994f6a76c07..062cca065195 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -43,7 +43,6 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h> 46#include <linux/crc32.h>
48#include <linux/vfs.h> 47#include <linux/vfs.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
@@ -72,23 +71,23 @@ struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep; 71struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache; 72struct kmem_cache *nilfs_btree_path_cache;
74 73
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount); 74static int nilfs_setup_super(struct super_block *sb, int is_mount);
76static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
77 76
78static void nilfs_set_error(struct nilfs_sb_info *sbi) 77static void nilfs_set_error(struct super_block *sb)
79{ 78{
80 struct the_nilfs *nilfs = sbi->s_nilfs; 79 struct the_nilfs *nilfs = sb->s_fs_info;
81 struct nilfs_super_block **sbp; 80 struct nilfs_super_block **sbp;
82 81
83 down_write(&nilfs->ns_sem); 82 down_write(&nilfs->ns_sem);
84 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 83 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
85 nilfs->ns_mount_state |= NILFS_ERROR_FS; 84 nilfs->ns_mount_state |= NILFS_ERROR_FS;
86 sbp = nilfs_prepare_super(sbi, 0); 85 sbp = nilfs_prepare_super(sb, 0);
87 if (likely(sbp)) { 86 if (likely(sbp)) {
88 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 87 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
89 if (sbp[1]) 88 if (sbp[1])
90 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 89 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
91 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 90 nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
92 } 91 }
93 } 92 }
94 up_write(&nilfs->ns_sem); 93 up_write(&nilfs->ns_sem);
@@ -109,7 +108,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
109void nilfs_error(struct super_block *sb, const char *function, 108void nilfs_error(struct super_block *sb, const char *function,
110 const char *fmt, ...) 109 const char *fmt, ...)
111{ 110{
112 struct nilfs_sb_info *sbi = NILFS_SB(sb); 111 struct the_nilfs *nilfs = sb->s_fs_info;
113 struct va_format vaf; 112 struct va_format vaf;
114 va_list args; 113 va_list args;
115 114
@@ -124,15 +123,15 @@ void nilfs_error(struct super_block *sb, const char *function,
124 va_end(args); 123 va_end(args);
125 124
126 if (!(sb->s_flags & MS_RDONLY)) { 125 if (!(sb->s_flags & MS_RDONLY)) {
127 nilfs_set_error(sbi); 126 nilfs_set_error(sb);
128 127
129 if (nilfs_test_opt(sbi, ERRORS_RO)) { 128 if (nilfs_test_opt(nilfs, ERRORS_RO)) {
130 printk(KERN_CRIT "Remounting filesystem read-only\n"); 129 printk(KERN_CRIT "Remounting filesystem read-only\n");
131 sb->s_flags |= MS_RDONLY; 130 sb->s_flags |= MS_RDONLY;
132 } 131 }
133 } 132 }
134 133
135 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 134 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
136 panic("NILFS (device %s): panic forced after error\n", 135 panic("NILFS (device %s): panic forced after error\n",
137 sb->s_id); 136 sb->s_id);
138} 137}
@@ -189,14 +188,14 @@ void nilfs_destroy_inode(struct inode *inode)
189 call_rcu(&inode->i_rcu, nilfs_i_callback); 188 call_rcu(&inode->i_rcu, nilfs_i_callback);
190} 189}
191 190
192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 191static int nilfs_sync_super(struct super_block *sb, int flag)
193{ 192{
194 struct the_nilfs *nilfs = sbi->s_nilfs; 193 struct the_nilfs *nilfs = sb->s_fs_info;
195 int err; 194 int err;
196 195
197 retry: 196 retry:
198 set_buffer_dirty(nilfs->ns_sbh[0]); 197 set_buffer_dirty(nilfs->ns_sbh[0]);
199 if (nilfs_test_opt(sbi, BARRIER)) { 198 if (nilfs_test_opt(nilfs, BARRIER)) {
200 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 199 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
201 WRITE_SYNC | WRITE_FLUSH_FUA); 200 WRITE_SYNC | WRITE_FLUSH_FUA);
202 } else { 201 } else {
@@ -263,10 +262,10 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
263 spin_unlock(&nilfs->ns_last_segment_lock); 262 spin_unlock(&nilfs->ns_last_segment_lock);
264} 263}
265 264
266struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, 265struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
267 int flip) 266 int flip)
268{ 267{
269 struct the_nilfs *nilfs = sbi->s_nilfs; 268 struct the_nilfs *nilfs = sb->s_fs_info;
270 struct nilfs_super_block **sbp = nilfs->ns_sbp; 269 struct nilfs_super_block **sbp = nilfs->ns_sbp;
271 270
272 /* nilfs->ns_sem must be locked by the caller. */ 271 /* nilfs->ns_sem must be locked by the caller. */
@@ -276,7 +275,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
276 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); 275 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
277 } else { 276 } else {
278 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 277 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
279 sbi->s_super->s_id); 278 sb->s_id);
280 return NULL; 279 return NULL;
281 } 280 }
282 } else if (sbp[1] && 281 } else if (sbp[1] &&
@@ -290,9 +289,9 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
290 return sbp; 289 return sbp;
291} 290}
292 291
293int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) 292int nilfs_commit_super(struct super_block *sb, int flag)
294{ 293{
295 struct the_nilfs *nilfs = sbi->s_nilfs; 294 struct the_nilfs *nilfs = sb->s_fs_info;
296 struct nilfs_super_block **sbp = nilfs->ns_sbp; 295 struct nilfs_super_block **sbp = nilfs->ns_sbp;
297 time_t t; 296 time_t t;
298 297
@@ -312,27 +311,28 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
312 nilfs->ns_sbsize)); 311 nilfs->ns_sbsize));
313 } 312 }
314 clear_nilfs_sb_dirty(nilfs); 313 clear_nilfs_sb_dirty(nilfs);
315 return nilfs_sync_super(sbi, flag); 314 return nilfs_sync_super(sb, flag);
316} 315}
317 316
318/** 317/**
319 * nilfs_cleanup_super() - write filesystem state for cleanup 318 * nilfs_cleanup_super() - write filesystem state for cleanup
320 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only 319 * @sb: super block instance to be unmounted or degraded to read-only
321 * 320 *
322 * This function restores state flags in the on-disk super block. 321 * This function restores state flags in the on-disk super block.
323 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the 322 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
324 * filesystem was not clean previously. 323 * filesystem was not clean previously.
325 */ 324 */
326int nilfs_cleanup_super(struct nilfs_sb_info *sbi) 325int nilfs_cleanup_super(struct super_block *sb)
327{ 326{
327 struct the_nilfs *nilfs = sb->s_fs_info;
328 struct nilfs_super_block **sbp; 328 struct nilfs_super_block **sbp;
329 int flag = NILFS_SB_COMMIT; 329 int flag = NILFS_SB_COMMIT;
330 int ret = -EIO; 330 int ret = -EIO;
331 331
332 sbp = nilfs_prepare_super(sbi, 0); 332 sbp = nilfs_prepare_super(sb, 0);
333 if (sbp) { 333 if (sbp) {
334 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); 334 sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
335 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); 335 nilfs_set_log_cursor(sbp[0], nilfs);
336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { 336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
337 /* 337 /*
338 * make the "clean" flag also to the opposite 338 * make the "clean" flag also to the opposite
@@ -342,21 +342,20 @@ int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
342 sbp[1]->s_state = sbp[0]->s_state; 342 sbp[1]->s_state = sbp[0]->s_state;
343 flag = NILFS_SB_COMMIT_ALL; 343 flag = NILFS_SB_COMMIT_ALL;
344 } 344 }
345 ret = nilfs_commit_super(sbi, flag); 345 ret = nilfs_commit_super(sb, flag);
346 } 346 }
347 return ret; 347 return ret;
348} 348}
349 349
350static void nilfs_put_super(struct super_block *sb) 350static void nilfs_put_super(struct super_block *sb)
351{ 351{
352 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct the_nilfs *nilfs = sb->s_fs_info;
353 struct the_nilfs *nilfs = sbi->s_nilfs;
354 353
355 nilfs_detach_segment_constructor(sbi); 354 nilfs_detach_log_writer(sb);
356 355
357 if (!(sb->s_flags & MS_RDONLY)) { 356 if (!(sb->s_flags & MS_RDONLY)) {
358 down_write(&nilfs->ns_sem); 357 down_write(&nilfs->ns_sem);
359 nilfs_cleanup_super(sbi); 358 nilfs_cleanup_super(sb);
360 up_write(&nilfs->ns_sem); 359 up_write(&nilfs->ns_sem);
361 } 360 }
362 361
@@ -365,15 +364,12 @@ static void nilfs_put_super(struct super_block *sb)
365 iput(nilfs->ns_dat); 364 iput(nilfs->ns_dat);
366 365
367 destroy_nilfs(nilfs); 366 destroy_nilfs(nilfs);
368 sbi->s_super = NULL;
369 sb->s_fs_info = NULL; 367 sb->s_fs_info = NULL;
370 kfree(sbi);
371} 368}
372 369
373static int nilfs_sync_fs(struct super_block *sb, int wait) 370static int nilfs_sync_fs(struct super_block *sb, int wait)
374{ 371{
375 struct nilfs_sb_info *sbi = NILFS_SB(sb); 372 struct the_nilfs *nilfs = sb->s_fs_info;
376 struct the_nilfs *nilfs = sbi->s_nilfs;
377 struct nilfs_super_block **sbp; 373 struct nilfs_super_block **sbp;
378 int err = 0; 374 int err = 0;
379 375
@@ -383,10 +379,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383 379
384 down_write(&nilfs->ns_sem); 380 down_write(&nilfs->ns_sem);
385 if (nilfs_sb_dirty(nilfs)) { 381 if (nilfs_sb_dirty(nilfs)) {
386 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); 382 sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
387 if (likely(sbp)) { 383 if (likely(sbp)) {
388 nilfs_set_log_cursor(sbp[0], nilfs); 384 nilfs_set_log_cursor(sbp[0], nilfs);
389 nilfs_commit_super(sbi, NILFS_SB_COMMIT); 385 nilfs_commit_super(sb, NILFS_SB_COMMIT);
390 } 386 }
391 } 387 }
392 up_write(&nilfs->ns_sem); 388 up_write(&nilfs->ns_sem);
@@ -394,10 +390,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
394 return err; 390 return err;
395} 391}
396 392
397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 393int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp) 394 struct nilfs_root **rootp)
399{ 395{
400 struct the_nilfs *nilfs = sbi->s_nilfs; 396 struct the_nilfs *nilfs = sb->s_fs_info;
401 struct nilfs_root *root; 397 struct nilfs_root *root;
402 struct nilfs_checkpoint *raw_cp; 398 struct nilfs_checkpoint *raw_cp;
403 struct buffer_head *bh_cp; 399 struct buffer_head *bh_cp;
@@ -426,7 +422,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
426 goto failed; 422 goto failed;
427 } 423 }
428 424
429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size, 425 err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile); 426 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err) 427 if (err)
432 goto failed_bh; 428 goto failed_bh;
@@ -450,8 +446,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
450 446
451static int nilfs_freeze(struct super_block *sb) 447static int nilfs_freeze(struct super_block *sb)
452{ 448{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb); 449 struct the_nilfs *nilfs = sb->s_fs_info;
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err; 450 int err;
456 451
457 if (sb->s_flags & MS_RDONLY) 452 if (sb->s_flags & MS_RDONLY)
@@ -459,21 +454,20 @@ static int nilfs_freeze(struct super_block *sb)
459 454
460 /* Mark super block clean */ 455 /* Mark super block clean */
461 down_write(&nilfs->ns_sem); 456 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi); 457 err = nilfs_cleanup_super(sb);
463 up_write(&nilfs->ns_sem); 458 up_write(&nilfs->ns_sem);
464 return err; 459 return err;
465} 460}
466 461
467static int nilfs_unfreeze(struct super_block *sb) 462static int nilfs_unfreeze(struct super_block *sb)
468{ 463{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb); 464 struct the_nilfs *nilfs = sb->s_fs_info;
470 struct the_nilfs *nilfs = sbi->s_nilfs;
471 465
472 if (sb->s_flags & MS_RDONLY) 466 if (sb->s_flags & MS_RDONLY)
473 return 0; 467 return 0;
474 468
475 down_write(&nilfs->ns_sem); 469 down_write(&nilfs->ns_sem);
476 nilfs_setup_super(sbi, false); 470 nilfs_setup_super(sb, false);
477 up_write(&nilfs->ns_sem); 471 up_write(&nilfs->ns_sem);
478 return 0; 472 return 0;
479} 473}
@@ -530,22 +524,22 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
530static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 524static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
531{ 525{
532 struct super_block *sb = vfs->mnt_sb; 526 struct super_block *sb = vfs->mnt_sb;
533 struct nilfs_sb_info *sbi = NILFS_SB(sb); 527 struct the_nilfs *nilfs = sb->s_fs_info;
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; 528 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
535 529
536 if (!nilfs_test_opt(sbi, BARRIER)) 530 if (!nilfs_test_opt(nilfs, BARRIER))
537 seq_puts(seq, ",nobarrier"); 531 seq_puts(seq, ",nobarrier");
538 if (root->cno != NILFS_CPTREE_CURRENT_CNO) 532 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); 533 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
540 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 534 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
541 seq_puts(seq, ",errors=panic"); 535 seq_puts(seq, ",errors=panic");
542 if (nilfs_test_opt(sbi, ERRORS_CONT)) 536 if (nilfs_test_opt(nilfs, ERRORS_CONT))
543 seq_puts(seq, ",errors=continue"); 537 seq_puts(seq, ",errors=continue");
544 if (nilfs_test_opt(sbi, STRICT_ORDER)) 538 if (nilfs_test_opt(nilfs, STRICT_ORDER))
545 seq_puts(seq, ",order=strict"); 539 seq_puts(seq, ",order=strict");
546 if (nilfs_test_opt(sbi, NORECOVERY)) 540 if (nilfs_test_opt(nilfs, NORECOVERY))
547 seq_puts(seq, ",norecovery"); 541 seq_puts(seq, ",norecovery");
548 if (nilfs_test_opt(sbi, DISCARD)) 542 if (nilfs_test_opt(nilfs, DISCARD))
549 seq_puts(seq, ",discard"); 543 seq_puts(seq, ",discard");
550 544
551 return 0; 545 return 0;
@@ -594,7 +588,7 @@ static match_table_t tokens = {
594 588
595static int parse_options(char *options, struct super_block *sb, int is_remount) 589static int parse_options(char *options, struct super_block *sb, int is_remount)
596{ 590{
597 struct nilfs_sb_info *sbi = NILFS_SB(sb); 591 struct the_nilfs *nilfs = sb->s_fs_info;
598 char *p; 592 char *p;
599 substring_t args[MAX_OPT_ARGS]; 593 substring_t args[MAX_OPT_ARGS];
600 594
@@ -609,29 +603,29 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
609 token = match_token(p, tokens, args); 603 token = match_token(p, tokens, args);
610 switch (token) { 604 switch (token) {
611 case Opt_barrier: 605 case Opt_barrier:
612 nilfs_set_opt(sbi, BARRIER); 606 nilfs_set_opt(nilfs, BARRIER);
613 break; 607 break;
614 case Opt_nobarrier: 608 case Opt_nobarrier:
615 nilfs_clear_opt(sbi, BARRIER); 609 nilfs_clear_opt(nilfs, BARRIER);
616 break; 610 break;
617 case Opt_order: 611 case Opt_order:
618 if (strcmp(args[0].from, "relaxed") == 0) 612 if (strcmp(args[0].from, "relaxed") == 0)
619 /* Ordered data semantics */ 613 /* Ordered data semantics */
620 nilfs_clear_opt(sbi, STRICT_ORDER); 614 nilfs_clear_opt(nilfs, STRICT_ORDER);
621 else if (strcmp(args[0].from, "strict") == 0) 615 else if (strcmp(args[0].from, "strict") == 0)
622 /* Strict in-order semantics */ 616 /* Strict in-order semantics */
623 nilfs_set_opt(sbi, STRICT_ORDER); 617 nilfs_set_opt(nilfs, STRICT_ORDER);
624 else 618 else
625 return 0; 619 return 0;
626 break; 620 break;
627 case Opt_err_panic: 621 case Opt_err_panic:
628 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); 622 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
629 break; 623 break;
630 case Opt_err_ro: 624 case Opt_err_ro:
631 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); 625 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
632 break; 626 break;
633 case Opt_err_cont: 627 case Opt_err_cont:
634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 628 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
635 break; 629 break;
636 case Opt_snapshot: 630 case Opt_snapshot:
637 if (is_remount) { 631 if (is_remount) {
@@ -642,13 +636,13 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
642 } 636 }
643 break; 637 break;
644 case Opt_norecovery: 638 case Opt_norecovery:
645 nilfs_set_opt(sbi, NORECOVERY); 639 nilfs_set_opt(nilfs, NORECOVERY);
646 break; 640 break;
647 case Opt_discard: 641 case Opt_discard:
648 nilfs_set_opt(sbi, DISCARD); 642 nilfs_set_opt(nilfs, DISCARD);
649 break; 643 break;
650 case Opt_nodiscard: 644 case Opt_nodiscard:
651 nilfs_clear_opt(sbi, DISCARD); 645 nilfs_clear_opt(nilfs, DISCARD);
652 break; 646 break;
653 default: 647 default:
654 printk(KERN_ERR 648 printk(KERN_ERR
@@ -660,22 +654,24 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
660} 654}
661 655
662static inline void 656static inline void
663nilfs_set_default_options(struct nilfs_sb_info *sbi, 657nilfs_set_default_options(struct super_block *sb,
664 struct nilfs_super_block *sbp) 658 struct nilfs_super_block *sbp)
665{ 659{
666 sbi->s_mount_opt = 660 struct the_nilfs *nilfs = sb->s_fs_info;
661
662 nilfs->ns_mount_opt =
667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 663 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
668} 664}
669 665
670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount) 666static int nilfs_setup_super(struct super_block *sb, int is_mount)
671{ 667{
672 struct the_nilfs *nilfs = sbi->s_nilfs; 668 struct the_nilfs *nilfs = sb->s_fs_info;
673 struct nilfs_super_block **sbp; 669 struct nilfs_super_block **sbp;
674 int max_mnt_count; 670 int max_mnt_count;
675 int mnt_count; 671 int mnt_count;
676 672
677 /* nilfs->ns_sem must be locked by the caller. */ 673 /* nilfs->ns_sem must be locked by the caller. */
678 sbp = nilfs_prepare_super(sbi, 0); 674 sbp = nilfs_prepare_super(sb, 0);
679 if (!sbp) 675 if (!sbp)
680 return -EIO; 676 return -EIO;
681 677
@@ -704,8 +700,9 @@ skip_mount_setup:
704 sbp[0]->s_state = 700 sbp[0]->s_state =
705 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); 701 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
706 /* synchronize sbp[1] with sbp[0] */ 702 /* synchronize sbp[1] with sbp[0] */
707 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 703 if (sbp[1])
708 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 704 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
705 return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
709} 706}
710 707
711struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -726,7 +723,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
726 struct nilfs_super_block *sbp, 723 struct nilfs_super_block *sbp,
727 char *data) 724 char *data)
728{ 725{
729 struct nilfs_sb_info *sbi = NILFS_SB(sb); 726 struct the_nilfs *nilfs = sb->s_fs_info;
730 727
731 sb->s_magic = le16_to_cpu(sbp->s_magic); 728 sb->s_magic = le16_to_cpu(sbp->s_magic);
732 729
@@ -735,12 +732,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
735 sb->s_flags |= MS_NOATIME; 732 sb->s_flags |= MS_NOATIME;
736#endif 733#endif
737 734
738 nilfs_set_default_options(sbi, sbp); 735 nilfs_set_default_options(sb, sbp);
739 736
740 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); 737 nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
741 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); 738 nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
742 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 739 nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
743 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 740 nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
744 741
745 return !parse_options(data, sb, 0) ? -EINVAL : 0 ; 742 return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
746} 743}
@@ -821,7 +818,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
821static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, 818static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
822 struct dentry **root_dentry) 819 struct dentry **root_dentry)
823{ 820{
824 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs; 821 struct the_nilfs *nilfs = s->s_fs_info;
825 struct nilfs_root *root; 822 struct nilfs_root *root;
826 int ret; 823 int ret;
827 824
@@ -839,7 +836,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
839 goto out; 836 goto out;
840 } 837 }
841 838
842 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root); 839 ret = nilfs_attach_checkpoint(s, cno, false, &root);
843 if (ret) { 840 if (ret) {
844 printk(KERN_ERR "NILFS: error loading snapshot " 841 printk(KERN_ERR "NILFS: error loading snapshot "
845 "(checkpoint number=%llu).\n", 842 "(checkpoint number=%llu).\n",
@@ -873,7 +870,7 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
873 870
874int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) 871int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
875{ 872{
876 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 873 struct the_nilfs *nilfs = sb->s_fs_info;
877 struct nilfs_root *root; 874 struct nilfs_root *root;
878 struct inode *inode; 875 struct inode *inode;
879 struct dentry *dentry; 876 struct dentry *dentry;
@@ -886,7 +883,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
886 return true; /* protect recent checkpoints */ 883 return true; /* protect recent checkpoints */
887 884
888 ret = false; 885 ret = false;
889 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 886 root = nilfs_lookup_root(nilfs, cno);
890 if (root) { 887 if (root) {
891 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); 888 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
892 if (inode) { 889 if (inode) {
@@ -916,43 +913,21 @@ static int
916nilfs_fill_super(struct super_block *sb, void *data, int silent) 913nilfs_fill_super(struct super_block *sb, void *data, int silent)
917{ 914{
918 struct the_nilfs *nilfs; 915 struct the_nilfs *nilfs;
919 struct nilfs_sb_info *sbi;
920 struct nilfs_root *fsroot; 916 struct nilfs_root *fsroot;
921 struct backing_dev_info *bdi; 917 struct backing_dev_info *bdi;
922 __u64 cno; 918 __u64 cno;
923 int err; 919 int err;
924 920
925 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 921 nilfs = alloc_nilfs(sb->s_bdev);
926 if (!sbi) 922 if (!nilfs)
927 return -ENOMEM; 923 return -ENOMEM;
928 924
929 sb->s_fs_info = sbi; 925 sb->s_fs_info = nilfs;
930 sbi->s_super = sb;
931
932 nilfs = alloc_nilfs(sb->s_bdev);
933 if (!nilfs) {
934 err = -ENOMEM;
935 goto failed_sbi;
936 }
937 sbi->s_nilfs = nilfs;
938 926
939 err = init_nilfs(nilfs, sbi, (char *)data); 927 err = init_nilfs(nilfs, sb, (char *)data);
940 if (err) 928 if (err)
941 goto failed_nilfs; 929 goto failed_nilfs;
942 930
943 spin_lock_init(&sbi->s_inode_lock);
944 INIT_LIST_HEAD(&sbi->s_dirty_files);
945
946 /*
947 * Following initialization is overlapped because
948 * nilfs_sb_info structure has been cleared at the beginning.
949 * But we reserve them to keep our interest and make ready
950 * for the future change.
951 */
952 get_random_bytes(&sbi->s_next_generation,
953 sizeof(sbi->s_next_generation));
954 spin_lock_init(&sbi->s_next_gen_lock);
955
956 sb->s_op = &nilfs_sops; 931 sb->s_op = &nilfs_sops;
957 sb->s_export_op = &nilfs_export_ops; 932 sb->s_export_op = &nilfs_export_ops;
958 sb->s_root = NULL; 933 sb->s_root = NULL;
@@ -961,12 +936,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
961 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 936 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
962 sb->s_bdi = bdi ? : &default_backing_dev_info; 937 sb->s_bdi = bdi ? : &default_backing_dev_info;
963 938
964 err = load_nilfs(nilfs, sbi); 939 err = load_nilfs(nilfs, sb);
965 if (err) 940 if (err)
966 goto failed_nilfs; 941 goto failed_nilfs;
967 942
968 cno = nilfs_last_cno(nilfs); 943 cno = nilfs_last_cno(nilfs);
969 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot); 944 err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
970 if (err) { 945 if (err) {
971 printk(KERN_ERR "NILFS: error loading last checkpoint " 946 printk(KERN_ERR "NILFS: error loading last checkpoint "
972 "(checkpoint number=%llu).\n", (unsigned long long)cno); 947 "(checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -974,7 +949,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
974 } 949 }
975 950
976 if (!(sb->s_flags & MS_RDONLY)) { 951 if (!(sb->s_flags & MS_RDONLY)) {
977 err = nilfs_attach_segment_constructor(sbi, fsroot); 952 err = nilfs_attach_log_writer(sb, fsroot);
978 if (err) 953 if (err)
979 goto failed_checkpoint; 954 goto failed_checkpoint;
980 } 955 }
@@ -987,14 +962,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
987 962
988 if (!(sb->s_flags & MS_RDONLY)) { 963 if (!(sb->s_flags & MS_RDONLY)) {
989 down_write(&nilfs->ns_sem); 964 down_write(&nilfs->ns_sem);
990 nilfs_setup_super(sbi, true); 965 nilfs_setup_super(sb, true);
991 up_write(&nilfs->ns_sem); 966 up_write(&nilfs->ns_sem);
992 } 967 }
993 968
994 return 0; 969 return 0;
995 970
996 failed_segctor: 971 failed_segctor:
997 nilfs_detach_segment_constructor(sbi); 972 nilfs_detach_log_writer(sb);
998 973
999 failed_checkpoint: 974 failed_checkpoint:
1000 nilfs_put_root(fsroot); 975 nilfs_put_root(fsroot);
@@ -1006,23 +981,18 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1006 981
1007 failed_nilfs: 982 failed_nilfs:
1008 destroy_nilfs(nilfs); 983 destroy_nilfs(nilfs);
1009
1010 failed_sbi:
1011 sb->s_fs_info = NULL;
1012 kfree(sbi);
1013 return err; 984 return err;
1014} 985}
1015 986
1016static int nilfs_remount(struct super_block *sb, int *flags, char *data) 987static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1017{ 988{
1018 struct nilfs_sb_info *sbi = NILFS_SB(sb); 989 struct the_nilfs *nilfs = sb->s_fs_info;
1019 struct the_nilfs *nilfs = sbi->s_nilfs;
1020 unsigned long old_sb_flags; 990 unsigned long old_sb_flags;
1021 unsigned long old_mount_opt; 991 unsigned long old_mount_opt;
1022 int err; 992 int err;
1023 993
1024 old_sb_flags = sb->s_flags; 994 old_sb_flags = sb->s_flags;
1025 old_mount_opt = sbi->s_mount_opt; 995 old_mount_opt = nilfs->ns_mount_opt;
1026 996
1027 if (!parse_options(data, sb, 1)) { 997 if (!parse_options(data, sb, 1)) {
1028 err = -EINVAL; 998 err = -EINVAL;
@@ -1042,8 +1012,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1042 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1012 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1043 goto out; 1013 goto out;
1044 if (*flags & MS_RDONLY) { 1014 if (*flags & MS_RDONLY) {
1045 /* Shutting down the segment constructor */ 1015 /* Shutting down log writer */
1046 nilfs_detach_segment_constructor(sbi); 1016 nilfs_detach_log_writer(sb);
1047 sb->s_flags |= MS_RDONLY; 1017 sb->s_flags |= MS_RDONLY;
1048 1018
1049 /* 1019 /*
@@ -1051,7 +1021,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1051 * the RDONLY flag and then mark the partition as valid again. 1021 * the RDONLY flag and then mark the partition as valid again.
1052 */ 1022 */
1053 down_write(&nilfs->ns_sem); 1023 down_write(&nilfs->ns_sem);
1054 nilfs_cleanup_super(sbi); 1024 nilfs_cleanup_super(sb);
1055 up_write(&nilfs->ns_sem); 1025 up_write(&nilfs->ns_sem);
1056 } else { 1026 } else {
1057 __u64 features; 1027 __u64 features;
@@ -1078,12 +1048,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1078 sb->s_flags &= ~MS_RDONLY; 1048 sb->s_flags &= ~MS_RDONLY;
1079 1049
1080 root = NILFS_I(sb->s_root->d_inode)->i_root; 1050 root = NILFS_I(sb->s_root->d_inode)->i_root;
1081 err = nilfs_attach_segment_constructor(sbi, root); 1051 err = nilfs_attach_log_writer(sb, root);
1082 if (err) 1052 if (err)
1083 goto restore_opts; 1053 goto restore_opts;
1084 1054
1085 down_write(&nilfs->ns_sem); 1055 down_write(&nilfs->ns_sem);
1086 nilfs_setup_super(sbi, true); 1056 nilfs_setup_super(sb, true);
1087 up_write(&nilfs->ns_sem); 1057 up_write(&nilfs->ns_sem);
1088 } 1058 }
1089 out: 1059 out:
@@ -1091,13 +1061,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1091 1061
1092 restore_opts: 1062 restore_opts:
1093 sb->s_flags = old_sb_flags; 1063 sb->s_flags = old_sb_flags;
1094 sbi->s_mount_opt = old_mount_opt; 1064 nilfs->ns_mount_opt = old_mount_opt;
1095 return err; 1065 return err;
1096} 1066}
1097 1067
1098struct nilfs_super_data { 1068struct nilfs_super_data {
1099 struct block_device *bdev; 1069 struct block_device *bdev;
1100 struct nilfs_sb_info *sbi;
1101 __u64 cno; 1070 __u64 cno;
1102 int flags; 1071 int flags;
1103}; 1072};
@@ -1278,7 +1247,7 @@ static void nilfs_inode_init_once(void *obj)
1278#ifdef CONFIG_NILFS_XATTR 1247#ifdef CONFIG_NILFS_XATTR
1279 init_rwsem(&ii->xattr_sem); 1248 init_rwsem(&ii->xattr_sem);
1280#endif 1249#endif
1281 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1250 address_space_init_once(&ii->i_btnode_cache);
1282 ii->i_bmap = &ii->i_bmap_data; 1251 ii->i_bmap = &ii->i_bmap_data;
1283 inode_init_once(&ii->vfs_inode); 1252 inode_init_once(&ii->vfs_inode);
1284} 1253}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad4ac607cf57..d2acd1a651f3 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/random.h>
28#include <linux/crc32.h> 29#include <linux/crc32.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
@@ -75,7 +76,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
75 nilfs->ns_bdev = bdev; 76 nilfs->ns_bdev = bdev;
76 atomic_set(&nilfs->ns_ndirtyblks, 0); 77 atomic_set(&nilfs->ns_ndirtyblks, 0);
77 init_rwsem(&nilfs->ns_sem); 78 init_rwsem(&nilfs->ns_sem);
79 INIT_LIST_HEAD(&nilfs->ns_dirty_files);
78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes); 80 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
81 spin_lock_init(&nilfs->ns_inode_lock);
82 spin_lock_init(&nilfs->ns_next_gen_lock);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 83 spin_lock_init(&nilfs->ns_last_segment_lock);
80 nilfs->ns_cptree = RB_ROOT; 84 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock); 85 spin_lock_init(&nilfs->ns_cptree_lock);
@@ -197,16 +201,16 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
197/** 201/**
198 * load_nilfs - load and recover the nilfs 202 * load_nilfs - load and recover the nilfs
199 * @nilfs: the_nilfs structure to be released 203 * @nilfs: the_nilfs structure to be released
200 * @sbi: nilfs_sb_info used to recover past segment 204 * @sb: super block isntance used to recover past segment
201 * 205 *
202 * load_nilfs() searches and load the latest super root, 206 * load_nilfs() searches and load the latest super root,
203 * attaches the last segment, and does recovery if needed. 207 * attaches the last segment, and does recovery if needed.
204 * The caller must call this exclusively for simultaneous mounts. 208 * The caller must call this exclusively for simultaneous mounts.
205 */ 209 */
206int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 210int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
207{ 211{
208 struct nilfs_recovery_info ri; 212 struct nilfs_recovery_info ri;
209 unsigned int s_flags = sbi->s_super->s_flags; 213 unsigned int s_flags = sb->s_flags;
210 int really_read_only = bdev_read_only(nilfs->ns_bdev); 214 int really_read_only = bdev_read_only(nilfs->ns_bdev);
211 int valid_fs = nilfs_valid_fs(nilfs); 215 int valid_fs = nilfs_valid_fs(nilfs);
212 int err; 216 int err;
@@ -271,7 +275,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
271 goto scan_error; 275 goto scan_error;
272 } 276 }
273 277
274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root); 278 err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
275 if (unlikely(err)) { 279 if (unlikely(err)) {
276 printk(KERN_ERR "NILFS: error loading super root.\n"); 280 printk(KERN_ERR "NILFS: error loading super root.\n");
277 goto failed; 281 goto failed;
@@ -283,7 +287,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
283 if (s_flags & MS_RDONLY) { 287 if (s_flags & MS_RDONLY) {
284 __u64 features; 288 __u64 features;
285 289
286 if (nilfs_test_opt(sbi, NORECOVERY)) { 290 if (nilfs_test_opt(nilfs, NORECOVERY)) {
287 printk(KERN_INFO "NILFS: norecovery option specified. " 291 printk(KERN_INFO "NILFS: norecovery option specified. "
288 "skipping roll-forward recovery\n"); 292 "skipping roll-forward recovery\n");
289 goto skip_recovery; 293 goto skip_recovery;
@@ -304,21 +308,21 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
304 err = -EROFS; 308 err = -EROFS;
305 goto failed_unload; 309 goto failed_unload;
306 } 310 }
307 sbi->s_super->s_flags &= ~MS_RDONLY; 311 sb->s_flags &= ~MS_RDONLY;
308 } else if (nilfs_test_opt(sbi, NORECOVERY)) { 312 } else if (nilfs_test_opt(nilfs, NORECOVERY)) {
309 printk(KERN_ERR "NILFS: recovery cancelled because norecovery " 313 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
310 "option was specified for a read/write mount\n"); 314 "option was specified for a read/write mount\n");
311 err = -EINVAL; 315 err = -EINVAL;
312 goto failed_unload; 316 goto failed_unload;
313 } 317 }
314 318
315 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); 319 err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
316 if (err) 320 if (err)
317 goto failed_unload; 321 goto failed_unload;
318 322
319 down_write(&nilfs->ns_sem); 323 down_write(&nilfs->ns_sem);
320 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ 324 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
321 err = nilfs_cleanup_super(sbi); 325 err = nilfs_cleanup_super(sb);
322 up_write(&nilfs->ns_sem); 326 up_write(&nilfs->ns_sem);
323 327
324 if (err) { 328 if (err) {
@@ -330,7 +334,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
330 334
331 skip_recovery: 335 skip_recovery:
332 nilfs_clear_recovery_info(&ri); 336 nilfs_clear_recovery_info(&ri);
333 sbi->s_super->s_flags = s_flags; 337 sb->s_flags = s_flags;
334 return 0; 338 return 0;
335 339
336 scan_error: 340 scan_error:
@@ -344,7 +348,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
344 348
345 failed: 349 failed:
346 nilfs_clear_recovery_info(&ri); 350 nilfs_clear_recovery_info(&ri);
347 sbi->s_super->s_flags = s_flags; 351 sb->s_flags = s_flags;
348 return err; 352 return err;
349} 353}
350 354
@@ -475,10 +479,13 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
475 return -EIO; 479 return -EIO;
476 } 480 }
477 printk(KERN_WARNING 481 printk(KERN_WARNING
478 "NILFS warning: unable to read primary superblock\n"); 482 "NILFS warning: unable to read primary superblock "
479 } else if (!sbp[1]) 483 "(blocksize = %d)\n", blocksize);
484 } else if (!sbp[1]) {
480 printk(KERN_WARNING 485 printk(KERN_WARNING
481 "NILFS warning: unable to read secondary superblock\n"); 486 "NILFS warning: unable to read secondary superblock "
487 "(blocksize = %d)\n", blocksize);
488 }
482 489
483 /* 490 /*
484 * Compare two super blocks and set 1 in swp if the secondary 491 * Compare two super blocks and set 1 in swp if the secondary
@@ -505,7 +512,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
505 512
506 if (!valid[!swp]) 513 if (!valid[!swp])
507 printk(KERN_WARNING "NILFS warning: broken superblock. " 514 printk(KERN_WARNING "NILFS warning: broken superblock. "
508 "using spare superblock.\n"); 515 "using spare superblock (blocksize = %d).\n", blocksize);
509 if (swp) 516 if (swp)
510 nilfs_swap_super_block(nilfs); 517 nilfs_swap_super_block(nilfs);
511 518
@@ -519,7 +526,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
519/** 526/**
520 * init_nilfs - initialize a NILFS instance. 527 * init_nilfs - initialize a NILFS instance.
521 * @nilfs: the_nilfs structure 528 * @nilfs: the_nilfs structure
522 * @sbi: nilfs_sb_info
523 * @sb: super block 529 * @sb: super block
524 * @data: mount options 530 * @data: mount options
525 * 531 *
@@ -530,9 +536,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
530 * Return Value: On success, 0 is returned. On error, a negative error 536 * Return Value: On success, 0 is returned. On error, a negative error
531 * code is returned. 537 * code is returned.
532 */ 538 */
533int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) 539int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
534{ 540{
535 struct super_block *sb = sbi->s_super;
536 struct nilfs_super_block *sbp; 541 struct nilfs_super_block *sbp;
537 int blocksize; 542 int blocksize;
538 int err; 543 int err;
@@ -588,6 +593,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
588 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 593 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
589 nilfs->ns_blocksize = blocksize; 594 nilfs->ns_blocksize = blocksize;
590 595
596 get_random_bytes(&nilfs->ns_next_generation,
597 sizeof(nilfs->ns_next_generation));
598
591 err = nilfs_store_disk_layout(nilfs, sbp); 599 err = nilfs_store_disk_layout(nilfs, sbp);
592 if (err) 600 if (err)
593 goto failed_sbh; 601 goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd85e4c05c6b..f4968145c2a3 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -31,7 +31,8 @@
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include "sb.h" 34
35struct nilfs_sc_info;
35 36
36/* the_nilfs struct */ 37/* the_nilfs struct */
37enum { 38enum {
@@ -65,13 +66,23 @@ enum {
65 * @ns_last_cno: checkpoint number of the latest segment 66 * @ns_last_cno: checkpoint number of the latest segment
66 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 67 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
67 * @ns_prev_seq: base sequence number used to decide if advance log cursor 68 * @ns_prev_seq: base sequence number used to decide if advance log cursor
68 * @ns_segctor_sem: segment constructor semaphore 69 * @ns_writer: log writer
70 * @ns_segctor_sem: semaphore protecting log write
69 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
70 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
71 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree 75 * @ns_cptree_lock: lock protecting @ns_cptree
76 * @ns_dirty_files: list of dirty files
77 * @ns_inode_lock: lock protecting @ns_dirty_files
74 * @ns_gc_inodes: dummy inodes to keep live blocks 78 * @ns_gc_inodes: dummy inodes to keep live blocks
79 * @ns_next_generation: next generation number for inodes
80 * @ns_next_gen_lock: lock protecting @ns_next_generation
81 * @ns_mount_opt: mount options
82 * @ns_resuid: uid for reserved blocks
83 * @ns_resgid: gid for reserved blocks
84 * @ns_interval: checkpoint creation interval
85 * @ns_watermark: watermark for the number of dirty buffers
75 * @ns_blocksize_bits: bit length of block size 86 * @ns_blocksize_bits: bit length of block size
76 * @ns_blocksize: block size 87 * @ns_blocksize: block size
77 * @ns_nsegments: number of segments in filesystem 88 * @ns_nsegments: number of segments in filesystem
@@ -131,6 +142,7 @@ struct the_nilfs {
131 u64 ns_prot_seq; 142 u64 ns_prot_seq;
132 u64 ns_prev_seq; 143 u64 ns_prev_seq;
133 144
145 struct nilfs_sc_info *ns_writer;
134 struct rw_semaphore ns_segctor_sem; 146 struct rw_semaphore ns_segctor_sem;
135 147
136 /* 148 /*
@@ -145,9 +157,25 @@ struct the_nilfs {
145 struct rb_root ns_cptree; 157 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock; 158 spinlock_t ns_cptree_lock;
147 159
160 /* Dirty inode list */
161 struct list_head ns_dirty_files;
162 spinlock_t ns_inode_lock;
163
148 /* GC inode list */ 164 /* GC inode list */
149 struct list_head ns_gc_inodes; 165 struct list_head ns_gc_inodes;
150 166
167 /* Inode allocator */
168 u32 ns_next_generation;
169 spinlock_t ns_next_gen_lock;
170
171 /* Mount options */
172 unsigned long ns_mount_opt;
173
174 uid_t ns_resuid;
175 gid_t ns_resgid;
176 unsigned long ns_interval;
177 unsigned long ns_watermark;
178
151 /* Disk layout information (static) */ 179 /* Disk layout information (static) */
152 unsigned int ns_blocksize_bits; 180 unsigned int ns_blocksize_bits;
153 unsigned int ns_blocksize; 181 unsigned int ns_blocksize;
@@ -180,6 +208,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
180THE_NILFS_FNS(GC_RUNNING, gc_running) 208THE_NILFS_FNS(GC_RUNNING, gc_running)
181THE_NILFS_FNS(SB_DIRTY, sb_dirty) 209THE_NILFS_FNS(SB_DIRTY, sb_dirty)
182 210
211/*
212 * Mount option operations
213 */
214#define nilfs_clear_opt(nilfs, opt) \
215 do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
216#define nilfs_set_opt(nilfs, opt) \
217 do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
218#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
219#define nilfs_write_opt(nilfs, mask, opt) \
220 do { (nilfs)->ns_mount_opt = \
221 (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
222 NILFS_MOUNT_##opt); \
223 } while (0)
224
183/** 225/**
184 * struct nilfs_root - nilfs root object 226 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number 227 * @cno: checkpoint number
@@ -224,15 +266,14 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 266void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
225struct the_nilfs *alloc_nilfs(struct block_device *bdev); 267struct the_nilfs *alloc_nilfs(struct block_device *bdev);
226void destroy_nilfs(struct the_nilfs *nilfs); 268void destroy_nilfs(struct the_nilfs *nilfs);
227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 271int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 272int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); 273struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, 274struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno); 275 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root); 276void nilfs_put_root(struct nilfs_root *root);
235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
236int nilfs_near_disk_full(struct the_nilfs *); 277int nilfs_near_disk_full(struct the_nilfs *);
237void nilfs_fall_back_super_block(struct the_nilfs *); 278void nilfs_fall_back_super_block(struct the_nilfs *);
238void nilfs_swap_super_block(struct the_nilfs *); 279void nilfs_swap_super_block(struct the_nilfs *);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc5..6b1305dc26c0 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
876#endif 876#endif
877 877
878/* 878/*
879 * fanotify_user_setup - Our initialization function. Note that we cannnot return 879 * fanotify_user_setup - Our initialization function. Note that we cannot return
880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
881 * must result in panic(). 881 * must result in panic().
882 */ 882 */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/writeback.h> /* for inode_lock */
26 25
27#include <asm/atomic.h> 26#include <asm/atomic.h>
28 27
29#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 29#include "fsnotify.h"
31 30
31#include "../internal.h"
32
32/* 33/*
33 * Recalculate the mask of events relevant to a given inode locked. 34 * Recalculate the mask of events relevant to a given inode locked.
34 */ 35 */
@@ -237,15 +238,14 @@ out:
237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 238 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
238 * @list: list of inodes being unmounted (sb->s_inodes) 239 * @list: list of inodes being unmounted (sb->s_inodes)
239 * 240 *
240 * Called with inode_lock held, protecting the unmounting super block's list 241 * Called during unmount with no locks held, so needs to be safe against
241 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 242 * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
242 * We temporarily drop inode_lock, however, and CAN block.
243 */ 243 */
244void fsnotify_unmount_inodes(struct list_head *list) 244void fsnotify_unmount_inodes(struct list_head *list)
245{ 245{
246 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
247 247
248 spin_lock(&inode_lock); 248 spin_lock(&inode_sb_list_lock);
249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
250 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
251 251
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
254 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
256 */ 256 */
257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 spin_lock(&inode->i_lock);
258 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
259 spin_unlock(&inode->i_lock);
258 continue; 260 continue;
261 }
259 262
260 /* 263 /*
261 * If i_count is zero, the inode cannot have any watches and 264 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
263 * evict all inodes with zero i_count from icache which is 266 * evict all inodes with zero i_count from icache which is
264 * unnecessarily violent and may in fact be illegal to do. 267 * unnecessarily violent and may in fact be illegal to do.
265 */ 268 */
266 if (!atomic_read(&inode->i_count)) 269 if (!atomic_read(&inode->i_count)) {
270 spin_unlock(&inode->i_lock);
267 continue; 271 continue;
272 }
268 273
269 need_iput_tmp = need_iput; 274 need_iput_tmp = need_iput;
270 need_iput = NULL; 275 need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
274 __iget(inode); 279 __iget(inode);
275 else 280 else
276 need_iput_tmp = NULL; 281 need_iput_tmp = NULL;
282 spin_unlock(&inode->i_lock);
277 283
278 /* In case the dropping of a reference would nuke next_i. */ 284 /* In case the dropping of a reference would nuke next_i. */
279 if ((&next_i->i_sb_list != list) && 285 if ((&next_i->i_sb_list != list) &&
280 atomic_read(&next_i->i_count) && 286 atomic_read(&next_i->i_count)) {
281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 spin_lock(&next_i->i_lock);
282 __iget(next_i); 288 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
283 need_iput = next_i; 289 __iget(next_i);
290 need_iput = next_i;
291 }
292 spin_unlock(&next_i->i_lock);
284 } 293 }
285 294
286 /* 295 /*
287 * We can safely drop inode_lock here because we hold 296 * We can safely drop inode_sb_list_lock here because we hold
288 * references on both inode and next_i. Also no new inodes 297 * references on both inode and next_i. Also no new inodes
289 * will be added since the umount has begun. Finally, 298 * will be added since the umount has begun.
290 * iprune_mutex keeps shrink_icache_memory() away.
291 */ 299 */
292 spin_unlock(&inode_lock); 300 spin_unlock(&inode_sb_list_lock);
293 301
294 if (need_iput_tmp) 302 if (need_iput_tmp)
295 iput(need_iput_tmp); 303 iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
301 309
302 iput(inode); 310 iput(inode);
303 311
304 spin_lock(&inode_lock); 312 spin_lock(&inode_sb_list_lock);
305 } 313 }
306 spin_unlock(&inode_lock); 314 spin_unlock(&inode_sb_list_lock);
307} 315}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9f..bd46e7c8a0ef 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -841,7 +841,7 @@ out:
841} 841}
842 842
843/* 843/*
844 * inotify_user_setup - Our initialization function. Note that we cannnot return 844 * inotify_user_setup - Our initialization function. Note that we cannot return
845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
846 * must result in panic(). 846 * must result in panic().
847 */ 847 */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..50c00856f730 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95 94
96#include <asm/atomic.h> 95#include <asm/atomic.h>
97 96
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
23#include <linux/mount.h> 23#include <linux/mount.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27 26
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29 28
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 4ff028fcfd6e..30206b238433 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
2 2
3obj-$(CONFIG_NTFS_FS) += ntfs.o 3obj-$(CONFIG_NTFS_FS) += ntfs.o
4 4
5ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ 5ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ccflags-y := -DNTFS_VERSION=\"2.1.30\"
12EXTRA_CFLAGS += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13endif 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
15ifeq ($(CONFIG_NTFS_RW),y)
16EXTRA_CFLAGS += -DNTFS_RW
17
18ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
19endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
1543 */ 1543 */
1544const struct address_space_operations ntfs_aops = { 1544const struct address_space_operations ntfs_aops = {
1545 .readpage = ntfs_readpage, /* Fill page with data. */ 1545 .readpage = ntfs_readpage, /* Fill page with data. */
1546 .sync_page = block_sync_page, /* Currently, just unplugs the
1547 disk request queue. */
1548#ifdef NTFS_RW 1546#ifdef NTFS_RW
1549 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1547 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1550#endif /* NTFS_RW */ 1548#endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
1560 */ 1558 */
1561const struct address_space_operations ntfs_mst_aops = { 1559const struct address_space_operations ntfs_mst_aops = {
1562 .readpage = ntfs_readpage, /* Fill page with data. */ 1560 .readpage = ntfs_readpage, /* Fill page with data. */
1563 .sync_page = block_sync_page, /* Currently, just unplugs the
1564 disk request queue. */
1565#ifdef NTFS_RW 1561#ifdef NTFS_RW
1566 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1562 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1567 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty 1563 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ef9ed854255c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -698,8 +698,7 @@ lock_retry_remap:
698 "uptodate! Unplugging the disk queue " 698 "uptodate! Unplugging the disk queue "
699 "and rescheduling."); 699 "and rescheduling.");
700 get_bh(tbh); 700 get_bh(tbh);
701 blk_run_address_space(mapping); 701 io_schedule();
702 schedule();
703 put_bh(tbh); 702 put_bh(tbh);
704 if (unlikely(!buffer_uptodate(tbh))) 703 if (unlikely(!buffer_uptodate(tbh)))
705 goto read_err; 704 goto read_err;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..0b56c6b7ec01 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
54 * 54 *
55 * Return 1 if the attributes match and 0 if not. 55 * Return 1 if the attributes match and 0 if not.
56 * 56 *
57 * NOTE: This function runs with the inode_lock spin lock held so it is not 57 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
58 * allowed to sleep. 58 * allowed to sleep.
59 */ 59 */
60int ntfs_test_inode(struct inode *vi, ntfs_attr *na) 60int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
98 * 98 *
99 * Return 0 on success and -errno on error. 99 * Return 0 on success and -errno on error.
100 * 100 *
101 * NOTE: This function runs with the inode_lock spin lock held so it is not 101 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
103 */ 103 */
104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) 104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b6727181..326e7475a22a 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
2576 flush_dcache_page(page); 2576 flush_dcache_page(page);
2577 SetPageUptodate(page); 2577 SetPageUptodate(page);
2578 if (base_ni) { 2578 if (base_ni) {
2579 MFT_RECORD *m_tmp;
2580
2579 /* 2581 /*
2580 * Setup the base mft record in the extent mft record. This 2582 * Setup the base mft record in the extent mft record. This
2581 * completes initialization of the allocated extent mft record 2583 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
2588 * attach it to the base inode @base_ni and map, pin, and lock 2590 * attach it to the base inode @base_ni and map, pin, and lock
2589 * its, i.e. the allocated, mft record. 2591 * its, i.e. the allocated, mft record.
2590 */ 2592 */
2591 m = map_extent_mft_record(base_ni, bit, &ni); 2593 m_tmp = map_extent_mft_record(base_ni, bit, &ni);
2592 if (IS_ERR(m)) { 2594 if (IS_ERR(m_tmp)) {
2593 ntfs_error(vol->sb, "Failed to map allocated extent " 2595 ntfs_error(vol->sb, "Failed to map allocated extent "
2594 "mft record 0x%llx.", (long long)bit); 2596 "mft record 0x%llx.", (long long)bit);
2595 err = PTR_ERR(m); 2597 err = PTR_ERR(m_tmp);
2596 /* Set the mft record itself not in use. */ 2598 /* Set the mft record itself not in use. */
2597 m->flags &= cpu_to_le16( 2599 m->flags &= cpu_to_le16(
2598 ~le16_to_cpu(MFT_RECORD_IN_USE)); 2600 ~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
2603 ntfs_unmap_page(page); 2605 ntfs_unmap_page(page);
2604 goto undo_mftbmp_alloc; 2606 goto undo_mftbmp_alloc;
2605 } 2607 }
2608 BUG_ON(m != m_tmp);
2606 /* 2609 /*
2607 * Make sure the allocated mft record is written out to disk. 2610 * Make sure the allocated mft record is written out to disk.
2608 * No need to set the inode dirty because the caller is going 2611 * No need to set the inode dirty because the caller is going
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 07d9fd854350..d8a0313e99e6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3ccflags-y += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += \ 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \ 6 ocfs2.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f3..90f2729b7a5b 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -497,7 +497,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
498 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
499 499
500 if (!is_owner_or_cap(inode)) 500 if (!inode_owner_or_capable(inode))
501 return -EPERM; 501 return -EPERM;
502 502
503 if (value) { 503 if (value) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..daea0359e974 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2043,7 +2043,6 @@ const struct address_space_operations ocfs2_aops = {
2043 .write_begin = ocfs2_write_begin, 2043 .write_begin = ocfs2_write_begin,
2044 .write_end = ocfs2_write_end, 2044 .write_end = ocfs2_write_end,
2045 .bmap = ocfs2_bmap, 2045 .bmap = ocfs2_bmap,
2046 .sync_page = block_sync_page,
2047 .direct_IO = ocfs2_direct_IO, 2046 .direct_IO = ocfs2_direct_IO,
2048 .invalidatepage = ocfs2_invalidatepage, 2047 .invalidatepage = ocfs2_invalidatepage,
2049 .releasepage = ocfs2_releasepage, 2048 .releasepage = ocfs2_releasepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..1adab287bd24 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
367static void o2hb_wait_on_io(struct o2hb_region *reg, 367static void o2hb_wait_on_io(struct o2hb_region *reg,
368 struct o2hb_bio_wait_ctxt *wc) 368 struct o2hb_bio_wait_ctxt *wc)
369{ 369{
370 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
371
372 blk_run_address_space(mapping);
373 o2hb_bio_wait_dec(wc, 1); 370 o2hb_bio_wait_dec(wc, 1);
374
375 wait_for_completion(&wc->wc_io_complete); 371 wait_for_completion(&wc->wc_io_complete);
376} 372}
377 373
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..7eb90403fc8a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb; 57 struct ocfs2_super *osb;
58 58
59 if (nd->flags & LOOKUP_RCU) 59 if (nd && nd->flags & LOOKUP_RCU)
60 return -ECHILD; 60 return -ECHILD;
61 61
62 inode = dentry->d_inode; 62 inode = dentry->d_inode;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..f97b6f1c61dd 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -354,7 +354,7 @@ static inline int ocfs2_match(int len,
354/* 354/*
355 * Returns 0 if not found, -1 on failure, and 1 on success 355 * Returns 0 if not found, -1 on failure, and 1 on success
356 */ 356 */
357static int inline ocfs2_search_dirblock(struct buffer_head *bh, 357static inline int ocfs2_search_dirblock(struct buffer_head *bh,
358 struct inode *dir, 358 struct inode *dir,
359 const char *name, int namelen, 359 const char *name, int namelen,
360 unsigned long offset, 360 unsigned long offset,
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920fa..c8a044efbb15 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0d..f14be89a6701 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..254652a9b542 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
197 dentry->d_name.len, dentry->d_name.name, 197 dentry->d_name.len, dentry->d_name.name,
198 fh, len, connectable); 198 fh, len, connectable);
199 199
200 if (len < 3 || (connectable && len < 6)) { 200 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 201 *max_len = 6;
202 type = 255;
203 goto bail;
204 } else if (len < 3) {
205 *max_len = 3;
202 type = 255; 206 type = 255;
203 goto bail; 207 goto bail;
204 } 208 }
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a4868196152..09de77ce002a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -82,7 +82,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
82 } 82 }
83 83
84 status = -EACCES; 84 status = -EACCES;
85 if (!is_owner_or_cap(inode)) 85 if (!inode_owner_or_capable(inode))
86 goto bail_unlock; 86 goto bail_unlock;
87 87
88 if (!S_ISDIR(inode->i_mode)) 88 if (!S_ISDIR(inode->i_mode))
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..6180da1e37e6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
405 ocfs2_quota_trans_credits(sb); 405 ocfs2_quota_trans_credits(sb);
406} 406}
407 407
408/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 408/* data block for new dir/symlink, allocation of directory block, dx_root
409 * bitmap block for the new bit) dx_root update for free list */ 409 * update for free list */
410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
411 411
412static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 412static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
413{ 413{
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..d6c25d76b537 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -293,7 +293,7 @@ static int ocfs2_mknod(struct inode *dir,
293 } 293 }
294 294
295 /* get security xattr */ 295 /* get security xattr */
296 status = ocfs2_init_security_get(inode, dir, &si); 296 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
297 if (status) { 297 if (status) {
298 if (status == -EOPNOTSUPP) 298 if (status == -EOPNOTSUPP)
299 si.enable = 0; 299 si.enable = 0;
@@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir,
1665 } 1665 }
1666 1666
1667 /* get security xattr */ 1667 /* get security xattr */
1668 status = ocfs2_init_security_get(inode, dir, &si); 1668 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
1669 if (status) { 1669 if (status) {
1670 if (status == -EOPNOTSUPP) 1670 if (status == -EOPNOTSUPP)
1671 si.enable = 0; 1671 si.enable = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51cd6898e7f1..1a97ba1ec3fc 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -831,18 +831,18 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
831 831
832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
833{ 833{
834 ext2_set_bit(bit, bitmap); 834 __test_and_set_bit_le(bit, bitmap);
835} 835}
836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
837 837
838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
839{ 839{
840 ext2_clear_bit(bit, bitmap); 840 __test_and_clear_bit_le(bit, bitmap);
841} 841}
842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
843 843
844#define ocfs2_test_bit ext2_test_bit 844#define ocfs2_test_bit test_bit_le
845#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 845#define ocfs2_find_next_zero_bit find_next_zero_bit_le
846#define ocfs2_find_next_bit ext2_find_next_bit 846#define ocfs2_find_next_bit find_next_bit_le
847#endif /* OCFS2_H */ 847#endif /* OCFS2_H */
848 848
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95d..d5ab56cbe5c5 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
114extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
116 116
117int ocfs2_quota_setup(void);
118void ocfs2_quota_shutdown(void);
119
120#endif /* _OCFS2_QUOTA_H */ 117#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..a73f64166481 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -63,8 +63,6 @@
63 * write to gf 63 * write to gf
64 */ 64 */
65 65
66static struct workqueue_struct *ocfs2_quota_wq = NULL;
67
68static void qsync_work_fn(struct work_struct *work); 66static void qsync_work_fn(struct work_struct *work);
69 67
70static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) 68static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
400 OCFS2_QBLK_RESERVED_SPACE; 398 OCFS2_QBLK_RESERVED_SPACE;
401 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 399 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
402 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 400 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
403 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 401 schedule_delayed_work(&oinfo->dqi_sync_work,
404 msecs_to_jiffies(oinfo->dqi_syncms)); 402 msecs_to_jiffies(oinfo->dqi_syncms));
405 403
406out_err: 404out_err:
407 mlog_exit(status); 405 mlog_exit(status);
@@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work)
635 struct super_block *sb = oinfo->dqi_gqinode->i_sb; 633 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
636 634
637 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 635 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
638 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 636 schedule_delayed_work(&oinfo->dqi_sync_work,
639 msecs_to_jiffies(oinfo->dqi_syncms)); 637 msecs_to_jiffies(oinfo->dqi_syncms));
640} 638}
641 639
642/* 640/*
@@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = {
923 .alloc_dquot = ocfs2_alloc_dquot, 921 .alloc_dquot = ocfs2_alloc_dquot,
924 .destroy_dquot = ocfs2_destroy_dquot, 922 .destroy_dquot = ocfs2_destroy_dquot,
925}; 923};
926
927int ocfs2_quota_setup(void)
928{
929 ocfs2_quota_wq = create_workqueue("o2quot");
930 if (!ocfs2_quota_wq)
931 return -ENOMEM;
932 return 0;
933}
934
935void ocfs2_quota_shutdown(void)
936{
937 if (ocfs2_quota_wq) {
938 flush_workqueue(ocfs2_quota_wq);
939 destroy_workqueue(ocfs2_quota_wq);
940 ocfs2_quota_wq = NULL;
941 }
942}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..c384d634872a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3228 u32 num_clusters, unsigned int e_flags) 3228 u32 num_clusters, unsigned int e_flags)
3229{ 3229{
3230 int ret, delete, index, credits = 0; 3230 int ret, delete, index, credits = 0;
3231 u32 new_bit, new_len; 3231 u32 new_bit, new_len, orig_num_clusters;
3232 unsigned int set_len; 3232 unsigned int set_len;
3233 struct ocfs2_super *osb = OCFS2_SB(sb); 3233 struct ocfs2_super *osb = OCFS2_SB(sb);
3234 handle_t *handle; 3234 handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3261 goto out; 3261 goto out;
3262 } 3262 }
3263 3263
3264 orig_num_clusters = num_clusters;
3265
3264 while (num_clusters) { 3266 while (num_clusters) {
3265 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3267 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3266 p_cluster, num_clusters, 3268 p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3348 * in write-back mode. 3350 * in write-back mode.
3349 */ 3351 */
3350 if (context->get_clusters == ocfs2_di_get_clusters) { 3352 if (context->get_clusters == ocfs2_di_get_clusters) {
3351 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3353 ret = ocfs2_cow_sync_writeback(sb, context, cpos,
3354 orig_num_clusters);
3352 if (ret) 3355 if (ret)
3353 mlog_errno(ret); 3356 mlog_errno(ret);
3354 } 3357 }
@@ -4325,7 +4328,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4325 4328
4326 /* If the security isn't preserved, we need to re-initialize them. */ 4329 /* If the security isn't preserved, we need to re-initialize them. */
4327 if (!preserve) { 4330 if (!preserve) {
4328 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4331 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4332 &new_dentry->d_name);
4329 if (error) 4333 if (error)
4330 mlog_errno(error); 4334 mlog_errno(error);
4331 } 4335 }
@@ -4376,7 +4380,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4376 if (IS_ERR(s)) 4380 if (IS_ERR(s))
4377 return PTR_ERR(s); 4381 return PTR_ERR(s);
4378 4382
4379 error = path_lookup(s, LOOKUP_PARENT, nd); 4383 error = kern_path_parent(s, nd);
4380 if (error) 4384 if (error)
4381 putname(s); 4385 putname(s);
4382 else 4386 else
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..236ed1bdca2c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1316 struct mount_options *mopt, 1316 struct mount_options *mopt,
1317 int is_remount) 1317 int is_remount)
1318{ 1318{
1319 int status; 1319 int status, user_stack = 0;
1320 char *p; 1320 char *p;
1321 u32 tmp; 1321 u32 tmp;
1322 1322
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1459 memcpy(mopt->cluster_stack, args[0].from, 1459 memcpy(mopt->cluster_stack, args[0].from,
1460 OCFS2_STACK_LABEL_LEN); 1460 OCFS2_STACK_LABEL_LEN);
1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1462 /*
1463 * Open code the memcmp here as we don't have
1464 * an osb to pass to
1465 * ocfs2_userspace_stack().
1466 */
1467 if (memcmp(mopt->cluster_stack,
1468 OCFS2_CLASSIC_CLUSTER_STACK,
1469 OCFS2_STACK_LABEL_LEN))
1470 user_stack = 1;
1462 break; 1471 break;
1463 case Opt_inode64: 1472 case Opt_inode64:
1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1473 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
1514 } 1523 }
1515 } 1524 }
1516 1525
1517 /* Ensure only one heartbeat mode */ 1526 if (user_stack == 0) {
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 1527 /* Ensure only one heartbeat mode */
1519 OCFS2_MOUNT_HB_NONE); 1528 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1520 if (hweight32(tmp) != 1) { 1529 OCFS2_MOUNT_HB_GLOBAL |
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1530 OCFS2_MOUNT_HB_NONE);
1522 status = 0; 1531 if (hweight32(tmp) != 1) {
1523 goto bail; 1532 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1533 status = 0;
1534 goto bail;
1535 }
1524 } 1536 }
1525 1537
1526 status = 1; 1538 status = 1;
@@ -1645,16 +1657,11 @@ static int __init ocfs2_init(void)
1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1657 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1646 } 1658 }
1647 1659
1648 status = ocfs2_quota_setup();
1649 if (status)
1650 goto leave;
1651
1652 ocfs2_set_locking_protocol(); 1660 ocfs2_set_locking_protocol();
1653 1661
1654 status = register_quota_format(&ocfs2_quota_format); 1662 status = register_quota_format(&ocfs2_quota_format);
1655leave: 1663leave:
1656 if (status < 0) { 1664 if (status < 0) {
1657 ocfs2_quota_shutdown();
1658 ocfs2_free_mem_caches(); 1665 ocfs2_free_mem_caches();
1659 exit_ocfs2_uptodate_cache(); 1666 exit_ocfs2_uptodate_cache();
1660 } 1667 }
@@ -1671,8 +1678,6 @@ static void __exit ocfs2_exit(void)
1671{ 1678{
1672 mlog_entry_void(); 1679 mlog_entry_void();
1673 1680
1674 ocfs2_quota_shutdown();
1675
1676 if (ocfs2_wq) { 1681 if (ocfs2_wq) {
1677 flush_workqueue(ocfs2_wq); 1682 flush_workqueue(ocfs2_wq);
1678 destroy_workqueue(ocfs2_wq); 1683 destroy_workqueue(ocfs2_wq);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..6bb602486c6b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,7 +7185,8 @@ out:
7185 * must not hold any lock expect i_mutex. 7185 * must not hold any lock expect i_mutex.
7186 */ 7186 */
7187int ocfs2_init_security_and_acl(struct inode *dir, 7187int ocfs2_init_security_and_acl(struct inode *dir,
7188 struct inode *inode) 7188 struct inode *inode,
7189 const struct qstr *qstr)
7189{ 7190{
7190 int ret = 0; 7191 int ret = 0;
7191 struct buffer_head *dir_bh = NULL; 7192 struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7193 .enable = 1, 7194 .enable = 1,
7194 }; 7195 };
7195 7196
7196 ret = ocfs2_init_security_get(inode, dir, &si); 7197 ret = ocfs2_init_security_get(inode, dir, qstr, &si);
7197 if (!ret) { 7198 if (!ret) {
7198 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, 7199 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7199 si.name, si.value, si.value_len, 7200 si.name, si.value, si.value_len,
@@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7261 7262
7262int ocfs2_init_security_get(struct inode *inode, 7263int ocfs2_init_security_get(struct inode *inode,
7263 struct inode *dir, 7264 struct inode *dir,
7265 const struct qstr *qstr,
7264 struct ocfs2_security_xattr_info *si) 7266 struct ocfs2_security_xattr_info *si)
7265{ 7267{
7266 /* check whether ocfs2 support feature xattr */ 7268 /* check whether ocfs2 support feature xattr */
7267 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7269 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7268 return -EOPNOTSUPP; 7270 return -EOPNOTSUPP;
7269 return security_inode_init_security(inode, dir, &si->name, &si->value, 7271 return security_inode_init_security(inode, dir, qstr, &si->name,
7270 &si->value_len); 7272 &si->value, &si->value_len);
7271} 7273}
7272 7274
7273int ocfs2_init_security_set(handle_t *handle, 7275int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65b..d63cfb72316b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
57 struct ocfs2_dinode *di); 57 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 59int ocfs2_init_security_get(struct inode *, struct inode *,
60 const struct qstr *,
60 struct ocfs2_security_xattr_info *); 61 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *, 62int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *, 63 struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *new_bh, 95 struct buffer_head *new_bh,
95 bool preserve_security); 96 bool preserve_security);
96int ocfs2_init_security_and_acl(struct inode *dir, 97int ocfs2_init_security_and_acl(struct inode *dir,
97 struct inode *inode); 98 struct inode *inode,
99 const struct qstr *qstr);
98#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da7..de4ff29f1e05 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
235 return *ptr != ~0; 235 return *ptr != ~0;
236} 236}
237 237
238static int omfs_unlink(struct inode *dir, struct dentry *dentry) 238static int omfs_remove(struct inode *dir, struct dentry *dentry)
239{ 239{
240 int ret;
241 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret;
242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
244 return -ENOTEMPTY;
242 245
243 ret = omfs_delete_entry(dentry); 246 ret = omfs_delete_entry(dentry);
244 if (ret) 247 if (ret)
245 goto end_unlink; 248 return ret;
246 249
247 inode_dec_link_count(inode); 250 clear_nlink(inode);
251 mark_inode_dirty(inode);
248 mark_inode_dirty(dir); 252 mark_inode_dirty(dir);
249 253 return 0;
250end_unlink:
251 return ret;
252}
253
254static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
255{
256 int err = -ENOTEMPTY;
257 struct inode *inode = dentry->d_inode;
258
259 if (omfs_dir_is_empty(inode)) {
260 err = omfs_unlink(dir, dentry);
261 if (!err)
262 inode_dec_link_count(inode);
263 }
264 return err;
265} 254}
266 255
267static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) 256static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
372 361
373 res = filldir(dirent, oi->i_name, strnlen(oi->i_name, 362 res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
374 OMFS_NAMELEN), filp->f_pos, self, d_type); 363 OMFS_NAMELEN), filp->f_pos, self, d_type);
375 if (res == 0)
376 filp->f_pos++;
377 brelse(bh); 364 brelse(bh);
365 if (res < 0)
366 break;
367 filp->f_pos++;
378 } 368 }
379out: 369out:
380 return res; 370 return res;
@@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
385{ 375{
386 struct inode *new_inode = new_dentry->d_inode; 376 struct inode *new_inode = new_dentry->d_inode;
387 struct inode *old_inode = old_dentry->d_inode; 377 struct inode *old_inode = old_dentry->d_inode;
388 struct buffer_head *bh;
389 int is_dir;
390 int err; 378 int err;
391 379
392 is_dir = S_ISDIR(old_inode->i_mode);
393
394 if (new_inode) { 380 if (new_inode) {
395 /* overwriting existing file/dir */ 381 /* overwriting existing file/dir */
396 err = -ENOTEMPTY; 382 err = omfs_remove(new_dir, new_dentry);
397 if (is_dir && !omfs_dir_is_empty(new_inode))
398 goto out;
399
400 err = -ENOENT;
401 bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
402 new_dentry->d_name.len);
403 if (IS_ERR(bh))
404 goto out;
405 brelse(bh);
406
407 err = omfs_unlink(new_dir, new_dentry);
408 if (err) 383 if (err)
409 goto out; 384 goto out;
410 } 385 }
411 386
412 /* since omfs locates files by name, we need to unlink _before_ 387 /* since omfs locates files by name, we need to unlink _before_
413 * adding the new link or we won't find the old one */ 388 * adding the new link or we won't find the old one */
414 inode_inc_link_count(old_inode); 389 err = omfs_delete_entry(old_dentry);
415 err = omfs_unlink(old_dir, old_dentry); 390 if (err)
416 if (err) {
417 inode_dec_link_count(old_inode);
418 goto out; 391 goto out;
419 }
420 392
393 mark_inode_dirty(old_dir);
421 err = omfs_add_link(new_dentry, old_inode); 394 err = omfs_add_link(new_dentry, old_inode);
422 if (err) 395 if (err)
423 goto out; 396 goto out;
424 397
425 old_inode->i_ctime = CURRENT_TIME_SEC; 398 old_inode->i_ctime = CURRENT_TIME_SEC;
399 mark_inode_dirty(old_inode);
426out: 400out:
427 return err; 401 return err;
428} 402}
@@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = {
488 .mkdir = omfs_mkdir, 462 .mkdir = omfs_mkdir,
489 .rename = omfs_rename, 463 .rename = omfs_rename,
490 .create = omfs_create, 464 .create = omfs_create,
491 .unlink = omfs_unlink, 465 .unlink = omfs_remove,
492 .rmdir = omfs_rmdir, 466 .rmdir = omfs_remove,
493}; 467};
494 468
495const struct file_operations omfs_dir_operations = { 469const struct file_operations omfs_dir_operations = {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
372 .readpages = omfs_readpages, 372 .readpages = omfs_readpages,
373 .writepage = omfs_writepage, 373 .writepage = omfs_writepage,
374 .writepages = omfs_writepages, 374 .writepages = omfs_writepages,
375 .sync_page = block_sync_page,
376 .write_begin = omfs_write_begin, 375 .write_begin = omfs_write_begin,
377 .write_end = generic_write_end, 376 .write_end = generic_write_end,
378 .bmap = omfs_bmap, 377 .bmap = omfs_bmap,
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..b52cf013ffa1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
233 233
234 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
235 return -EBADF; 235 return -EBADF;
236
237 /* It's not possible punch hole on append only file */
238 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
239 return -EPERM;
240
241 if (IS_IMMUTABLE(inode))
242 return -EPERM;
243
236 /* 244 /*
237 * Revalidate the write permissions, in case security policy has 245 * Revalidate the write permissions, in case security policy has
238 * changed since the files were opened. 246 * changed since the files were opened.
@@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
565{ 573{
566 struct path path; 574 struct path path;
567 int error = -EINVAL; 575 int error = -EINVAL;
568 int follow; 576 int lookup_flags;
569 577
570 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
571 goto out; 579 goto out;
572 580
573 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
574 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
575 if (error) 585 if (error)
576 goto out; 586 goto out;
577 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
661 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
662 const struct cred *cred) 672 const struct cred *cred)
663{ 673{
674 static const struct file_operations empty_fops = {};
664 struct inode *inode; 675 struct inode *inode;
665 int error; 676 int error;
666 677
667 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
668 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
669 inode = dentry->d_inode; 684 inode = dentry->d_inode;
670 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
671 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
679 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
680 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
681 f->f_pos = 0; 696 f->f_pos = 0;
682 f->f_op = fops_get(inode->i_fop);
683 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
684 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
685 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
686 if (error) 707 if (error)
687 goto cleanup_all; 708 goto cleanup_all;
@@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
693 if (error) 714 if (error)
694 goto cleanup_all; 715 goto cleanup_all;
695 } 716 }
696 ima_counts_get(f); 717 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
718 i_readcount_inc(inode);
697 719
698 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 720 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
699 721
@@ -790,6 +812,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
790 812
791 /* Pick up the filp from the open intent */ 813 /* Pick up the filp from the open intent */
792 filp = nd->intent.open.file; 814 filp = nd->intent.open.file;
815 nd->intent.open.file = NULL;
816
793 /* Has the filesystem initialised the file for us? */ 817 /* Has the filesystem initialised the file for us? */
794 if (filp->f_path.dentry == NULL) { 818 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path); 819 path_get(&nd->path);
@@ -811,17 +835,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
811 835
812 validate_creds(cred); 836 validate_creds(cred);
813 837
814 /* 838 /* We must always pass in a valid mount pointer. */
815 * We must always pass in a valid mount pointer. Historically 839 BUG_ON(!mnt);
816 * callers got away with not passing it, but we must enforce this at
817 * the earliest possible point now to avoid strange problems deep in the
818 * filesystem stack.
819 */
820 if (!mnt) {
821 printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
822 dump_stack();
823 return ERR_PTR(-EINVAL);
824 }
825 840
826 error = -ENFILE; 841 error = -ENFILE;
827 f = get_empty_filp(); 842 f = get_empty_filp();
@@ -880,15 +895,110 @@ void fd_install(unsigned int fd, struct file *file)
880 895
881EXPORT_SYMBOL(fd_install); 896EXPORT_SYMBOL(fd_install);
882 897
898static inline int build_open_flags(int flags, int mode, struct open_flags *op)
899{
900 int lookup_flags = 0;
901 int acc_mode;
902
903 if (!(flags & O_CREAT))
904 mode = 0;
905 op->mode = mode;
906
907 /* Must never be set by userspace */
908 flags &= ~FMODE_NONOTIFY;
909
910 /*
911 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
912 * check for O_DSYNC if the need any syncing at all we enforce it's
913 * always set instead of having to deal with possibly weird behaviour
914 * for malicious applications setting only __O_SYNC.
915 */
916 if (flags & __O_SYNC)
917 flags |= O_DSYNC;
918
919 /*
920 * If we have O_PATH in the open flag. Then we
921 * cannot have anything other than the below set of flags
922 */
923 if (flags & O_PATH) {
924 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
925 acc_mode = 0;
926 } else {
927 acc_mode = MAY_OPEN | ACC_MODE(flags);
928 }
929
930 op->open_flag = flags;
931
932 /* O_TRUNC implies we need access checks for write permissions */
933 if (flags & O_TRUNC)
934 acc_mode |= MAY_WRITE;
935
936 /* Allow the LSM permission hook to distinguish append
937 access from general write access. */
938 if (flags & O_APPEND)
939 acc_mode |= MAY_APPEND;
940
941 op->acc_mode = acc_mode;
942
943 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
944
945 if (flags & O_CREAT) {
946 op->intent |= LOOKUP_CREATE;
947 if (flags & O_EXCL)
948 op->intent |= LOOKUP_EXCL;
949 }
950
951 if (flags & O_DIRECTORY)
952 lookup_flags |= LOOKUP_DIRECTORY;
953 if (!(flags & O_NOFOLLOW))
954 lookup_flags |= LOOKUP_FOLLOW;
955 return lookup_flags;
956}
957
958/**
959 * filp_open - open file and return file pointer
960 *
961 * @filename: path to open
962 * @flags: open flags as per the open(2) second argument
963 * @mode: mode for the new file if O_CREAT is set, else ignored
964 *
965 * This is the helper to open a file from kernelspace if you really
966 * have to. But in generally you should not do this, so please move
967 * along, nothing to see here..
968 */
969struct file *filp_open(const char *filename, int flags, int mode)
970{
971 struct open_flags op;
972 int lookup = build_open_flags(flags, mode, &op);
973 return do_filp_open(AT_FDCWD, filename, &op, lookup);
974}
975EXPORT_SYMBOL(filp_open);
976
977struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
978 const char *filename, int flags)
979{
980 struct open_flags op;
981 int lookup = build_open_flags(flags, 0, &op);
982 if (flags & O_CREAT)
983 return ERR_PTR(-EINVAL);
984 if (!filename && (flags & O_DIRECTORY))
985 if (!dentry->d_inode->i_op->lookup)
986 return ERR_PTR(-ENOTDIR);
987 return do_file_open_root(dentry, mnt, filename, &op, lookup);
988}
989EXPORT_SYMBOL(file_open_root);
990
883long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 991long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
884{ 992{
993 struct open_flags op;
994 int lookup = build_open_flags(flags, mode, &op);
885 char *tmp = getname(filename); 995 char *tmp = getname(filename);
886 int fd = PTR_ERR(tmp); 996 int fd = PTR_ERR(tmp);
887 997
888 if (!IS_ERR(tmp)) { 998 if (!IS_ERR(tmp)) {
889 fd = get_unused_fd_flags(flags); 999 fd = get_unused_fd_flags(flags);
890 if (fd >= 0) { 1000 if (fd >= 0) {
891 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1001 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
892 if (IS_ERR(f)) { 1002 if (IS_ERR(f)) {
893 put_unused_fd(fd); 1003 put_unused_fd(fd);
894 fd = PTR_ERR(f); 1004 fd = PTR_ERR(f);
@@ -958,8 +1068,10 @@ int filp_close(struct file *filp, fl_owner_t id)
958 if (filp->f_op && filp->f_op->flush) 1068 if (filp->f_op && filp->f_op->flush)
959 retval = filp->f_op->flush(filp, id); 1069 retval = filp->f_op->flush(filp, id);
960 1070
961 dnotify_flush(filp, id); 1071 if (likely(!(filp->f_mode & FMODE_PATH))) {
962 locks_remove_posix(filp, id); 1072 dnotify_flush(filp, id);
1073 locks_remove_posix(filp, id);
1074 }
963 fput(filp); 1075 fput(filp);
964 return retval; 1076 return retval;
965} 1077}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..ac546975031f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
290{ 290{
291 struct hd_struct *p = dev_to_part(dev); 291 struct hd_struct *p = dev_to_part(dev);
292 292
293 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); 293 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
294 atomic_read(&p->in_flight[1]));
294} 295}
295 296
296#ifdef CONFIG_FAIL_MAKE_REQUEST 297#ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..b10e3540d5b7 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
251 } 251 }
252 252
253 vm->vblk_size = get_unaligned_be32(data + 0x08); 253 vm->vblk_size = get_unaligned_be32(data + 0x08);
254 if (vm->vblk_size == 0) {
255 ldm_error ("Illegal VBLK size");
256 return false;
257 }
258
254 vm->vblk_offset = get_unaligned_be32(data + 0x0C); 259 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
255 vm->last_vblk_seq = get_unaligned_be32(data + 0x04); 260 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
256 261
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee79..11f688bd76c5 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
29 29
30int mac_partition(struct parsed_partitions *state) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1;
33 Sector sect; 32 Sector sect;
34 unsigned char *data; 33 unsigned char *data;
35 int blk, blocks_in_map; 34 int slot, blocks_in_map;
36 unsigned secsize; 35 unsigned secsize;
37#ifdef CONFIG_PPC_PMAC 36#ifdef CONFIG_PPC_PMAC
38 int found_root = 0; 37 int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 58 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 59 return 0; /* not a MacOS disk */
61 } 60 }
62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 61 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
65 int pos = blk * secsize; 63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
66 put_dev_sector(sect); 69 put_dev_sector(sect);
67 data = read_part_sector(state, pos/512, &sect); 70 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 71 if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
113 } 116 }
114 117
115 if (goodness > found_root_goodness) { 118 if (goodness > found_root_goodness) {
116 found_root = blk; 119 found_root = slot;
117 found_root_goodness = goodness; 120 found_root_goodness = goodness;
118 } 121 }
119 } 122 }
120#endif /* CONFIG_PPC_PMAC */ 123#endif /* CONFIG_PPC_PMAC */
121
122 ++slot;
123 } 124 }
124#ifdef CONFIG_PPC_PMAC 125#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 126 if (found_root_goodness)
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..764b86a01965 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 18
14
13int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
14{ 16{
15 int i; 17 int i;
16 int slot = 1; 18 int slot = 1;
19 unsigned int npartitions;
17 Sector sect; 20 Sector sect;
18 unsigned char *data; 21 unsigned char *data;
19 struct disklabel { 22 struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
45 u8 p_fstype; 48 u8 p_fstype;
46 u8 p_frag; 49 u8 p_frag;
47 __le16 p_cpg; 50 __le16 p_cpg;
48 } d_partitions[8]; 51 } d_partitions[MAX_OSF_PARTITIONS];
49 } * label; 52 } * label;
50 struct d_partition * partition; 53 struct d_partition * partition;
51 54
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
63 put_dev_sector(sect); 66 put_dev_sector(sect);
64 return 0; 67 return 0;
65 } 68 }
66 for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { 69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
67 if (slot == state->limit) 75 if (slot == state->limit)
68 break; 76 break;
69 if (le32_to_cpu(partition->p_size)) 77 if (le32_to_cpu(partition->p_size))
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec25..b1cf6bf4b41d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init);
25EXPORT_SYMBOL(posix_acl_alloc); 26EXPORT_SYMBOL(posix_acl_alloc);
26EXPORT_SYMBOL(posix_acl_clone); 27EXPORT_SYMBOL(posix_acl_clone);
27EXPORT_SYMBOL(posix_acl_valid); 28EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
32EXPORT_SYMBOL(posix_acl_permission); 33EXPORT_SYMBOL(posix_acl_permission);
33 34
34/* 35/*
36 * Init a fresh posix_acl
37 */
38void
39posix_acl_init(struct posix_acl *acl, int count)
40{
41 atomic_set(&acl->a_refcount, 1);
42 acl->a_count = count;
43}
44
45/*
35 * Allocate a new ACL with the specified number of entries. 46 * Allocate a new ACL with the specified number of entries.
36 */ 47 */
37struct posix_acl * 48struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
40 const size_t size = sizeof(struct posix_acl) + 51 const size_t size = sizeof(struct posix_acl) +
41 count * sizeof(struct posix_acl_entry); 52 count * sizeof(struct posix_acl_entry);
42 struct posix_acl *acl = kmalloc(size, flags); 53 struct posix_acl *acl = kmalloc(size, flags);
43 if (acl) { 54 if (acl)
44 atomic_set(&acl->a_refcount, 1); 55 posix_acl_init(acl, count);
45 acl->a_count = count;
46 }
47 return acl; 56 return acl;
48} 57}
49 58
diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0f..5e4f776b0917 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
353 task_cap(m, task); 353 task_cap(m, task);
354 task_cpus_allowed(m, task); 354 task_cpus_allowed(m, task);
355 cpuset_task_status_allowed(m, task); 355 cpuset_task_status_allowed(m, task);
356#if defined(CONFIG_S390)
357 task_show_regs(m, task);
358#endif
359 task_context_switch_counts(m, task); 356 task_context_switch_counts(m, task);
360 return 0; 357 return 0;
361} 358}
@@ -492,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
492 vsize, 489 vsize,
493 mm ? get_mm_rss(mm) : 0, 490 mm ? get_mm_rss(mm) : 0,
494 rsslim, 491 rsslim,
495 mm ? mm->start_code : 0, 492 mm ? (permitted ? mm->start_code : 1) : 0,
496 mm ? mm->end_code : 0, 493 mm ? (permitted ? mm->end_code : 1) : 0,
497 (permitted && mm) ? mm->start_stack : 0, 494 (permitted && mm) ? mm->start_stack : 0,
498 esp, 495 esp,
499 eip, 496 eip,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..5a670c11aeac 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,17 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
191 return result; 191 return result;
192} 192}
193 193
194/* 194static struct mm_struct *__check_mem_permission(struct task_struct *task)
195 * Return zero if current may access user memory in @task, -error if not.
196 */
197static int check_mem_permission(struct task_struct *task)
198{ 195{
196 struct mm_struct *mm;
197
198 mm = get_task_mm(task);
199 if (!mm)
200 return ERR_PTR(-EINVAL);
201
199 /* 202 /*
200 * A task can always look at itself, in case it chooses 203 * A task can always look at itself, in case it chooses
201 * to use system calls instead of load instructions. 204 * to use system calls instead of load instructions.
202 */ 205 */
203 if (task == current) 206 if (task == current)
204 return 0; 207 return mm;
205 208
206 /* 209 /*
207 * If current is actively ptrace'ing, and would also be 210 * If current is actively ptrace'ing, and would also be
@@ -213,27 +216,53 @@ static int check_mem_permission(struct task_struct *task)
213 match = (tracehook_tracer_task(task) == current); 216 match = (tracehook_tracer_task(task) == current);
214 rcu_read_unlock(); 217 rcu_read_unlock();
215 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) 218 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
216 return 0; 219 return mm;
217 } 220 }
218 221
219 /* 222 /*
220 * Noone else is allowed. 223 * Noone else is allowed.
221 */ 224 */
222 return -EPERM; 225 mmput(mm);
226 return ERR_PTR(-EPERM);
227}
228
229/*
230 * If current may access user memory in @task return a reference to the
231 * corresponding mm, otherwise ERR_PTR.
232 */
233static struct mm_struct *check_mem_permission(struct task_struct *task)
234{
235 struct mm_struct *mm;
236 int err;
237
238 /*
239 * Avoid racing if task exec's as we might get a new mm but validate
240 * against old credentials.
241 */
242 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
243 if (err)
244 return ERR_PTR(err);
245
246 mm = __check_mem_permission(task);
247 mutex_unlock(&task->signal->cred_guard_mutex);
248
249 return mm;
223} 250}
224 251
225struct mm_struct *mm_for_maps(struct task_struct *task) 252struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 253{
227 struct mm_struct *mm; 254 struct mm_struct *mm;
255 int err;
228 256
229 if (mutex_lock_killable(&task->signal->cred_guard_mutex)) 257 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
230 return NULL; 258 if (err)
259 return ERR_PTR(err);
231 260
232 mm = get_task_mm(task); 261 mm = get_task_mm(task);
233 if (mm && mm != current->mm && 262 if (mm && mm != current->mm &&
234 !ptrace_may_access(task, PTRACE_MODE_READ)) { 263 !ptrace_may_access(task, PTRACE_MODE_READ)) {
235 mmput(mm); 264 mmput(mm);
236 mm = NULL; 265 mm = ERR_PTR(-EACCES);
237 } 266 }
238 mutex_unlock(&task->signal->cred_guard_mutex); 267 mutex_unlock(&task->signal->cred_guard_mutex);
239 268
@@ -279,9 +308,9 @@ out:
279 308
280static int proc_pid_auxv(struct task_struct *task, char *buffer) 309static int proc_pid_auxv(struct task_struct *task, char *buffer)
281{ 310{
282 int res = 0; 311 struct mm_struct *mm = mm_for_maps(task);
283 struct mm_struct *mm = get_task_mm(task); 312 int res = PTR_ERR(mm);
284 if (mm) { 313 if (mm && !IS_ERR(mm)) {
285 unsigned int nwords = 0; 314 unsigned int nwords = 0;
286 do { 315 do {
287 nwords += 2; 316 nwords += 2;
@@ -318,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
318} 347}
319#endif /* CONFIG_KALLSYMS */ 348#endif /* CONFIG_KALLSYMS */
320 349
350static int lock_trace(struct task_struct *task)
351{
352 int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
353 if (err)
354 return err;
355 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
356 mutex_unlock(&task->signal->cred_guard_mutex);
357 return -EPERM;
358 }
359 return 0;
360}
361
362static void unlock_trace(struct task_struct *task)
363{
364 mutex_unlock(&task->signal->cred_guard_mutex);
365}
366
321#ifdef CONFIG_STACKTRACE 367#ifdef CONFIG_STACKTRACE
322 368
323#define MAX_STACK_TRACE_DEPTH 64 369#define MAX_STACK_TRACE_DEPTH 64
@@ -327,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
327{ 373{
328 struct stack_trace trace; 374 struct stack_trace trace;
329 unsigned long *entries; 375 unsigned long *entries;
376 int err;
330 int i; 377 int i;
331 378
332 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); 379 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -337,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
337 trace.max_entries = MAX_STACK_TRACE_DEPTH; 384 trace.max_entries = MAX_STACK_TRACE_DEPTH;
338 trace.entries = entries; 385 trace.entries = entries;
339 trace.skip = 0; 386 trace.skip = 0;
340 save_stack_trace_tsk(task, &trace);
341 387
342 for (i = 0; i < trace.nr_entries; i++) { 388 err = lock_trace(task);
343 seq_printf(m, "[<%p>] %pS\n", 389 if (!err) {
344 (void *)entries[i], (void *)entries[i]); 390 save_stack_trace_tsk(task, &trace);
391
392 for (i = 0; i < trace.nr_entries; i++) {
393 seq_printf(m, "[<%pK>] %pS\n",
394 (void *)entries[i], (void *)entries[i]);
395 }
396 unlock_trace(task);
345 } 397 }
346 kfree(entries); 398 kfree(entries);
347 399
348 return 0; 400 return err;
349} 401}
350#endif 402#endif
351 403
@@ -508,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
508{ 560{
509 long nr; 561 long nr;
510 unsigned long args[6], sp, pc; 562 unsigned long args[6], sp, pc;
563 int res = lock_trace(task);
564 if (res)
565 return res;
511 566
512 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 567 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
513 return sprintf(buffer, "running\n"); 568 res = sprintf(buffer, "running\n");
514 569 else if (nr < 0)
515 if (nr < 0) 570 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
516 return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 571 else
517 572 res = sprintf(buffer,
518 return sprintf(buffer,
519 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 573 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
520 nr, 574 nr,
521 args[0], args[1], args[2], args[3], args[4], args[5], 575 args[0], args[1], args[2], args[3], args[4], args[5],
522 sp, pc); 576 sp, pc);
577 unlock_trace(task);
578 return res;
523} 579}
524#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ 580#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
525 581
@@ -775,18 +831,14 @@ static ssize_t mem_read(struct file * file, char __user * buf,
775 if (!task) 831 if (!task)
776 goto out_no_task; 832 goto out_no_task;
777 833
778 if (check_mem_permission(task))
779 goto out;
780
781 ret = -ENOMEM; 834 ret = -ENOMEM;
782 page = (char *)__get_free_page(GFP_TEMPORARY); 835 page = (char *)__get_free_page(GFP_TEMPORARY);
783 if (!page) 836 if (!page)
784 goto out; 837 goto out;
785 838
786 ret = 0; 839 mm = check_mem_permission(task);
787 840 ret = PTR_ERR(mm);
788 mm = get_task_mm(task); 841 if (IS_ERR(mm))
789 if (!mm)
790 goto out_free; 842 goto out_free;
791 843
792 ret = -EIO; 844 ret = -EIO;
@@ -800,8 +852,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
800 int this_len, retval; 852 int this_len, retval;
801 853
802 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; 854 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
803 retval = access_process_vm(task, src, page, this_len, 0); 855 retval = access_remote_vm(mm, src, page, this_len, 0);
804 if (!retval || check_mem_permission(task)) { 856 if (!retval) {
805 if (!ret) 857 if (!ret)
806 ret = -EIO; 858 ret = -EIO;
807 break; 859 break;
@@ -829,10 +881,6 @@ out_no_task:
829 return ret; 881 return ret;
830} 882}
831 883
832#define mem_write NULL
833
834#ifndef mem_write
835/* This is a security hazard */
836static ssize_t mem_write(struct file * file, const char __user *buf, 884static ssize_t mem_write(struct file * file, const char __user *buf,
837 size_t count, loff_t *ppos) 885 size_t count, loff_t *ppos)
838{ 886{
@@ -840,18 +888,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
840 char *page; 888 char *page;
841 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 889 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
842 unsigned long dst = *ppos; 890 unsigned long dst = *ppos;
891 struct mm_struct *mm;
843 892
844 copied = -ESRCH; 893 copied = -ESRCH;
845 if (!task) 894 if (!task)
846 goto out_no_task; 895 goto out_no_task;
847 896
848 if (check_mem_permission(task)) 897 mm = check_mem_permission(task);
849 goto out; 898 copied = PTR_ERR(mm);
899 if (IS_ERR(mm))
900 goto out_task;
901
902 copied = -EIO;
903 if (file->private_data != (void *)((long)current->self_exec_id))
904 goto out_mm;
850 905
851 copied = -ENOMEM; 906 copied = -ENOMEM;
852 page = (char *)__get_free_page(GFP_TEMPORARY); 907 page = (char *)__get_free_page(GFP_TEMPORARY);
853 if (!page) 908 if (!page)
854 goto out; 909 goto out_mm;
855 910
856 copied = 0; 911 copied = 0;
857 while (count > 0) { 912 while (count > 0) {
@@ -862,7 +917,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
862 copied = -EFAULT; 917 copied = -EFAULT;
863 break; 918 break;
864 } 919 }
865 retval = access_process_vm(task, dst, page, this_len, 1); 920 retval = access_remote_vm(mm, dst, page, this_len, 1);
866 if (!retval) { 921 if (!retval) {
867 if (!copied) 922 if (!copied)
868 copied = -EIO; 923 copied = -EIO;
@@ -875,12 +930,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
875 } 930 }
876 *ppos = dst; 931 *ppos = dst;
877 free_page((unsigned long) page); 932 free_page((unsigned long) page);
878out: 933out_mm:
934 mmput(mm);
935out_task:
879 put_task_struct(task); 936 put_task_struct(task);
880out_no_task: 937out_no_task:
881 return copied; 938 return copied;
882} 939}
883#endif
884 940
885loff_t mem_lseek(struct file *file, loff_t offset, int orig) 941loff_t mem_lseek(struct file *file, loff_t offset, int orig)
886{ 942{
@@ -917,20 +973,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
917 if (!task) 973 if (!task)
918 goto out_no_task; 974 goto out_no_task;
919 975
920 if (!ptrace_may_access(task, PTRACE_MODE_READ))
921 goto out;
922
923 ret = -ENOMEM; 976 ret = -ENOMEM;
924 page = (char *)__get_free_page(GFP_TEMPORARY); 977 page = (char *)__get_free_page(GFP_TEMPORARY);
925 if (!page) 978 if (!page)
926 goto out; 979 goto out;
927 980
928 ret = 0;
929 981
930 mm = get_task_mm(task); 982 mm = mm_for_maps(task);
931 if (!mm) 983 ret = PTR_ERR(mm);
984 if (!mm || IS_ERR(mm))
932 goto out_free; 985 goto out_free;
933 986
987 ret = 0;
934 while (count > 0) { 988 while (count > 0) {
935 int this_len, retval, max_len; 989 int this_len, retval, max_len;
936 990
@@ -2620,35 +2674,6 @@ static const struct pid_entry proc_base_stuff[] = {
2620 &proc_self_inode_operations, NULL, {}), 2674 &proc_self_inode_operations, NULL, {}),
2621}; 2675};
2622 2676
2623/*
2624 * Exceptional case: normally we are not allowed to unhash a busy
2625 * directory. In this case, however, we can do it - no aliasing problems
2626 * due to the way we treat inodes.
2627 */
2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2629{
2630 struct inode *inode;
2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2638 if (task) {
2639 put_task_struct(task);
2640 return 1;
2641 }
2642 d_drop(dentry);
2643 return 0;
2644}
2645
2646static const struct dentry_operations proc_base_dentry_operations =
2647{
2648 .d_revalidate = proc_base_revalidate,
2649 .d_delete = pid_delete_dentry,
2650};
2651
2652static struct dentry *proc_base_instantiate(struct inode *dir, 2677static struct dentry *proc_base_instantiate(struct inode *dir,
2653 struct dentry *dentry, struct task_struct *task, const void *ptr) 2678 struct dentry *dentry, struct task_struct *task, const void *ptr)
2654{ 2679{
@@ -2685,7 +2710,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2685 if (p->fop) 2710 if (p->fop)
2686 inode->i_fop = p->fop; 2711 inode->i_fop = p->fop;
2687 ei->op = p->op; 2712 ei->op = p->op;
2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2689 d_add(dentry, inode); 2713 d_add(dentry, inode);
2690 error = NULL; 2714 error = NULL;
2691out: 2715out:
@@ -2778,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2778static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2802static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2779 struct pid *pid, struct task_struct *task) 2803 struct pid *pid, struct task_struct *task)
2780{ 2804{
2781 seq_printf(m, "%08x\n", task->personality); 2805 int err = lock_trace(task);
2782 return 0; 2806 if (!err) {
2807 seq_printf(m, "%08x\n", task->personality);
2808 unlock_trace(task);
2809 }
2810 return err;
2783} 2811}
2784 2812
2785/* 2813/*
@@ -2798,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2798 REG("environ", S_IRUSR, proc_environ_operations), 2826 REG("environ", S_IRUSR, proc_environ_operations),
2799 INF("auxv", S_IRUSR, proc_pid_auxv), 2827 INF("auxv", S_IRUSR, proc_pid_auxv),
2800 ONE("status", S_IRUGO, proc_pid_status), 2828 ONE("status", S_IRUGO, proc_pid_status),
2801 ONE("personality", S_IRUSR, proc_pid_personality), 2829 ONE("personality", S_IRUGO, proc_pid_personality),
2802 INF("limits", S_IRUGO, proc_pid_limits), 2830 INF("limits", S_IRUGO, proc_pid_limits),
2803#ifdef CONFIG_SCHED_DEBUG 2831#ifdef CONFIG_SCHED_DEBUG
2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2832 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2808,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2808#endif 2836#endif
2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2837 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2838#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2811 INF("syscall", S_IRUSR, proc_pid_syscall), 2839 INF("syscall", S_IRUGO, proc_pid_syscall),
2812#endif 2840#endif
2813 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2841 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2814 ONE("stat", S_IRUGO, proc_tgid_stat), 2842 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2827,7 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2827#ifdef CONFIG_PROC_PAGE_MONITOR 2855#ifdef CONFIG_PROC_PAGE_MONITOR
2828 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2856 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2829 REG("smaps", S_IRUGO, proc_smaps_operations), 2857 REG("smaps", S_IRUGO, proc_smaps_operations),
2830 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2858 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2831#endif 2859#endif
2832#ifdef CONFIG_SECURITY 2860#ifdef CONFIG_SECURITY
2833 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2861 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2836,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2836 INF("wchan", S_IRUGO, proc_pid_wchan), 2864 INF("wchan", S_IRUGO, proc_pid_wchan),
2837#endif 2865#endif
2838#ifdef CONFIG_STACKTRACE 2866#ifdef CONFIG_STACKTRACE
2839 ONE("stack", S_IRUSR, proc_pid_stack), 2867 ONE("stack", S_IRUGO, proc_pid_stack),
2840#endif 2868#endif
2841#ifdef CONFIG_SCHEDSTATS 2869#ifdef CONFIG_SCHEDSTATS
2842 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2870 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3138,14 +3166,14 @@ static const struct pid_entry tid_base_stuff[] = {
3138 REG("environ", S_IRUSR, proc_environ_operations), 3166 REG("environ", S_IRUSR, proc_environ_operations),
3139 INF("auxv", S_IRUSR, proc_pid_auxv), 3167 INF("auxv", S_IRUSR, proc_pid_auxv),
3140 ONE("status", S_IRUGO, proc_pid_status), 3168 ONE("status", S_IRUGO, proc_pid_status),
3141 ONE("personality", S_IRUSR, proc_pid_personality), 3169 ONE("personality", S_IRUGO, proc_pid_personality),
3142 INF("limits", S_IRUGO, proc_pid_limits), 3170 INF("limits", S_IRUGO, proc_pid_limits),
3143#ifdef CONFIG_SCHED_DEBUG 3171#ifdef CONFIG_SCHED_DEBUG
3144 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3172 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3145#endif 3173#endif
3146 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 3174 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3147#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 3175#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3148 INF("syscall", S_IRUSR, proc_pid_syscall), 3176 INF("syscall", S_IRUGO, proc_pid_syscall),
3149#endif 3177#endif
3150 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3178 INF("cmdline", S_IRUGO, proc_pid_cmdline),
3151 ONE("stat", S_IRUGO, proc_tid_stat), 3179 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -3163,7 +3191,7 @@ static const struct pid_entry tid_base_stuff[] = {
3163#ifdef CONFIG_PROC_PAGE_MONITOR 3191#ifdef CONFIG_PROC_PAGE_MONITOR
3164 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3192 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3165 REG("smaps", S_IRUGO, proc_smaps_operations), 3193 REG("smaps", S_IRUGO, proc_smaps_operations),
3166 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3194 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3167#endif 3195#endif
3168#ifdef CONFIG_SECURITY 3196#ifdef CONFIG_SECURITY
3169 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 3197 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3172,7 +3200,7 @@ static const struct pid_entry tid_base_stuff[] = {
3172 INF("wchan", S_IRUGO, proc_pid_wchan), 3200 INF("wchan", S_IRUGO, proc_pid_wchan),
3173#endif 3201#endif
3174#ifdef CONFIG_STACKTRACE 3202#ifdef CONFIG_STACKTRACE
3175 ONE("stack", S_IRUSR, proc_pid_stack), 3203 ONE("stack", S_IRUGO, proc_pid_stack),
3176#endif 3204#endif
3177#ifdef CONFIG_SCHEDSTATS 3205#ifdef CONFIG_SCHEDSTATS
3178 INF("schedstat", S_IRUGO, proc_pid_schedstat), 3206 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3191,7 +3219,7 @@ static const struct pid_entry tid_base_stuff[] = {
3191 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3219 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3192#ifdef CONFIG_AUDITSYSCALL 3220#ifdef CONFIG_AUDITSYSCALL
3193 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3221 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3194 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3222 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3195#endif 3223#endif
3196#ifdef CONFIG_FAULT_INJECTION 3224#ifdef CONFIG_FAULT_INJECTION
3197 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 3225 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fdd..b701eaa482bf 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
67 struct console *con; 67 struct console *con;
68 loff_t off = 0; 68 loff_t off = 0;
69 69
70 acquire_console_sem(); 70 console_lock();
71 for_each_console(con) 71 for_each_console(con)
72 if (off++ == *pos) 72 if (off++ == *pos)
73 break; 73 break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
84 84
85static void c_stop(struct seq_file *m, void *v) 85static void c_stop(struct seq_file *m, void *v)
86{ 86{
87 release_console_sem(); 87 console_unlock();
88} 88}
89 89
90static const struct seq_operations consoles_op = { 90static const struct seq_operations consoles_op = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 01e07f2a188f..f1281339b6fa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
28 28
29DEFINE_SPINLOCK(proc_subdir_lock); 29DEFINE_SPINLOCK(proc_subdir_lock);
30 30
31static int proc_match(int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
33 if (de->namelen != len) 33 if (de->namelen != len)
34 return 0; 34 return 0;
@@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
303{ 303{
304 const char *cp = name, *next; 304 const char *cp = name, *next;
305 struct proc_dir_entry *de; 305 struct proc_dir_entry *de;
306 int len; 306 unsigned int len;
307 307
308 de = *ret; 308 de = *ret;
309 if (!de) 309 if (!de)
@@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
602{ 602{
603 struct proc_dir_entry *ent = NULL; 603 struct proc_dir_entry *ent = NULL;
604 const char *fn = name; 604 const char *fn = name;
605 int len; 605 unsigned int len;
606 606
607 /* make sure name is valid */ 607 /* make sure name is valid */
608 if (!name || !strlen(name)) goto out; 608 if (!name || !strlen(name)) goto out;
@@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
786 struct proc_dir_entry **p; 786 struct proc_dir_entry **p;
787 struct proc_dir_entry *de = NULL; 787 struct proc_dir_entry *de = NULL;
788 const char *fn = name; 788 const char *fn = name;
789 int len; 789 unsigned int len;
790 790
791 spin_lock(&proc_subdir_lock); 791 spin_lock(&proc_subdir_lock);
792 if (__xlate_proc_name(name, &parent, &fn) != 0) { 792 if (__xlate_proc_name(name, &parent, &fn) != 0) {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d15aa1b1cc8f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
27static void proc_evict_inode(struct inode *inode) 27static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head;
30 31
31 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
32 end_writeback(inode); 33 end_writeback(inode);
@@ -38,12 +39,13 @@ static void proc_evict_inode(struct inode *inode)
38 de = PROC_I(inode)->pde; 39 de = PROC_I(inode)->pde;
39 if (de) 40 if (de)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 head = PROC_I(inode)->sysctl;
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 if (head) {
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head);
46 }
43} 47}
44 48
45struct vfsmount *proc_mnt;
46
47static struct kmem_cache * proc_inode_cachep; 49static struct kmem_cache * proc_inode_cachep;
48 50
49static struct inode *proc_alloc_inode(struct super_block *sb) 51static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561ded409..c03e8d3a3a5b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
107} 107}
108void pde_put(struct proc_dir_entry *pde); 108void pde_put(struct proc_dir_entry *pde);
109 109
110extern struct vfsmount *proc_mnt;
111int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 111struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
113 112
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
233 return; 233 return;
234 root = of_find_node_by_path("/"); 234 root = of_find_node_by_path("/");
235 if (root == NULL) { 235 if (root == NULL) {
236 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 236 pr_debug("/proc/device-tree: can't find root\n");
237 return; 237 return;
238 } 238 }
239 proc_device_tree_add_node(root, proc_device_tree); 239 proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..f50133c11c24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
32 ei->sysctl_entry = table; 32 ei->sysctl_entry = table;
33 33
34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
35 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
36 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
37 if (!table->child) { 36 if (!table->child) {
38 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
@@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent,
408 const struct dentry *dentry, const struct inode *inode, 407 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name) 408 unsigned int len, const char *str, const struct qstr *name)
410{ 409{
410 struct ctl_table_header *head;
411 /* Although proc doesn't have negative dentries, rcu-walk means 411 /* Although proc doesn't have negative dentries, rcu-walk means
412 * that inode here can be NULL */ 412 * that inode here can be NULL */
413 /* AV: can it, indeed? */
413 if (!inode) 414 if (!inode)
414 return 0; 415 return 1;
415 if (name->len != len) 416 if (name->len != len)
416 return 1; 417 return 1;
417 if (memcmp(name->name, str, len)) 418 if (memcmp(name->name, str, len))
418 return 1; 419 return 1;
419 return !sysctl_is_seen(PROC_I(inode)->sysctl); 420 head = rcu_dereference(PROC_I(inode)->sysctl);
421 return !head || !sysctl_is_seen(head);
420} 422}
421 423
422static const struct dentry_operations proc_sys_dentry_operations = { 424static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e24ad6..a9000e9cfee5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
43 struct pid_namespace *ns; 43 struct pid_namespace *ns;
44 struct proc_inode *ei; 44 struct proc_inode *ei;
45 45
46 if (proc_mnt) {
47 /* Seed the root directory with a pid so it doesn't need
48 * to be special in base.c. I would do this earlier but
49 * the only task alive when /proc is mounted the first time
50 * is the init_task and it doesn't have any pids.
51 */
52 ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
53 if (!ei->pid)
54 ei->pid = find_get_pid(1);
55 }
56
57 if (flags & MS_KERNMOUNT) 46 if (flags & MS_KERNMOUNT)
58 ns = (struct pid_namespace *)data; 47 ns = (struct pid_namespace *)data;
59 else 48 else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
71 return ERR_PTR(err); 60 return ERR_PTR(err);
72 } 61 }
73 62
74 ei = PROC_I(sb->s_root->d_inode);
75 if (!ei->pid) {
76 rcu_read_lock();
77 ei->pid = get_pid(find_pid_ns(1, ns));
78 rcu_read_unlock();
79 }
80
81 sb->s_flags |= MS_ACTIVE; 63 sb->s_flags |= MS_ACTIVE;
82 } 64 }
83 65
66 ei = PROC_I(sb->s_root->d_inode);
67 if (!ei->pid) {
68 rcu_read_lock();
69 ei->pid = get_pid(find_pid_ns(1, ns));
70 rcu_read_unlock();
71 }
72
84 return dget(sb->s_root); 73 return dget(sb->s_root);
85} 74}
86 75
@@ -101,19 +90,20 @@ static struct file_system_type proc_fs_type = {
101 90
102void __init proc_root_init(void) 91void __init proc_root_init(void)
103{ 92{
93 struct vfsmount *mnt;
104 int err; 94 int err;
105 95
106 proc_init_inodecache(); 96 proc_init_inodecache();
107 err = register_filesystem(&proc_fs_type); 97 err = register_filesystem(&proc_fs_type);
108 if (err) 98 if (err)
109 return; 99 return;
110 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 100 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
111 if (IS_ERR(proc_mnt)) { 101 if (IS_ERR(mnt)) {
112 unregister_filesystem(&proc_fs_type); 102 unregister_filesystem(&proc_fs_type);
113 return; 103 return;
114 } 104 }
115 105
116 init_pid_ns.proc_mnt = proc_mnt; 106 init_pid_ns.proc_mnt = mnt;
117 proc_symlink("mounts", NULL, "self/mounts"); 107 proc_symlink("mounts", NULL, "self/mounts");
118 108
119 proc_net_init(); 109 proc_net_init();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f81..7c708a418acc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,6 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/hugetlb.h> 2#include <linux/hugetlb.h>
3#include <linux/huge_mm.h>
3#include <linux/mount.h> 4#include <linux/mount.h>
4#include <linux/seq_file.h> 5#include <linux/seq_file.h>
5#include <linux/highmem.h> 6#include <linux/highmem.h>
@@ -7,6 +8,7 @@
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/pagemap.h> 9#include <linux/pagemap.h>
9#include <linux/mempolicy.h> 10#include <linux/mempolicy.h>
11#include <linux/rmap.h>
10#include <linux/swap.h> 12#include <linux/swap.h>
11#include <linux/swapops.h> 13#include <linux/swapops.h>
12 14
@@ -119,14 +121,14 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 121
120 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 122 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
121 if (!priv->task) 123 if (!priv->task)
122 return NULL; 124 return ERR_PTR(-ESRCH);
123 125
124 mm = mm_for_maps(priv->task); 126 mm = mm_for_maps(priv->task);
125 if (!mm) 127 if (!mm || IS_ERR(mm))
126 return NULL; 128 return mm;
127 down_read(&mm->mmap_sem); 129 down_read(&mm->mmap_sem);
128 130
129 tail_vma = get_gate_vma(priv->task); 131 tail_vma = get_gate_vma(priv->task->mm);
130 priv->tail_vma = tail_vma; 132 priv->tail_vma = tail_vma;
131 133
132 /* Start with last addr hint */ 134 /* Start with last addr hint */
@@ -249,8 +251,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
249 const char *name = arch_vma_name(vma); 251 const char *name = arch_vma_name(vma);
250 if (!name) { 252 if (!name) {
251 if (mm) { 253 if (mm) {
252 if (vma->vm_start <= mm->start_brk && 254 if (vma->vm_start <= mm->brk &&
253 vma->vm_end >= mm->brk) { 255 vma->vm_end >= mm->start_brk) {
254 name = "[heap]"; 256 name = "[heap]";
255 } else if (vma->vm_start <= mm->start_stack && 257 } else if (vma->vm_start <= mm->start_stack &&
256 vma->vm_end >= mm->start_stack) { 258 vma->vm_end >= mm->start_stack) {
@@ -277,7 +279,8 @@ static int show_map(struct seq_file *m, void *v)
277 show_map_vma(m, vma); 279 show_map_vma(m, vma);
278 280
279 if (m->count < m->size) /* vma is copied successfully */ 281 if (m->count < m->size) /* vma is copied successfully */
280 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 282 m->version = (vma != get_gate_vma(task->mm))
283 ? vma->vm_start : 0;
281 return 0; 284 return 0;
282} 285}
283 286
@@ -329,58 +332,86 @@ struct mem_size_stats {
329 unsigned long private_dirty; 332 unsigned long private_dirty;
330 unsigned long referenced; 333 unsigned long referenced;
331 unsigned long anonymous; 334 unsigned long anonymous;
335 unsigned long anonymous_thp;
332 unsigned long swap; 336 unsigned long swap;
333 u64 pss; 337 u64 pss;
334}; 338};
335 339
336static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 340
337 struct mm_walk *walk) 341static void smaps_pte_entry(pte_t ptent, unsigned long addr,
342 unsigned long ptent_size, struct mm_walk *walk)
338{ 343{
339 struct mem_size_stats *mss = walk->private; 344 struct mem_size_stats *mss = walk->private;
340 struct vm_area_struct *vma = mss->vma; 345 struct vm_area_struct *vma = mss->vma;
341 pte_t *pte, ptent;
342 spinlock_t *ptl;
343 struct page *page; 346 struct page *page;
344 int mapcount; 347 int mapcount;
345 348
346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 349 if (is_swap_pte(ptent)) {
347 for (; addr != end; pte++, addr += PAGE_SIZE) { 350 mss->swap += ptent_size;
348 ptent = *pte; 351 return;
349 352 }
350 if (is_swap_pte(ptent)) {
351 mss->swap += PAGE_SIZE;
352 continue;
353 }
354 353
355 if (!pte_present(ptent)) 354 if (!pte_present(ptent))
356 continue; 355 return;
356
357 page = vm_normal_page(vma, addr, ptent);
358 if (!page)
359 return;
360
361 if (PageAnon(page))
362 mss->anonymous += ptent_size;
363
364 mss->resident += ptent_size;
365 /* Accumulate the size in pages that have been accessed. */
366 if (pte_young(ptent) || PageReferenced(page))
367 mss->referenced += ptent_size;
368 mapcount = page_mapcount(page);
369 if (mapcount >= 2) {
370 if (pte_dirty(ptent) || PageDirty(page))
371 mss->shared_dirty += ptent_size;
372 else
373 mss->shared_clean += ptent_size;
374 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
375 } else {
376 if (pte_dirty(ptent) || PageDirty(page))
377 mss->private_dirty += ptent_size;
378 else
379 mss->private_clean += ptent_size;
380 mss->pss += (ptent_size << PSS_SHIFT);
381 }
382}
357 383
358 page = vm_normal_page(vma, addr, ptent); 384static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
359 if (!page) 385 struct mm_walk *walk)
360 continue; 386{
387 struct mem_size_stats *mss = walk->private;
388 struct vm_area_struct *vma = mss->vma;
389 pte_t *pte;
390 spinlock_t *ptl;
361 391
362 if (PageAnon(page)) 392 spin_lock(&walk->mm->page_table_lock);
363 mss->anonymous += PAGE_SIZE; 393 if (pmd_trans_huge(*pmd)) {
364 394 if (pmd_trans_splitting(*pmd)) {
365 mss->resident += PAGE_SIZE; 395 spin_unlock(&walk->mm->page_table_lock);
366 /* Accumulate the size in pages that have been accessed. */ 396 wait_split_huge_page(vma->anon_vma, pmd);
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += PAGE_SIZE;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += PAGE_SIZE;
373 else
374 mss->shared_clean += PAGE_SIZE;
375 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
376 } else { 397 } else {
377 if (pte_dirty(ptent) || PageDirty(page)) 398 smaps_pte_entry(*(pte_t *)pmd, addr,
378 mss->private_dirty += PAGE_SIZE; 399 HPAGE_PMD_SIZE, walk);
379 else 400 spin_unlock(&walk->mm->page_table_lock);
380 mss->private_clean += PAGE_SIZE; 401 mss->anonymous_thp += HPAGE_PMD_SIZE;
381 mss->pss += (PAGE_SIZE << PSS_SHIFT); 402 return 0;
382 } 403 }
404 } else {
405 spin_unlock(&walk->mm->page_table_lock);
383 } 406 }
407 /*
408 * The mmap_sem held all the way back in m_start() is what
409 * keeps khugepaged out of here and from collapsing things
410 * in here.
411 */
412 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
413 for (; addr != end; pte++, addr += PAGE_SIZE)
414 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
384 pte_unmap_unlock(pte - 1, ptl); 415 pte_unmap_unlock(pte - 1, ptl);
385 cond_resched(); 416 cond_resched();
386 return 0; 417 return 0;
@@ -416,6 +447,7 @@ static int show_smap(struct seq_file *m, void *v)
416 "Private_Dirty: %8lu kB\n" 447 "Private_Dirty: %8lu kB\n"
417 "Referenced: %8lu kB\n" 448 "Referenced: %8lu kB\n"
418 "Anonymous: %8lu kB\n" 449 "Anonymous: %8lu kB\n"
450 "AnonHugePages: %8lu kB\n"
419 "Swap: %8lu kB\n" 451 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 452 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n" 453 "MMUPageSize: %8lu kB\n"
@@ -429,6 +461,7 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.private_dirty >> 10, 461 mss.private_dirty >> 10,
430 mss.referenced >> 10, 462 mss.referenced >> 10,
431 mss.anonymous >> 10, 463 mss.anonymous >> 10,
464 mss.anonymous_thp >> 10,
432 mss.swap >> 10, 465 mss.swap >> 10,
433 vma_kernel_pagesize(vma) >> 10, 466 vma_kernel_pagesize(vma) >> 10,
434 vma_mmu_pagesize(vma) >> 10, 467 vma_mmu_pagesize(vma) >> 10,
@@ -436,7 +469,8 @@ static int show_smap(struct seq_file *m, void *v)
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 469 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
437 470
438 if (m->count < m->size) /* vma is copied successfully */ 471 if (m->count < m->size) /* vma is copied successfully */
439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 472 m->version = (vma != get_gate_vma(task->mm))
473 ? vma->vm_start : 0;
440 return 0; 474 return 0;
441} 475}
442 476
@@ -467,6 +501,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
467 spinlock_t *ptl; 501 spinlock_t *ptl;
468 struct page *page; 502 struct page *page;
469 503
504 split_huge_page_pmd(walk->mm, pmd);
505
470 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
471 for (; addr != end; pte++, addr += PAGE_SIZE) { 507 for (; addr != end; pte++, addr += PAGE_SIZE) {
472 ptent = *pte; 508 ptent = *pte;
@@ -623,6 +659,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
623 pte_t *pte; 659 pte_t *pte;
624 int err = 0; 660 int err = 0;
625 661
662 split_huge_page_pmd(walk->mm, pmd);
663
626 /* find the first VMA at or above 'addr' */ 664 /* find the first VMA at or above 'addr' */
627 vma = find_vma(walk->mm, addr); 665 vma = find_vma(walk->mm, addr);
628 for (; addr != end; addr += PAGE_SIZE) { 666 for (; addr != end; addr += PAGE_SIZE) {
@@ -728,8 +766,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
728 if (!task) 766 if (!task)
729 goto out; 767 goto out;
730 768
731 ret = -EACCES; 769 mm = mm_for_maps(task);
732 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 770 ret = PTR_ERR(mm);
771 if (!mm || IS_ERR(mm))
733 goto out_task; 772 goto out_task;
734 773
735 ret = -EINVAL; 774 ret = -EINVAL;
@@ -742,10 +781,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
742 if (!count) 781 if (!count)
743 goto out_task; 782 goto out_task;
744 783
745 mm = get_task_mm(task);
746 if (!mm)
747 goto out_task;
748
749 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 784 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
750 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 785 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
751 ret = -ENOMEM; 786 ret = -ENOMEM;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b535d3e5d5f1..980de547c070 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
199 /* pin the task and mm whilst we play with them */ 199 /* pin the task and mm whilst we play with them */
200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
201 if (!priv->task) 201 if (!priv->task)
202 return NULL; 202 return ERR_PTR(-ESRCH);
203 203
204 mm = mm_for_maps(priv->task); 204 mm = mm_for_maps(priv->task);
205 if (!mm) { 205 if (!mm || IS_ERR(mm)) {
206 put_task_struct(priv->task); 206 put_task_struct(priv->task);
207 priv->task = NULL; 207 priv->task = NULL;
208 return NULL; 208 return mm;
209 } 209 }
210 down_read(&mm->mmap_sem); 210 down_read(&mm->mmap_sem);
211 211
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 000000000000..867d0ac026ce
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
1config PSTORE
2 bool "Persistant store support"
3 default n
4 help
5 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can
7 be mounted as /dev/pstore. Only useful if you have
8 a platform level driver that registers with pstore to
9 provide the data, so you probably should just go say "Y"
10 (or "M") to a platform specific persistent store driver
11 (e.g. ACPI_APEI on X86) which will select this for you.
12 If you don't have a platform persistent store driver,
13 say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 000000000000..760f4bce7d1d
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux pstorefs routines.
3#
4
5obj-y += pstore.o
6
7pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 000000000000..977ed2723845
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,311 @@
1/*
2 * Persistent Storage - ramfs parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/fsnotify.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/mount.h>
29#include <linux/ramfs.h>
30#include <linux/parser.h>
31#include <linux/sched.h>
32#include <linux/magic.h>
33#include <linux/pstore.h>
34#include <linux/slab.h>
35#include <linux/uaccess.h>
36
37#include "internal.h"
38
39#define PSTORE_NAMELEN 64
40
41struct pstore_private {
42 u64 id;
43 int (*erase)(u64);
44 ssize_t size;
45 char data[];
46};
47
48static int pstore_file_open(struct inode *inode, struct file *file)
49{
50 file->private_data = inode->i_private;
51 return 0;
52}
53
54static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
55 size_t count, loff_t *ppos)
56{
57 struct pstore_private *ps = file->private_data;
58
59 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
60}
61
62static const struct file_operations pstore_file_operations = {
63 .open = pstore_file_open,
64 .read = pstore_file_read,
65 .llseek = default_llseek,
66};
67
68/*
69 * When a file is unlinked from our file system we call the
70 * platform driver to erase the record from persistent store.
71 */
72static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{
74 struct pstore_private *p = dentry->d_inode->i_private;
75
76 p->erase(p->id);
77
78 return simple_unlink(dir, dentry);
79}
80
81static void pstore_evict_inode(struct inode *inode)
82{
83 end_writeback(inode);
84 kfree(inode->i_private);
85}
86
87static const struct inode_operations pstore_dir_inode_operations = {
88 .lookup = simple_lookup,
89 .unlink = pstore_unlink,
90};
91
92static struct inode *pstore_get_inode(struct super_block *sb,
93 const struct inode *dir, int mode, dev_t dev)
94{
95 struct inode *inode = new_inode(sb);
96
97 if (inode) {
98 inode->i_ino = get_next_ino();
99 inode->i_uid = inode->i_gid = 0;
100 inode->i_mode = mode;
101 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
102 switch (mode & S_IFMT) {
103 case S_IFREG:
104 inode->i_fop = &pstore_file_operations;
105 break;
106 case S_IFDIR:
107 inode->i_op = &pstore_dir_inode_operations;
108 inode->i_fop = &simple_dir_operations;
109 inc_nlink(inode);
110 break;
111 }
112 }
113 return inode;
114}
115
116enum {
117 Opt_kmsg_bytes, Opt_err
118};
119
120static const match_table_t tokens = {
121 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
122 {Opt_err, NULL}
123};
124
125static void parse_options(char *options)
126{
127 char *p;
128 substring_t args[MAX_OPT_ARGS];
129 int option;
130
131 if (!options)
132 return;
133
134 while ((p = strsep(&options, ",")) != NULL) {
135 int token;
136
137 if (!*p)
138 continue;
139
140 token = match_token(p, tokens, args);
141 switch (token) {
142 case Opt_kmsg_bytes:
143 if (!match_int(&args[0], &option))
144 pstore_set_kmsg_bytes(option);
145 break;
146 }
147 }
148}
149
150static int pstore_remount(struct super_block *sb, int *flags, char *data)
151{
152 parse_options(data);
153
154 return 0;
155}
156
157static const struct super_operations pstore_ops = {
158 .statfs = simple_statfs,
159 .drop_inode = generic_delete_inode,
160 .evict_inode = pstore_evict_inode,
161 .remount_fs = pstore_remount,
162 .show_options = generic_show_options,
163};
164
165static struct super_block *pstore_sb;
166
167int pstore_is_mounted(void)
168{
169 return pstore_sb != NULL;
170}
171
172/*
173 * Make a regular file in the root directory of our file system.
174 * Load it up with "size" bytes of data from "buf".
175 * Set the mtime & ctime to the date that this record was originally stored.
176 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size,
179 struct timespec time, int (*erase)(u64))
180{
181 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry;
183 struct inode *inode;
184 int rc;
185 char name[PSTORE_NAMELEN];
186 struct pstore_private *private;
187
188 rc = -ENOMEM;
189 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
190 if (!inode)
191 goto fail;
192 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private)
194 goto fail_alloc;
195 private->id = id;
196 private->erase = erase;
197
198 switch (type) {
199 case PSTORE_TYPE_DMESG:
200 sprintf(name, "dmesg-%s-%lld", psname, id);
201 break;
202 case PSTORE_TYPE_MCE:
203 sprintf(name, "mce-%s-%lld", psname, id);
204 break;
205 case PSTORE_TYPE_UNKNOWN:
206 sprintf(name, "unknown-%s-%lld", psname, id);
207 break;
208 default:
209 sprintf(name, "type%d-%s-%lld", type, psname, id);
210 break;
211 }
212
213 mutex_lock(&root->d_inode->i_mutex);
214
215 rc = -ENOSPC;
216 dentry = d_alloc_name(root, name);
217 if (IS_ERR(dentry))
218 goto fail_lockedalloc;
219
220 memcpy(private->data, data, size);
221 inode->i_size = private->size = size;
222
223 inode->i_private = private;
224
225 if (time.tv_sec)
226 inode->i_mtime = inode->i_ctime = time;
227
228 d_add(dentry, inode);
229
230 mutex_unlock(&root->d_inode->i_mutex);
231
232 return 0;
233
234fail_lockedalloc:
235 mutex_unlock(&root->d_inode->i_mutex);
236 kfree(private);
237fail_alloc:
238 iput(inode);
239
240fail:
241 return rc;
242}
243
244int pstore_fill_super(struct super_block *sb, void *data, int silent)
245{
246 struct inode *inode = NULL;
247 struct dentry *root;
248 int err;
249
250 save_mount_options(sb, data);
251
252 pstore_sb = sb;
253
254 sb->s_maxbytes = MAX_LFS_FILESIZE;
255 sb->s_blocksize = PAGE_CACHE_SIZE;
256 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
257 sb->s_magic = PSTOREFS_MAGIC;
258 sb->s_op = &pstore_ops;
259 sb->s_time_gran = 1;
260
261 parse_options(data);
262
263 inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
264 if (!inode) {
265 err = -ENOMEM;
266 goto fail;
267 }
268 /* override ramfs "dir" options so we catch unlink(2) */
269 inode->i_op = &pstore_dir_inode_operations;
270
271 root = d_alloc_root(inode);
272 sb->s_root = root;
273 if (!root) {
274 err = -ENOMEM;
275 goto fail;
276 }
277
278 pstore_get_records();
279
280 return 0;
281fail:
282 iput(inode);
283 return err;
284}
285
286static struct dentry *pstore_mount(struct file_system_type *fs_type,
287 int flags, const char *dev_name, void *data)
288{
289 return mount_single(fs_type, flags, data, pstore_fill_super);
290}
291
292static void pstore_kill_sb(struct super_block *sb)
293{
294 kill_litter_super(sb);
295 pstore_sb = NULL;
296}
297
298static struct file_system_type pstore_fs_type = {
299 .name = "pstore",
300 .mount = pstore_mount,
301 .kill_sb = pstore_kill_sb,
302};
303
304static int __init init_pstore_fs(void)
305{
306 return register_filesystem(&pstore_fs_type);
307}
308module_init(init_pstore_fs)
309
310MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
311MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 000000000000..8c9f23eb1645
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,6 @@
1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64));
6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 000000000000..f835a25625ff
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,201 @@
1/*
2 * Persistent Storage - platform driver interface parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/atomic.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kmsg_dump.h>
25#include <linux/module.h>
26#include <linux/pstore.h>
27#include <linux/string.h>
28#include <linux/slab.h>
29#include <linux/uaccess.h>
30
31#include "internal.h"
32
33/*
34 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register()
36 */
37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo;
39
40/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240;
42
43void pstore_set_kmsg_bytes(int bytes)
44{
45 kmsg_bytes = bytes;
46}
47
48/* Tag each group of saved records with a sequence number */
49static int oopscount;
50
51static char *reason_str[] = {
52 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
53};
54
55/*
56 * callback from kmsg_dump. (s2,l2) has the most recently
57 * written bytes, older bytes are in (s1,l1). Save as much
58 * as we can from the end of the buffer.
59 */
60static void pstore_dump(struct kmsg_dumper *dumper,
61 enum kmsg_dump_reason reason,
62 const char *s1, unsigned long l1,
63 const char *s2, unsigned long l2)
64{
65 unsigned long s1_start, s2_start;
66 unsigned long l1_cpy, l2_cpy;
67 unsigned long size, total = 0;
68 char *dst, *why;
69 u64 id;
70 int hsize, part = 1;
71
72 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason];
74 else
75 why = "Unknown";
76
77 mutex_lock(&psinfo->buf_mutex);
78 oopscount++;
79 while (total < kmsg_bytes) {
80 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++);
82 size = psinfo->bufsize - hsize;
83 dst += hsize;
84
85 l2_cpy = min(l2, size);
86 l1_cpy = min(l1, size - l2_cpy);
87
88 if (l1_cpy + l2_cpy == 0)
89 break;
90
91 s2_start = l2 - l2_cpy;
92 s1_start = l1 - l1_cpy;
93
94 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase);
102 l1 -= l1_cpy;
103 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy;
105 }
106 mutex_unlock(&psinfo->buf_mutex);
107}
108
109static struct kmsg_dumper pstore_dumper = {
110 .dump = pstore_dump,
111};
112
113/*
114 * platform specific persistent storage driver registers with
115 * us here. If pstore is already mounted, call the platform
116 * read function right away to populate the file system. If not
117 * then the pstore mount code will call us later to fill out
118 * the file system.
119 *
120 * Register with kmsg_dump to save last part of console log on panic.
121 */
122int pstore_register(struct pstore_info *psi)
123{
124 struct module *owner = psi->owner;
125
126 spin_lock(&pstore_lock);
127 if (psinfo) {
128 spin_unlock(&pstore_lock);
129 return -EBUSY;
130 }
131 psinfo = psi;
132 spin_unlock(&pstore_lock);
133
134 if (owner && !try_module_get(owner)) {
135 psinfo = NULL;
136 return -EINVAL;
137 }
138
139 if (pstore_is_mounted())
140 pstore_get_records();
141
142 kmsg_dump_register(&pstore_dumper);
143
144 return 0;
145}
146EXPORT_SYMBOL_GPL(pstore_register);
147
148/*
149 * Read all the records from the persistent store. Create and
150 * file files in our filesystem.
151 */
152void pstore_get_records(void)
153{
154 struct pstore_info *psi = psinfo;
155 size_t size;
156 u64 id;
157 enum pstore_type_id type;
158 struct timespec time;
159 int failed = 0;
160
161 if (!psi)
162 return;
163
164 mutex_lock(&psinfo->buf_mutex);
165 while ((size = psi->read(&id, &type, &time)) > 0) {
166 if (pstore_mkfile(type, psi->name, id, psi->buf, size,
167 time, psi->erase))
168 failed++;
169 }
170 mutex_unlock(&psinfo->buf_mutex);
171
172 if (failed)
173 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
174 failed, psi->name);
175}
176
177/*
178 * Call platform driver to write a record to the
179 * persistent store.
180 */
181int pstore_write(enum pstore_type_id type, char *buf, size_t size)
182{
183 u64 id;
184
185 if (!psinfo)
186 return -ENODEV;
187
188 if (size > psinfo->bufsize)
189 return -EFBIG;
190
191 mutex_lock(&psinfo->buf_mutex);
192 memcpy(psinfo->buf, buf, size);
193 id = psinfo->write(type, size);
194 if (pstore_is_mounted())
195 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
196 size, CURRENT_TIME, psinfo->erase);
197 mutex_unlock(&psinfo->buf_mutex);
198
199 return 0;
200}
201EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
335static const struct address_space_operations qnx4_aops = { 335static const struct address_space_operations qnx4_aops = {
336 .readpage = qnx4_readpage, 336 .readpage = qnx4_readpage,
337 .writepage = qnx4_writepage, 337 .writepage = qnx4_writepage,
338 .sync_page = block_sync_page,
339 .write_begin = qnx4_write_begin, 338 .write_begin = qnx4_write_begin,
340 .write_end = generic_write_end, 339 .write_end = generic_write_end,
341 .bmap = qnx4_bmap 340 .bmap = qnx4_bmap
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..fcc8ae75d874 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
76#include <linux/buffer_head.h> 76#include <linux/buffer_head.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include "../internal.h" /* ugh */
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
@@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
900 int reserved = 0; 900 int reserved = 0;
901#endif 901#endif
902 902
903 spin_lock(&inode_lock); 903 spin_lock(&inode_sb_list_lock);
904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 905 spin_lock(&inode->i_lock);
906 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
907 !atomic_read(&inode->i_writecount) ||
908 !dqinit_needed(inode, type)) {
909 spin_unlock(&inode->i_lock);
906 continue; 910 continue;
911 }
907#ifdef CONFIG_QUOTA_DEBUG 912#ifdef CONFIG_QUOTA_DEBUG
908 if (unlikely(inode_get_rsv_space(inode) > 0)) 913 if (unlikely(inode_get_rsv_space(inode) > 0))
909 reserved = 1; 914 reserved = 1;
910#endif 915#endif
911 if (!atomic_read(&inode->i_writecount))
912 continue;
913 if (!dqinit_needed(inode, type))
914 continue;
915
916 __iget(inode); 916 __iget(inode);
917 spin_unlock(&inode_lock); 917 spin_unlock(&inode->i_lock);
918 spin_unlock(&inode_sb_list_lock);
918 919
919 iput(old_inode); 920 iput(old_inode);
920 __dquot_initialize(inode, type); 921 __dquot_initialize(inode, type);
921 /* We hold a reference to 'inode' so it couldn't have been 922
922 * removed from s_inodes list while we dropped the inode_lock. 923 /*
923 * We cannot iput the inode now as we can be holding the last 924 * We hold a reference to 'inode' so it couldn't have been
924 * reference and we cannot iput it under inode_lock. So we 925 * removed from s_inodes list while we dropped the
925 * keep the reference and iput it later. */ 926 * inode_sb_list_lock We cannot iput the inode now as we can be
927 * holding the last reference and we cannot iput it under
928 * inode_sb_list_lock. So we keep the reference and iput it
929 * later.
930 */
926 old_inode = inode; 931 old_inode = inode;
927 spin_lock(&inode_lock); 932 spin_lock(&inode_sb_list_lock);
928 } 933 }
929 spin_unlock(&inode_lock); 934 spin_unlock(&inode_sb_list_lock);
930 iput(old_inode); 935 iput(old_inode);
931 936
932#ifdef CONFIG_QUOTA_DEBUG 937#ifdef CONFIG_QUOTA_DEBUG
@@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1007 struct inode *inode; 1012 struct inode *inode;
1008 int reserved = 0; 1013 int reserved = 0;
1009 1014
1010 spin_lock(&inode_lock); 1015 spin_lock(&inode_sb_list_lock);
1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1016 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1012 /* 1017 /*
1013 * We have to scan also I_NEW inodes because they can already 1018 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1021 remove_inode_dquot_ref(inode, type, tofree_head); 1026 remove_inode_dquot_ref(inode, type, tofree_head);
1022 } 1027 }
1023 } 1028 }
1024 spin_unlock(&inode_lock); 1029 spin_unlock(&inode_sb_list_lock);
1025#ifdef CONFIG_QUOTA_DEBUG 1030#ifdef CONFIG_QUOTA_DEBUG
1026 if (reserved) { 1031 if (reserved) {
1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota" 1032 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
112 if (!info->dqi_priv) { 112 if (!info->dqi_priv) {
113 printk(KERN_WARNING 113 printk(KERN_WARNING
114 "Not enough memory for quota information structure.\n"); 114 "Not enough memory for quota information structure.\n");
115 return -1; 115 return -ENOMEM;
116 } 116 }
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 792b3cb2cd18..3c3b00165114 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -31,9 +31,7 @@ endif
31# and causing a panic. Since this behavior only affects ppc32, this ifeq 31# and causing a panic. Since this behavior only affects ppc32, this ifeq
32# will work around it. If any other architecture displays this behavior, 32# will work around it. If any other architecture displays this behavior,
33# add it here. 33# add it here.
34ifeq ($(CONFIG_PPC32),y) 34ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
35EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
36endif
37 35
38TAGS: 36TAGS:
39 etags *.c 37 etags *.c
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..4fd5bb33dbb5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -3212,7 +3217,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
3212 .readpages = reiserfs_readpages, 3217 .readpages = reiserfs_readpages,
3213 .releasepage = reiserfs_releasepage, 3218 .releasepage = reiserfs_releasepage,
3214 .invalidatepage = reiserfs_invalidatepage, 3219 .invalidatepage = reiserfs_invalidatepage,
3215 .sync_page = block_sync_page,
3216 .write_begin = reiserfs_write_begin, 3220 .write_begin = reiserfs_write_begin,
3217 .write_end = reiserfs_write_end, 3221 .write_end = reiserfs_write_end,
3218 .bmap = reiserfs_aop_bmap, 3222 .bmap = reiserfs_aop_bmap,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 79265fdc317a..4e153051bc75 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
59 if (err) 59 if (err)
60 break; 60 break;
61 61
62 if (!is_owner_or_cap(inode)) { 62 if (!inode_owner_or_capable(inode)) {
63 err = -EPERM; 63 err = -EPERM;
64 goto setflags_out; 64 goto setflags_out;
65 } 65 }
@@ -103,7 +103,7 @@ setflags_out:
103 err = put_user(inode->i_generation, (int __user *)arg); 103 err = put_user(inode->i_generation, (int __user *)arg);
104 break; 104 break;
105 case REISERFS_IOC_SETVERSION: 105 case REISERFS_IOC_SETVERSION:
106 if (!is_owner_or_cap(inode)) { 106 if (!inode_owner_or_capable(inode)) {
107 err = -EPERM; 107 err = -EPERM;
108 break; 108 break;
109 } 109 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e6990..c77514bd5776 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2876 reiserfs_mounted_fs_count++; 2876 reiserfs_mounted_fs_count++;
2877 if (reiserfs_mounted_fs_count <= 1) { 2877 if (reiserfs_mounted_fs_count <= 1) {
2878 reiserfs_write_unlock(sb); 2878 reiserfs_write_unlock(sb);
2879 commit_wq = create_workqueue("reiserfs"); 2879 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880 reiserfs_write_lock(sb); 2880 reiserfs_write_lock(sb);
2881 } 2881 }
2882 2882
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
593 new_inode_init(inode, dir, mode); 593 new_inode_init(inode, dir, mode);
594 594
595 jbegin_count += reiserfs_cache_default_acl(dir); 595 jbegin_count += reiserfs_cache_default_acl(dir);
596 retval = reiserfs_security_init(dir, inode, &security); 596 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
597 if (retval < 0) { 597 if (retval < 0) {
598 drop_new_inode(inode); 598 drop_new_inode(inode);
599 return retval; 599 return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
667 new_inode_init(inode, dir, mode); 667 new_inode_init(inode, dir, mode);
668 668
669 jbegin_count += reiserfs_cache_default_acl(dir); 669 jbegin_count += reiserfs_cache_default_acl(dir);
670 retval = reiserfs_security_init(dir, inode, &security); 670 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
671 if (retval < 0) { 671 if (retval < 0) {
672 drop_new_inode(inode); 672 drop_new_inode(inode);
673 return retval; 673 return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
747 new_inode_init(inode, dir, mode); 747 new_inode_init(inode, dir, mode);
748 748
749 jbegin_count += reiserfs_cache_default_acl(dir); 749 jbegin_count += reiserfs_cache_default_acl(dir);
750 retval = reiserfs_security_init(dir, inode, &security); 750 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
751 if (retval < 0) { 751 if (retval < 0) {
752 drop_new_inode(inode); 752 drop_new_inode(inode);
753 return retval; 753 return retval;
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
772 dentry, inode, &security); 772 dentry, inode, &security);
773 if (retval) { 773 if (retval) {
774 dir->i_nlink--; 774 DEC_DIR_INODE_NLINK(dir)
775 goto out_failed; 775 goto out_failed;
776 } 776 }
777 777
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1032 } 1032 }
1033 new_inode_init(inode, parent_dir, mode); 1033 new_inode_init(inode, parent_dir, mode);
1034 1034
1035 retval = reiserfs_security_init(parent_dir, inode, &security); 1035 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1036 &security);
1036 if (retval < 0) { 1037 if (retval < 0) {
1037 drop_new_inode(inode); 1038 drop_new_inode(inode);
1038 return retval; 1039 return retval;
@@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1123 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1124 return -EMLINK;
1124 } 1125 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1126
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1127 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1128 inc_nlink(inode);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..5c11ca82b782 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
978 978
979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
983 return -EPERM; 981 return -EPERM;
984} 982}
985 983
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 90d2fcb67a31..3dc38f1206fc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
26 size_t jcreate_blocks; 26 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 27 if (!reiserfs_posixacl(inode->i_sb))
28 return -EOPNOTSUPP; 28 return -EOPNOTSUPP;
29 if (!is_owner_or_cap(inode)) 29 if (!inode_owner_or_capable(inode))
30 return -EPERM; 30 return -EPERM;
31 31
32 if (value) { 32 if (value) {
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c6..ef66c18a9332 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
54 * of blocks needed for the transaction. If successful, reiserfs_security 54 * of blocks needed for the transaction. If successful, reiserfs_security
55 * must be released using reiserfs_security_free when the caller is done. */ 55 * must be released using reiserfs_security_free when the caller is done. */
56int reiserfs_security_init(struct inode *dir, struct inode *inode, 56int reiserfs_security_init(struct inode *dir, struct inode *inode,
57 const struct qstr *qstr,
57 struct reiserfs_security_handle *sec) 58 struct reiserfs_security_handle *sec)
58{ 59{
59 int blocks = 0; 60 int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
65 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
66 return 0; 67 return 0;
67 68
68 error = security_inode_init_security(inode, dir, &sec->name, 69 error = security_inode_init_security(inode, dir, qstr, &sec->name,
69 &sec->value, &sec->length); 70 &sec->value, &sec->length);
70 if (error) { 71 if (error) {
71 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
diff --git a/fs/select.c b/fs/select.c
index e56560d2b08a..d33418fdc858 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -517,9 +517,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
517 * Update: ERESTARTSYS breaks at least the xview clock binary, so 517 * Update: ERESTARTSYS breaks at least the xview clock binary, so
518 * I'm trying ERESTARTNOHAND which restart only when you want to. 518 * I'm trying ERESTARTNOHAND which restart only when you want to.
519 */ 519 */
520#define MAX_SELECT_SECONDS \
521 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
522
523int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, 520int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
524 fd_set __user *exp, struct timespec *end_time) 521 fd_set __user *exp, struct timespec *end_time)
525{ 522{
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index aa68a8a31518..efc309fa3035 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,12 +5,12 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib/lzo compression to compress both 8 filesystem for Linux. It uses zlib, lzo or xz compression to
9 files, inodes and directories. Inodes in the system are very small 9 compress both files, inodes and directories. Inodes in the system
10 and all blocks are packed to minimise data overhead. Block sizes 10 are very small and all blocks are packed to minimise data overhead.
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 (default block size 128K). SquashFS 4.0 supports 64 bit filesystems
13 (larger than 4GB), full uid/gid information, hard links and 13 and files (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2fb2882f0fa7..8ab48bc2fa7d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -63,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
63 *length = (unsigned char) bh->b_data[*offset] | 63 *length = (unsigned char) bh->b_data[*offset] |
64 (unsigned char) bh->b_data[*offset + 1] << 8; 64 (unsigned char) bh->b_data[*offset + 1] << 8;
65 *offset += 2; 65 *offset += 2;
66
67 if (*offset == msblk->devblksize) {
68 put_bh(bh);
69 bh = sb_bread(sb, ++(*cur_index));
70 if (bh == NULL)
71 return NULL;
72 *offset = 0;
73 }
66 } 74 }
67 75
68 return bh; 76 return bh;
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index a5940e54c4dd..e921bd213738 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h>
26#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
27 28
28#include "squashfs_fs.h" 29#include "squashfs_fs.h"
@@ -74,3 +75,36 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
74 75
75 return decompressor[i]; 76 return decompressor[i];
76} 77}
78
79
80void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
81{
82 struct squashfs_sb_info *msblk = sb->s_fs_info;
83 void *strm, *buffer = NULL;
84 int length = 0;
85
86 /*
87 * Read decompressor specific options from file system if present
88 */
89 if (SQUASHFS_COMP_OPTS(flags)) {
90 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
91 if (buffer == NULL)
92 return ERR_PTR(-ENOMEM);
93
94 length = squashfs_read_data(sb, &buffer,
95 sizeof(struct squashfs_super_block), 0, NULL,
96 PAGE_CACHE_SIZE, 1);
97
98 if (length < 0) {
99 strm = ERR_PTR(length);
100 goto finished;
101 }
102 }
103
104 strm = msblk->decompressor->init(msblk, buffer, length);
105
106finished:
107 kfree(buffer);
108
109 return strm;
110}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 3b305a70f7aa..099745ad5691 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26struct squashfs_decompressor { 26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *); 27 void *(*init)(struct squashfs_sb_info *, void *, int);
28 void (*free)(void *); 28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **, 29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int); 30 struct buffer_head **, int, int, int, int, int);
@@ -33,11 +33,6 @@ struct squashfs_decompressor {
33 int supported; 33 int supported;
34}; 34};
35 35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, 36static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s) 37 void *s)
43{ 38{
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 0dc340aa2be9..3f79cd1d0c19 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
172 length += sizeof(dirh); 172 length += sizeof(dirh);
173 173
174 dir_count = le32_to_cpu(dirh.count) + 1; 174 dir_count = le32_to_cpu(dirh.count) + 1;
175
176 /* dir_count should never be larger than 256 */
177 if (dir_count > 256)
178 goto failed_read;
179
175 while (dir_count--) { 180 while (dir_count--) {
176 /* 181 /*
177 * Read directory entry. 182 * Read directory entry.
@@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
183 188
184 size = le16_to_cpu(dire->size) + 1; 189 size = le16_to_cpu(dire->size) + 1;
185 190
191 /* size should never be larger than SQUASHFS_NAME_LEN */
192 if (size > SQUASHFS_NAME_LEN)
193 goto failed_read;
194
186 err = squashfs_read_metadata(inode->i_sb, dire->name, 195 err = squashfs_read_metadata(inode->i_sb, dire->name,
187 &block, &offset, size); 196 &block, &offset, size);
188 if (err < 0) 197 if (err < 0)
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 7da759e34c52..00f4dfc5f088 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
37 void *output; 37 void *output;
38}; 38};
39 39
40static void *lzo_init(struct squashfs_sb_info *msblk) 40static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
41{ 41{
42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
43 43
@@ -58,7 +58,7 @@ failed2:
58failed: 58failed:
59 ERROR("Failed to allocate lzo workspace\n"); 59 ERROR("Failed to allocate lzo workspace\n");
60 kfree(stream); 60 kfree(stream);
61 return NULL; 61 return ERR_PTR(-ENOMEM);
62} 62}
63 63
64 64
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7a9464d08cf6..5d922a6701ab 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
176 length += sizeof(dirh); 176 length += sizeof(dirh);
177 177
178 dir_count = le32_to_cpu(dirh.count) + 1; 178 dir_count = le32_to_cpu(dirh.count) + 1;
179
180 /* dir_count should never be larger than 256 */
181 if (dir_count > 256)
182 goto data_error;
183
179 while (dir_count--) { 184 while (dir_count--) {
180 /* 185 /*
181 * Read directory entry. 186 * Read directory entry.
@@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
187 192
188 size = le16_to_cpu(dire->size) + 1; 193 size = le16_to_cpu(dire->size) + 1;
189 194
195 /* size should never be larger than SQUASHFS_NAME_LEN */
196 if (size > SQUASHFS_NAME_LEN)
197 goto data_error;
198
190 err = squashfs_read_metadata(dir->i_sb, dire->name, 199 err = squashfs_read_metadata(dir->i_sb, dire->name,
191 &block, &offset, size); 200 &block, &offset, size);
192 if (err < 0) 201 if (err < 0)
@@ -228,6 +237,9 @@ exit_lookup:
228 d_add(dentry, inode); 237 d_add(dentry, inode);
229 return ERR_PTR(0); 238 return ERR_PTR(0);
230 239
240data_error:
241 err = -EIO;
242
231read_failure: 243read_failure:
232 ERROR("Unable to read directory block [%llx:%x]\n", 244 ERROR("Unable to read directory block [%llx:%x]\n",
233 squashfs_i(dir)->start + msblk->directory_table, 245 squashfs_i(dir)->start + msblk->directory_table,
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba729d808876..1f2e608b8785 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,6 +48,7 @@ extern int squashfs_read_table(struct super_block *, void *, u64, int);
48 48
49/* decompressor.c */ 49/* decompressor.c */
50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); 50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
51extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
51 52
52/* export.c */ 53/* export.c */
53extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 54extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 39533feffd6d..4582c568ef4d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -57,6 +57,7 @@
57#define SQUASHFS_ALWAYS_FRAG 5 57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6 58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7 59#define SQUASHFS_EXPORT 7
60#define SQUASHFS_COMP_OPT 10
60 61
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) 62#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62 63
@@ -81,6 +82,9 @@
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ 82#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT) 83 SQUASHFS_EXPORT)
83 84
85#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \
86 SQUASHFS_COMP_OPT)
87
84/* Max number of types and file types */ 88/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1 89#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2 90#define SQUASHFS_REG_TYPE 2
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 20700b9f2b4c..5c8184c061a4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -199,10 +199,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
199 199
200 err = -ENOMEM; 200 err = -ENOMEM;
201 201
202 msblk->stream = squashfs_decompressor_init(msblk);
203 if (msblk->stream == NULL)
204 goto failed_mount;
205
206 msblk->block_cache = squashfs_cache_init("metadata", 202 msblk->block_cache = squashfs_cache_init("metadata",
207 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 203 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
208 if (msblk->block_cache == NULL) 204 if (msblk->block_cache == NULL)
@@ -215,6 +211,13 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
215 goto failed_mount; 211 goto failed_mount;
216 } 212 }
217 213
214 msblk->stream = squashfs_decompressor_init(sb, flags);
215 if (IS_ERR(msblk->stream)) {
216 err = PTR_ERR(msblk->stream);
217 msblk->stream = NULL;
218 goto failed_mount;
219 }
220
218 /* Allocate and read id index table */ 221 /* Allocate and read id index table */
219 msblk->id_table = squashfs_read_id_index_table(sb, 222 msblk->id_table = squashfs_read_id_index_table(sb,
220 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); 223 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
@@ -370,8 +373,8 @@ static void squashfs_put_super(struct super_block *sb)
370} 373}
371 374
372 375
373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, 376static struct dentry *squashfs_mount(struct file_system_type *fs_type,
374 const char *dev_name, void *data) 377 int flags, const char *dev_name, void *data)
375{ 378{
376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); 379 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
377} 380}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 856756ca5ee4..aa47a286d1f8 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -26,10 +26,10 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/xz.h> 28#include <linux/xz.h>
29#include <linux/bitops.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35 35
@@ -38,24 +38,57 @@ struct squashfs_xz {
38 struct xz_buf buf; 38 struct xz_buf buf;
39}; 39};
40 40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk) 41struct comp_opts {
42 __le32 dictionary_size;
43 __le32 flags;
44};
45
46static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
47 int len)
42{ 48{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 49 struct comp_opts *comp_opts = buff;
50 struct squashfs_xz *stream;
51 int dict_size = msblk->block_size;
52 int err, n;
53
54 if (comp_opts) {
55 /* check compressor options are the expected length */
56 if (len < sizeof(*comp_opts)) {
57 err = -EIO;
58 goto failed;
59 }
60
61 dict_size = le32_to_cpu(comp_opts->dictionary_size);
62
63 /* the dictionary size should be 2^n or 2^n+2^(n+1) */
64 n = ffs(dict_size) - 1;
65 if (dict_size != (1 << n) && dict_size != (1 << n) +
66 (1 << (n + 1))) {
67 err = -EIO;
68 goto failed;
69 }
70 }
71
72 dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
44 73
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); 74 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL) 75 if (stream == NULL) {
76 err = -ENOMEM;
47 goto failed; 77 goto failed;
78 }
48 79
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size); 80 stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
50 if (stream->state == NULL) 81 if (stream->state == NULL) {
82 kfree(stream);
83 err = -ENOMEM;
51 goto failed; 84 goto failed;
85 }
52 86
53 return stream; 87 return stream;
54 88
55failed: 89failed:
56 ERROR("Failed to allocate xz workspace\n"); 90 ERROR("Failed to initialise xz decompressor\n");
57 kfree(stream); 91 return ERR_PTR(err);
58 return NULL;
59} 92}
60 93
61 94
@@ -95,12 +128,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
95 if (!buffer_uptodate(bh[k])) 128 if (!buffer_uptodate(bh[k]))
96 goto release_mutex; 129 goto release_mutex;
97 130
98 if (avail == 0) {
99 offset = 0;
100 put_bh(bh[k++]);
101 continue;
102 }
103
104 stream->buf.in = bh[k]->b_data + offset; 131 stream->buf.in = bh[k]->b_data + offset;
105 stream->buf.in_size = avail; 132 stream->buf.in_size = avail;
106 stream->buf.in_pos = 0; 133 stream->buf.in_pos = 0;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 818a5e063faf..517688b32ffa 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -26,19 +26,19 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/zlib.h> 28#include <linux/zlib.h>
29#include <linux/vmalloc.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs.h" 33#include "squashfs.h"
33#include "decompressor.h" 34#include "decompressor.h"
34 35
35static void *zlib_init(struct squashfs_sb_info *dummy) 36static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
36{ 37{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); 38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL) 39 if (stream == NULL)
39 goto failed; 40 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(), 41 stream->workspace = vmalloc(zlib_inflate_workspacesize());
41 GFP_KERNEL);
42 if (stream->workspace == NULL) 42 if (stream->workspace == NULL)
43 goto failed; 43 goto failed;
44 44
@@ -47,7 +47,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy)
47failed: 47failed:
48 ERROR("Failed to allocate zlib workspace\n"); 48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream); 49 kfree(stream);
50 return NULL; 50 return ERR_PTR(-ENOMEM);
51} 51}
52 52
53 53
@@ -56,7 +56,7 @@ static void zlib_free(void *strm)
56 z_stream *stream = strm; 56 z_stream *stream = strm;
57 57
58 if (stream) 58 if (stream)
59 kfree(stream->workspace); 59 vfree(stream->workspace);
60 kfree(stream); 60 kfree(stream);
61} 61}
62 62
@@ -82,12 +82,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
82 if (!buffer_uptodate(bh[k])) 82 if (!buffer_uptodate(bh[k]))
83 goto release_mutex; 83 goto release_mutex;
84 84
85 if (avail == 0) {
86 offset = 0;
87 put_bh(bh[k++]);
88 continue;
89 }
90
91 stream->next_in = bh[k]->b_data + offset; 85 stream->next_in = bh[k]->b_data + offset;
92 stream->avail_in = avail; 86 stream->avail_in = avail;
93 offset = 0; 87 offset = 0;
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..8a06881b1920 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71#else 71#else
72 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
73#endif 73#endif
74 s->s_bdi = &default_backing_dev_info;
74 INIT_LIST_HEAD(&s->s_instances); 75 INIT_LIST_HEAD(&s->s_instances);
75 INIT_HLIST_BL_HEAD(&s->s_anon); 76 INIT_HLIST_BL_HEAD(&s->s_anon);
76 INIT_LIST_HEAD(&s->s_inodes); 77 INIT_LIST_HEAD(&s->s_inodes);
@@ -177,6 +178,11 @@ void deactivate_locked_super(struct super_block *s)
177 struct file_system_type *fs = s->s_type; 178 struct file_system_type *fs = s->s_type;
178 if (atomic_dec_and_test(&s->s_active)) { 179 if (atomic_dec_and_test(&s->s_active)) {
179 fs->kill_sb(s); 180 fs->kill_sb(s);
181 /*
182 * We need to call rcu_barrier so all the delayed rcu free
183 * inodes are flushed before we release the fs module.
184 */
185 rcu_barrier();
180 put_filesystem(fs); 186 put_filesystem(fs);
181 put_super(s); 187 put_super(s);
182 } else { 188 } else {
@@ -838,23 +844,6 @@ error:
838} 844}
839EXPORT_SYMBOL(mount_bdev); 845EXPORT_SYMBOL(mount_bdev);
840 846
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
854}
855
856EXPORT_SYMBOL(get_sb_bdev);
857
858void kill_block_super(struct super_block *sb) 847void kill_block_super(struct super_block *sb)
859{ 848{
860 struct block_device *bdev = sb->s_bdev; 849 struct block_device *bdev = sb->s_bdev;
@@ -892,22 +881,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
892} 881}
893EXPORT_SYMBOL(mount_nodev); 882EXPORT_SYMBOL(mount_nodev);
894 883
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
909EXPORT_SYMBOL(get_sb_nodev);
910
911static int compare_single(struct super_block *s, void *p) 884static int compare_single(struct super_block *s, void *p)
912{ 885{
913 return 1; 886 return 1;
@@ -938,69 +911,36 @@ struct dentry *mount_single(struct file_system_type *fs_type,
938} 911}
939EXPORT_SYMBOL(mount_single); 912EXPORT_SYMBOL(mount_single);
940 913
941int get_sb_single(struct file_system_type *fs_type, 914struct dentry *
942 int flags, void *data, 915mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
952 return 0;
953}
954
955EXPORT_SYMBOL(get_sb_single);
956
957struct vfsmount *
958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
959{ 916{
960 struct vfsmount *mnt;
961 struct dentry *root; 917 struct dentry *root;
918 struct super_block *sb;
962 char *secdata = NULL; 919 char *secdata = NULL;
963 int error; 920 int error = -ENOMEM;
964
965 if (!type)
966 return ERR_PTR(-ENODEV);
967
968 error = -ENOMEM;
969 mnt = alloc_vfsmnt(name);
970 if (!mnt)
971 goto out;
972
973 if (flags & MS_KERNMOUNT)
974 mnt->mnt_flags = MNT_INTERNAL;
975 921
976 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 922 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
977 secdata = alloc_secdata(); 923 secdata = alloc_secdata();
978 if (!secdata) 924 if (!secdata)
979 goto out_mnt; 925 goto out;
980 926
981 error = security_sb_copy_data(data, secdata); 927 error = security_sb_copy_data(data, secdata);
982 if (error) 928 if (error)
983 goto out_free_secdata; 929 goto out_free_secdata;
984 } 930 }
985 931
986 if (type->mount) { 932 root = type->mount(type, flags, name, data);
987 root = type->mount(type, flags, name, data); 933 if (IS_ERR(root)) {
988 if (IS_ERR(root)) { 934 error = PTR_ERR(root);
989 error = PTR_ERR(root); 935 goto out_free_secdata;
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 } 936 }
999 BUG_ON(!mnt->mnt_sb); 937 sb = root->d_sb;
1000 WARN_ON(!mnt->mnt_sb->s_bdi); 938 BUG_ON(!sb);
1001 mnt->mnt_sb->s_flags |= MS_BORN; 939 WARN_ON(!sb->s_bdi);
940 WARN_ON(sb->s_bdi == &default_backing_dev_info);
941 sb->s_flags |= MS_BORN;
1002 942
1003 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 943 error = security_sb_kern_mount(sb, flags, secdata);
1004 if (error) 944 if (error)
1005 goto out_sb; 945 goto out_sb;
1006 946
@@ -1011,27 +951,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
1011 * violate this rule. This warning should be either removed or 951 * violate this rule. This warning should be either removed or
1012 * converted to a BUG() in 2.6.34. 952 * converted to a BUG() in 2.6.34.
1013 */ 953 */
1014 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " 954 WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1015 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); 955 "negative value (%lld)\n", type->name, sb->s_maxbytes);
1016 956
1017 mnt->mnt_mountpoint = mnt->mnt_root; 957 up_write(&sb->s_umount);
1018 mnt->mnt_parent = mnt;
1019 up_write(&mnt->mnt_sb->s_umount);
1020 free_secdata(secdata); 958 free_secdata(secdata);
1021 return mnt; 959 return root;
1022out_sb: 960out_sb:
1023 dput(mnt->mnt_root); 961 dput(root);
1024 deactivate_locked_super(mnt->mnt_sb); 962 deactivate_locked_super(sb);
1025out_free_secdata: 963out_free_secdata:
1026 free_secdata(secdata); 964 free_secdata(secdata);
1027out_mnt:
1028 free_vfsmnt(mnt);
1029out: 965out:
1030 return ERR_PTR(error); 966 return ERR_PTR(error);
1031} 967}
1032 968
1033EXPORT_SYMBOL_GPL(vfs_kern_mount);
1034
1035/** 969/**
1036 * freeze_super - lock the filesystem and force it into a consistent state 970 * freeze_super - lock the filesystem and force it into a consistent state
1037 * @sb: the super to lock 971 * @sb: the super to lock
@@ -1121,49 +1055,3 @@ out:
1121 return 0; 1055 return 0;
1122} 1056}
1123EXPORT_SYMBOL(thaw_super); 1057EXPORT_SYMBOL(thaw_super);
1124
1125static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1126{
1127 int err;
1128 const char *subtype = strchr(fstype, '.');
1129 if (subtype) {
1130 subtype++;
1131 err = -EINVAL;
1132 if (!subtype[0])
1133 goto err;
1134 } else
1135 subtype = "";
1136
1137 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1138 err = -ENOMEM;
1139 if (!mnt->mnt_sb->s_subtype)
1140 goto err;
1141 return mnt;
1142
1143 err:
1144 mntput(mnt);
1145 return ERR_PTR(err);
1146}
1147
1148struct vfsmount *
1149do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1150{
1151 struct file_system_type *type = get_fs_type(fstype);
1152 struct vfsmount *mnt;
1153 if (!type)
1154 return ERR_PTR(-ENODEV);
1155 mnt = vfs_kern_mount(type, flags, name, data);
1156 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1157 !mnt->mnt_sb->s_subtype)
1158 mnt = fs_set_subtype(mnt, fstype);
1159 put_filesystem(type);
1160 return mnt;
1161}
1162EXPORT_SYMBOL_GPL(do_kern_mount);
1163
1164struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
1165{
1166 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
1167}
1168
1169EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..c38ec163da6c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/namei.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/writeback.h> 12#include <linux/writeback.h>
12#include <linux/syscalls.h> 13#include <linux/syscalls.h>
@@ -33,7 +34,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
33 * This should be safe, as we require bdi backing to actually 34 * This should be safe, as we require bdi backing to actually
34 * write out data in the first place 35 * write out data in the first place
35 */ 36 */
36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) 37 if (sb->s_bdi == &noop_backing_dev_info)
37 return 0; 38 return 0;
38 39
39 if (sb->s_qcop && sb->s_qcop->quota_sync) 40 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
79 80
80static void sync_one_sb(struct super_block *sb, void *arg) 81static void sync_one_sb(struct super_block *sb, void *arg)
81{ 82{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) 83 if (!(sb->s_flags & MS_RDONLY))
83 __sync_filesystem(sb, *(int *)arg); 84 __sync_filesystem(sb, *(int *)arg);
84} 85}
85/* 86/*
@@ -128,6 +129,29 @@ void emergency_sync(void)
128 } 129 }
129} 130}
130 131
132/*
133 * sync a single super
134 */
135SYSCALL_DEFINE1(syncfs, int, fd)
136{
137 struct file *file;
138 struct super_block *sb;
139 int ret;
140 int fput_needed;
141
142 file = fget_light(fd, &fput_needed);
143 if (!file)
144 return -EBADF;
145 sb = file->f_dentry->d_sb;
146
147 down_read(&sb->s_umount);
148 ret = sync_filesystem(sb);
149 up_read(&sb->s_umount);
150
151 fput_light(file, fput_needed);
152 return ret;
153}
154
131/** 155/**
132 * vfs_fsync_range - helper to sync a range of data & metadata to disk 156 * vfs_fsync_range - helper to sync a range of data & metadata to disk
133 * @file: file to sync 157 * @file: file to sync
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
488const struct address_space_operations sysv_aops = { 488const struct address_space_operations sysv_aops = {
489 .readpage = sysv_readpage, 489 .readpage = sysv_readpage,
490 .writepage = sysv_writepage, 490 .writepage = sysv_writepage,
491 .sync_page = block_sync_page,
492 .write_begin = sysv_write_begin, 491 .write_begin = sysv_write_begin,
493 .write_end = generic_write_end, 492 .write_end = generic_write_end,
494 .bmap = sysv_bmap 493 .bmap = sysv_bmap
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
245 new_de = sysv_find_entry(new_dentry, &new_page); 245 new_de = sysv_find_entry(new_dentry, &new_page);
246 if (!new_de) 246 if (!new_de)
247 goto out_dir; 247 goto out_dir;
248 inode_inc_link_count(old_inode);
249 sysv_set_link(new_de, new_page, old_inode); 248 sysv_set_link(new_de, new_page, old_inode);
250 new_inode->i_ctime = CURRENT_TIME_SEC; 249 new_inode->i_ctime = CURRENT_TIME_SEC;
251 if (dir_de) 250 if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
257 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) 256 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
258 goto out_dir; 257 goto out_dir;
259 } 258 }
260 inode_inc_link_count(old_inode);
261 err = sysv_add_link(new_dentry, old_inode); 259 err = sysv_add_link(new_dentry, old_inode);
262 if (err) { 260 if (err)
263 inode_dec_link_count(old_inode);
264 goto out_dir; 261 goto out_dir;
265 }
266 if (dir_de) 262 if (dir_de)
267 inode_inc_link_count(new_dir); 263 inode_inc_link_count(new_dir);
268 } 264 }
269 265
270 sysv_delete_entry(old_de, old_page); 266 sysv_delete_entry(old_de, old_page);
271 inode_dec_link_count(old_inode); 267 mark_inode_dirty(old_inode);
272 268
273 if (dir_de) { 269 if (dir_de) {
274 sysv_set_link(dir_de, dir_page, new_dir); 270 sysv_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f442..d7440904be17 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,29 +44,17 @@ config UBIFS_FS_ZLIB
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
47 bool "Enable debugging" 47 bool "Enable debugging support"
48 depends on UBIFS_FS 48 depends on UBIFS_FS
49 select DEBUG_FS 49 select DEBUG_FS
50 select KALLSYMS_ALL 50 select KALLSYMS_ALL
51 help 51 help
52 This option enables UBIFS debugging. 52 This option enables UBIFS debugging support. It makes sure various
53 53 assertions, self-checks, debugging messages and test modes are compiled
54config UBIFS_FS_DEBUG_MSG_LVL 54 in (this all is compiled out otherwise). Assertions are light-weight
55 int "Default message level (0 = no extra messages, 3 = lots)" 55 and this option also enables them. Self-checks, debugging messages and
56 depends on UBIFS_FS_DEBUG 56 test modes are switched off by default. Thus, it is safe and actually
57 default "0" 57 recommended to have debugging support enabled, and it should not slow
58 help 58 down UBIFS. You can then further enable / disable individual debugging
59 This controls the amount of debugging messages produced by UBIFS. 59 features using UBIFS module parameters and the corresponding sysfs
60 If reporting bugs, please try to have available a full dump of the 60 interfaces.
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca33..b148fbc80f8d 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include "ubifs.h" 49#include "ubifs.h"
50 50
51/*
52 * nothing_to_commit - check if there is nothing to commit.
53 * @c: UBIFS file-system description object
54 *
55 * This is a helper function which checks if there is anything to commit. It is
56 * used as an optimization to avoid starting the commit if it is not really
57 * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
58 * writing the commit start node to the log), and it is better to avoid doing
59 * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
60 * nothing to commit, it is more optimal to avoid any flash I/O.
61 *
62 * This function has to be called with @c->commit_sem locked for writing -
63 * this function does not take LPT/TNC locks because the @c->commit_sem
64 * guarantees that we have exclusive access to the TNC and LPT data structures.
65 *
66 * This function returns %1 if there is nothing to commit and %0 otherwise.
67 */
68static int nothing_to_commit(struct ubifs_info *c)
69{
70 /*
71 * During mounting or remounting from R/O mode to R/W mode we may
72 * commit for various recovery-related reasons.
73 */
74 if (c->mounting || c->remounting_rw)
75 return 0;
76
77 /*
78 * If the root TNC node is dirty, we definitely have something to
79 * commit.
80 */
81 if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
82 return 0;
83
84 /*
85 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
86 * example, this may happen if the budgeting subsystem invoked GC to
87 * make some free space, and the GC found an LEB with only dirty and
88 * free space. In this case GC would just change the lprops of this
89 * LEB (by turning all space into free space) and unmap it.
90 */
91 if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
92 return 0;
93
94 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
95 ubifs_assert(c->dirty_pn_cnt == 0);
96 ubifs_assert(c->dirty_nn_cnt == 0);
97
98 return 1;
99}
100
51/** 101/**
52 * do_commit - commit the journal. 102 * do_commit - commit the journal.
53 * @c: UBIFS file-system description object 103 * @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
70 goto out_up; 120 goto out_up;
71 } 121 }
72 122
123 if (nothing_to_commit(c)) {
124 up_write(&c->commit_sem);
125 err = 0;
126 goto out_cancel;
127 }
128
73 /* Sync all write buffers (necessary for recovery) */ 129 /* Sync all write buffers (necessary for recovery) */
74 for (i = 0; i < c->jhead_cnt; i++) { 130 for (i = 0; i < c->jhead_cnt; i++) {
75 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 131 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
162 if (err) 218 if (err)
163 goto out; 219 goto out;
164 220
221out_cancel:
165 spin_lock(&c->cs_lock); 222 spin_lock(&c->cs_lock);
166 c->cmt_state = COMMIT_RESTING; 223 c->cmt_state = COMMIT_RESTING;
167 wake_up(&c->cmt_wq); 224 wake_up(&c->cmt_wq);
168 dbg_cmt("commit end"); 225 dbg_cmt("commit end");
169 spin_unlock(&c->cs_lock); 226 spin_unlock(&c->cs_lock);
170
171 return 0; 227 return 0;
172 228
173out_up: 229out_up:
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc31..f25a7339f800 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 43static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 44static char dbg_key_buf1[128];
45 45
46unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; 46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; 47unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 48unsigned int ubifs_tst_flags;
49 49
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); 50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
810{ 810{
811 struct ubifs_scan_leb *sleb; 811 struct ubifs_scan_leb *sleb;
812 struct ubifs_scan_node *snod; 812 struct ubifs_scan_node *snod;
813 void *buf;
813 814
814 if (dbg_failure_mode) 815 if (dbg_failure_mode)
815 return; 816 return;
816 817
817 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 818 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
818 current->pid, lnum); 819 current->pid, lnum);
819 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 820
821 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
822 if (!buf) {
823 ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
824 return;
825 }
826
827 sleb = ubifs_scan(c, lnum, 0, buf, 0);
820 if (IS_ERR(sleb)) { 828 if (IS_ERR(sleb)) {
821 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 829 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
822 return; 830 goto out;
823 } 831 }
824 832
825 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 833 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
835 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 843 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
836 current->pid, lnum); 844 current->pid, lnum);
837 ubifs_scan_destroy(sleb); 845 ubifs_scan_destroy(sleb);
846
847out:
848 vfree(buf);
838 return; 849 return;
839} 850}
840 851
@@ -2690,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
2690 if (!c->dbg) 2701 if (!c->dbg)
2691 return -ENOMEM; 2702 return -ENOMEM;
2692 2703
2693 c->dbg->buf = vmalloc(c->leb_size);
2694 if (!c->dbg->buf)
2695 goto out;
2696
2697 failure_mode_init(c); 2704 failure_mode_init(c);
2698 return 0; 2705 return 0;
2699
2700out:
2701 kfree(c->dbg);
2702 return -ENOMEM;
2703} 2706}
2704 2707
2705/** 2708/**
@@ -2709,7 +2712,6 @@ out:
2709void ubifs_debugging_exit(struct ubifs_info *c) 2712void ubifs_debugging_exit(struct ubifs_info *c)
2710{ 2713{
2711 failure_mode_exit(c); 2714 failure_mode_exit(c);
2712 vfree(c->dbg->buf);
2713 kfree(c->dbg); 2715 kfree(c->dbg);
2714} 2716}
2715 2717
@@ -2813,19 +2815,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2813 } 2815 }
2814 2816
2815 fname = "dump_lprops"; 2817 fname = "dump_lprops";
2816 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2818 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2817 if (IS_ERR(dent)) 2819 if (IS_ERR(dent))
2818 goto out_remove; 2820 goto out_remove;
2819 d->dfs_dump_lprops = dent; 2821 d->dfs_dump_lprops = dent;
2820 2822
2821 fname = "dump_budg"; 2823 fname = "dump_budg";
2822 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2824 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2823 if (IS_ERR(dent)) 2825 if (IS_ERR(dent))
2824 goto out_remove; 2826 goto out_remove;
2825 d->dfs_dump_budg = dent; 2827 d->dfs_dump_budg = dent;
2826 2828
2827 fname = "dump_tnc"; 2829 fname = "dump_tnc";
2828 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2830 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2829 if (IS_ERR(dent)) 2831 if (IS_ERR(dent))
2830 goto out_remove; 2832 goto out_remove;
2831 d->dfs_dump_tnc = dent; 2833 d->dfs_dump_tnc = dent;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe4729151..919f0de29d8f 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,6 @@
27 27
28/** 28/**
29 * ubifs_debug_info - per-FS debugging information. 29 * ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()' 30 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' 31 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' 32 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +53,6 @@
54 * dfs_dump_tnc: "dump TNC" debugfs knob 53 * dfs_dump_tnc: "dump TNC" debugfs knob
55 */ 54 */
56struct ubifs_debug_info { 55struct ubifs_debug_info {
57 void *buf;
58 struct ubifs_zbranch old_zroot; 56 struct ubifs_zbranch old_zroot;
59 int old_zroot_level; 57 int old_zroot_level;
60 unsigned long long old_zroot_sqnum; 58 unsigned long long old_zroot_sqnum;
@@ -173,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
173#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 171#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
174 172
175/* 173/*
176 * Debugging message type flags (must match msg_type_names in debug.c). 174 * Debugging message type flags.
177 * 175 *
178 * UBIFS_MSG_GEN: general messages 176 * UBIFS_MSG_GEN: general messages
179 * UBIFS_MSG_JNL: journal messages 177 * UBIFS_MSG_JNL: journal messages
@@ -205,14 +203,8 @@ enum {
205 UBIFS_MSG_RCVRY = 0x1000, 203 UBIFS_MSG_RCVRY = 0x1000,
206}; 204};
207 205
208/* Debugging message type flags for each default debug message level */
209#define UBIFS_MSG_LVL_0 0
210#define UBIFS_MSG_LVL_1 0x1
211#define UBIFS_MSG_LVL_2 0x7f
212#define UBIFS_MSG_LVL_3 0xffff
213
214/* 206/*
215 * Debugging check flags (must match chk_names in debug.c). 207 * Debugging check flags.
216 * 208 *
217 * UBIFS_CHK_GEN: general checks 209 * UBIFS_CHK_GEN: general checks
218 * UBIFS_CHK_TNC: check TNC 210 * UBIFS_CHK_TNC: check TNC
@@ -233,7 +225,7 @@ enum {
233}; 225};
234 226
235/* 227/*
236 * Special testing flags (must match tst_names in debug.c). 228 * Special testing flags.
237 * 229 *
238 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method 230 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
239 * UBIFS_TST_RCVRY: failure mode for recovery testing 231 * UBIFS_TST_RCVRY: failure mode for recovery testing
@@ -243,22 +235,6 @@ enum {
243 UBIFS_TST_RCVRY = 0x4, 235 UBIFS_TST_RCVRY = 0x4,
244}; 236};
245 237
246#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
247#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
248#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
249#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
250#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
251#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
252#else
253#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
254#endif
255
256#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
257#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
258#else
259#define UBIFS_CHK_FLAGS_DEFAULT 0
260#endif
261
262extern spinlock_t dbg_lock; 238extern spinlock_t dbg_lock;
263 239
264extern unsigned int ubifs_msg_flags; 240extern unsigned int ubifs_msg_flags;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d77db7e36484..28be1e6a65e8 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -448,10 +448,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
449 /* 449 /*
450 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 451 * do not know whether this page exists on the media or
452 * code know that the page is new. This might be not 452 * not, so we assume the latter because it requires
453 * true, but it is better to budget more than to read 453 * larger budget. The assumption is that it is better
454 * the page from the media. 454 * to budget a bit more than to read the page from the
455 * media. Thus, we are setting the @PG_checked flag
456 * here.
455 */ 457 */
456 SetPageChecked(page); 458 SetPageChecked(page);
457 skipped_read = 1; 459 skipped_read = 1;
@@ -559,6 +561,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
559 dbg_gen("copied %d instead of %d, read page and repeat", 561 dbg_gen("copied %d instead of %d, read page and repeat",
560 copied, len); 562 copied, len);
561 cancel_budget(c, page, ui, appending); 563 cancel_budget(c, page, ui, appending);
564 ClearPageChecked(page);
562 565
563 /* 566 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 567 * Return 0 to force VFS to repeat the whole operation, or the
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182eeb..dfd168b7807e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
35 * write size (@c->max_write_size). The latter is the maximum amount of bytes
36 * the underlying flash is able to program at a time, and writing in
37 * @c->max_write_size units should presumably be faster. Obviously,
38 * @c->min_io_size <= @c->max_write_size. Write-buffers are of
39 * @c->max_write_size bytes in size for maximum performance. However, when a
40 * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
41 * boundary) which contains data is written, not the whole write-buffer,
42 * because this is more space-efficient.
43 *
44 * This optimization adds few complications to the code. Indeed, on the one
45 * hand, we want to write in optimal @c->max_write_size bytes chunks, which
46 * also means aligning writes at the @c->max_write_size bytes offsets. On the
47 * other hand, we do not want to waste space when synchronizing the write
48 * buffer, so during synchronization we writes in smaller chunks. And this makes
49 * the next write offset to be not aligned to @c->max_write_size bytes. So the
50 * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
51 * to @c->max_write_size bytes again. We do this by temporarily shrinking
52 * write-buffer size (@wbuf->size).
53 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 54 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 55 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many 56 * has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it 66 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit. 67 * uses padding nodes or padding bytes, if the padding node does not fit.
48 * 68 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes 69 * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
50 * every time they are read from the flash media. 70 * they are read from the flash media.
51 */ 71 */
52 72
53#include <linux/crc32.h> 73#include <linux/crc32.h>
@@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
88 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is 108 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
89 * true, which is controlled by corresponding UBIFS mount option. However, if 109 * true, which is controlled by corresponding UBIFS mount option. However, if
90 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is 110 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
91 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is 111 * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
92 * ignored and CRC is checked. 112 * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
113 * is checked. This is because during mounting or re-mounting from R/O mode to
114 * R/W mode we may read journal nodes (when replying the journal or doing the
115 * recovery) and the journal nodes may potentially be corrupted, so checking is
116 * required.
93 * 117 *
94 * This function returns zero in case of success and %-EUCLEAN in case of bad 118 * This function returns zero in case of success and %-EUCLEAN in case of bad
95 * CRC or magic. 119 * CRC or magic.
@@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
131 node_len > c->ranges[type].max_len) 155 node_len > c->ranges[type].max_len)
132 goto out_len; 156 goto out_len;
133 157
134 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && 158 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
135 c->no_chk_data_crc) 159 !c->remounting_rw && c->no_chk_data_crc)
136 return 0; 160 return 0;
137 161
138 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 162 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
@@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
343 * 367 *
344 * This function synchronizes write-buffer @buf and returns zero in case of 368 * This function synchronizes write-buffer @buf and returns zero in case of
345 * success or a negative error code in case of failure. 369 * success or a negative error code in case of failure.
370 *
371 * Note, although write-buffers are of @c->max_write_size, this function does
372 * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
373 * if the write-buffer is only partially filled with data, only the used part
374 * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
375 * This way we waste less space.
346 */ 376 */
347int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) 377int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
348{ 378{
349 struct ubifs_info *c = wbuf->c; 379 struct ubifs_info *c = wbuf->c;
350 int err, dirt; 380 int err, dirt, sync_len;
351 381
352 cancel_wbuf_timer_nolock(wbuf); 382 cancel_wbuf_timer_nolock(wbuf);
353 if (!wbuf->used || wbuf->lnum == -1) 383 if (!wbuf->used || wbuf->lnum == -1)
@@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 387 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 388 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(wbuf->avail & 7)); 389 ubifs_assert(!(wbuf->avail & 7));
360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 390 ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
391 ubifs_assert(wbuf->size >= c->min_io_size);
392 ubifs_assert(wbuf->size <= c->max_write_size);
393 ubifs_assert(wbuf->size % c->min_io_size == 0);
361 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
362 397
363 if (c->ro_error) 398 if (c->ro_error)
364 return -EROFS; 399 return -EROFS;
365 400
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 401 /*
402 * Do not write whole write buffer but write only the minimum necessary
403 * amount of min. I/O units.
404 */
405 sync_len = ALIGN(wbuf->used, c->min_io_size);
406 dirt = sync_len - wbuf->used;
407 if (dirt)
408 ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
367 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 409 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
368 c->min_io_size, wbuf->dtype); 410 sync_len, wbuf->dtype);
369 if (err) { 411 if (err) {
370 ubifs_err("cannot write %d bytes to LEB %d:%d", 412 ubifs_err("cannot write %d bytes to LEB %d:%d",
371 c->min_io_size, wbuf->lnum, wbuf->offs); 413 sync_len, wbuf->lnum, wbuf->offs);
372 dbg_dump_stack(); 414 dbg_dump_stack();
373 return err; 415 return err;
374 } 416 }
375 417
376 dirt = wbuf->avail;
377
378 spin_lock(&wbuf->lock); 418 spin_lock(&wbuf->lock);
379 wbuf->offs += c->min_io_size; 419 wbuf->offs += sync_len;
380 wbuf->avail = c->min_io_size; 420 /*
421 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
422 * But our goal is to optimize writes and make sure we write in
423 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
424 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
425 * sure that @wbuf->offs + @wbuf->size is aligned to
426 * @c->max_write_size. This way we make sure that after next
427 * write-buffer flush we are again at the optimal offset (aligned to
428 * @c->max_write_size).
429 */
430 if (c->leb_size - wbuf->offs < c->max_write_size)
431 wbuf->size = c->leb_size - wbuf->offs;
432 else if (wbuf->offs & (c->max_write_size - 1))
433 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
434 else
435 wbuf->size = c->max_write_size;
436 wbuf->avail = wbuf->size;
381 wbuf->used = 0; 437 wbuf->used = 0;
382 wbuf->next_ino = 0; 438 wbuf->next_ino = 0;
383 spin_unlock(&wbuf->lock); 439 spin_unlock(&wbuf->lock);
@@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
420 spin_lock(&wbuf->lock); 476 spin_lock(&wbuf->lock);
421 wbuf->lnum = lnum; 477 wbuf->lnum = lnum;
422 wbuf->offs = offs; 478 wbuf->offs = offs;
423 wbuf->avail = c->min_io_size; 479 if (c->leb_size - wbuf->offs < c->max_write_size)
480 wbuf->size = c->leb_size - wbuf->offs;
481 else if (wbuf->offs & (c->max_write_size - 1))
482 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
483 else
484 wbuf->size = c->max_write_size;
485 wbuf->avail = wbuf->size;
424 wbuf->used = 0; 486 wbuf->used = 0;
425 spin_unlock(&wbuf->lock); 487 spin_unlock(&wbuf->lock);
426 wbuf->dtype = dtype; 488 wbuf->dtype = dtype;
@@ -500,8 +562,9 @@ out_timers:
500 * 562 *
501 * This function writes data to flash via write-buffer @wbuf. This means that 563 * This function writes data to flash via write-buffer @wbuf. This means that
502 * the last piece of the node won't reach the flash media immediately if it 564 * the last piece of the node won't reach the flash media immediately if it
503 * does not take whole minimal I/O unit. Instead, the node will sit in RAM 565 * does not take whole max. write unit (@c->max_write_size). Instead, the node
504 * until the write-buffer is synchronized (e.g., by timer). 566 * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
567 * because more data are appended to the write-buffer).
505 * 568 *
506 * This function returns zero in case of success and a negative error code in 569 * This function returns zero in case of success and a negative error code in
507 * case of failure. If the node cannot be written because there is no more 570 * case of failure. If the node cannot be written because there is no more
@@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
518 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 581 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
519 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 582 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 583 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 584 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
585 ubifs_assert(wbuf->size >= c->min_io_size);
586 ubifs_assert(wbuf->size <= c->max_write_size);
587 ubifs_assert(wbuf->size % c->min_io_size == 0);
522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount); 589 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
524 592
525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
526 err = -ENOSPC; 594 err = -ENOSPC;
@@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
543 dbg_io("flush jhead %s wbuf to LEB %d:%d", 611 dbg_io("flush jhead %s wbuf to LEB %d:%d",
544 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); 612 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
545 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 613 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
546 wbuf->offs, c->min_io_size, 614 wbuf->offs, wbuf->size,
547 wbuf->dtype); 615 wbuf->dtype);
548 if (err) 616 if (err)
549 goto out; 617 goto out;
550 618
551 spin_lock(&wbuf->lock); 619 spin_lock(&wbuf->lock);
552 wbuf->offs += c->min_io_size; 620 wbuf->offs += wbuf->size;
553 wbuf->avail = c->min_io_size; 621 if (c->leb_size - wbuf->offs >= c->max_write_size)
622 wbuf->size = c->max_write_size;
623 else
624 wbuf->size = c->leb_size - wbuf->offs;
625 wbuf->avail = wbuf->size;
554 wbuf->used = 0; 626 wbuf->used = 0;
555 wbuf->next_ino = 0; 627 wbuf->next_ino = 0;
556 spin_unlock(&wbuf->lock); 628 spin_unlock(&wbuf->lock);
@@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 goto exit; 636 goto exit;
565 } 637 }
566 638
567 /* 639 offs = wbuf->offs;
568 * The node is large enough and does not fit entirely within current 640 written = 0;
569 * minimal I/O unit. We have to fill and flush write-buffer and switch
570 * to the next min. I/O unit.
571 */
572 dbg_io("flush jhead %s wbuf to LEB %d:%d",
573 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
574 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
575 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
576 c->min_io_size, wbuf->dtype);
577 if (err)
578 goto out;
579 641
580 offs = wbuf->offs + c->min_io_size; 642 if (wbuf->used) {
581 len -= wbuf->avail; 643 /*
582 aligned_len -= wbuf->avail; 644 * The node is large enough and does not fit entirely within
583 written = wbuf->avail; 645 * current available space. We have to fill and flush
646 * write-buffer and switch to the next max. write unit.
647 */
648 dbg_io("flush jhead %s wbuf to LEB %d:%d",
649 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
650 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
651 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
652 wbuf->size, wbuf->dtype);
653 if (err)
654 goto out;
655
656 offs += wbuf->size;
657 len -= wbuf->avail;
658 aligned_len -= wbuf->avail;
659 written += wbuf->avail;
660 } else if (wbuf->offs & (c->max_write_size - 1)) {
661 /*
662 * The write-buffer offset is not aligned to
663 * @c->max_write_size and @wbuf->size is less than
664 * @c->max_write_size. Write @wbuf->size bytes to make sure the
665 * following writes are done in optimal @c->max_write_size
666 * chunks.
667 */
668 dbg_io("write %d bytes to LEB %d:%d",
669 wbuf->size, wbuf->lnum, wbuf->offs);
670 err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
671 wbuf->size, wbuf->dtype);
672 if (err)
673 goto out;
674
675 offs += wbuf->size;
676 len -= wbuf->size;
677 aligned_len -= wbuf->size;
678 written += wbuf->size;
679 }
584 680
585 /* 681 /*
586 * The remaining data may take more whole min. I/O units, so write the 682 * The remaining data may take more whole max. write units, so write the
587 * remains multiple to min. I/O unit size directly to the flash media. 683 * remains multiple to max. write unit size directly to the flash media.
588 * We align node length to 8-byte boundary because we anyway flash wbuf 684 * We align node length to 8-byte boundary because we anyway flash wbuf
589 * if the remaining space is less than 8 bytes. 685 * if the remaining space is less than 8 bytes.
590 */ 686 */
591 n = aligned_len >> c->min_io_shift; 687 n = aligned_len >> c->max_write_shift;
592 if (n) { 688 if (n) {
593 n <<= c->min_io_shift; 689 n <<= c->max_write_shift;
594 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
595 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
596 wbuf->dtype); 692 wbuf->dtype);
@@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
606 if (aligned_len) 702 if (aligned_len)
607 /* 703 /*
608 * And now we have what's left and what does not take whole 704 * And now we have what's left and what does not take whole
609 * min. I/O unit, so write it to the write-buffer and we are 705 * max. write unit, so write it to the write-buffer and we are
610 * done. 706 * done.
611 */ 707 */
612 memcpy(wbuf->buf, buf + written, len); 708 memcpy(wbuf->buf, buf + written, len);
613 709
614 wbuf->offs = offs; 710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size;
713 else
714 wbuf->size = c->leb_size - wbuf->offs;
715 wbuf->avail = wbuf->size - aligned_len;
615 wbuf->used = aligned_len; 716 wbuf->used = aligned_len;
616 wbuf->avail = c->min_io_size - aligned_len;
617 wbuf->next_ino = 0; 717 wbuf->next_ino = 0;
618 spin_unlock(&wbuf->lock); 718 spin_unlock(&wbuf->lock);
619 719
@@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
837{ 937{
838 size_t size; 938 size_t size;
839 939
840 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); 940 wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
841 if (!wbuf->buf) 941 if (!wbuf->buf)
842 return -ENOMEM; 942 return -ENOMEM;
843 943
844 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); 944 size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
845 wbuf->inodes = kmalloc(size, GFP_KERNEL); 945 wbuf->inodes = kmalloc(size, GFP_KERNEL);
846 if (!wbuf->inodes) { 946 if (!wbuf->inodes) {
847 kfree(wbuf->buf); 947 kfree(wbuf->buf);
@@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
851 951
852 wbuf->used = 0; 952 wbuf->used = 0;
853 wbuf->lnum = wbuf->offs = -1; 953 wbuf->lnum = wbuf->offs = -1;
854 wbuf->avail = c->min_io_size; 954 /*
955 * If the LEB starts at the max. write size aligned address, then
956 * write-buffer size has to be set to @c->max_write_size. Otherwise,
957 * set it to something smaller so that it ends at the closest max.
958 * write size boundary.
959 */
960 size = c->max_write_size - (c->leb_start % c->max_write_size);
961 wbuf->avail = wbuf->size = size;
855 wbuf->dtype = UBI_UNKNOWN; 962 wbuf->dtype = UBI_UNKNOWN;
856 wbuf->sync_callback = NULL; 963 wbuf->sync_callback = NULL;
857 mutex_init(&wbuf->io_mutex); 964 mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 8aacd64957a2..548acf494afd 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
160 if (IS_RDONLY(inode)) 160 if (IS_RDONLY(inode))
161 return -EROFS; 161 return -EROFS;
162 162
163 if (!is_owner_or_cap(inode)) 163 if (!inode_owner_or_capable(inode))
164 return -EACCES; 164 return -EACCES;
165 165
166 if (get_user(flags, (int __user *) arg)) 166 if (get_user(flags, (int __user *) arg))
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e57..aed25e864227 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
690{ 690{
691 struct ubifs_data_node *data; 691 struct ubifs_data_node *data;
692 int err, lnum, offs, compr_type, out_len; 692 int err, lnum, offs, compr_type, out_len;
693 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; 693 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
694 struct ubifs_inode *ui = ubifs_inode(inode); 694 struct ubifs_inode *ui = ubifs_inode(inode);
695 695
696 dbg_jnl("ino %lu, blk %u, len %d, key %s", 696 dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
698 DBGKEY(key)); 698 DBGKEY(key));
699 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 699 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
700 700
701 data = kmalloc(dlen, GFP_NOFS); 701 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
702 if (!data) 702 if (!data) {
703 return -ENOMEM; 703 /*
704 * Fall-back to the write reserve buffer. Note, we might be
705 * currently on the memory reclaim path, when the kernel is
706 * trying to free some memory by writing out dirty pages. The
707 * write reserve buffer helps us to guarantee that we are
708 * always able to write the data.
709 */
710 allocated = 0;
711 mutex_lock(&c->write_reserve_mutex);
712 data = c->write_reserve_buf;
713 }
704 714
705 data->ch.node_type = UBIFS_DATA_NODE; 715 data->ch.node_type = UBIFS_DATA_NODE;
706 key_write(c, key, &data->key); 716 key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
736 goto out_ro; 746 goto out_ro;
737 747
738 finish_reservation(c); 748 finish_reservation(c);
739 kfree(data); 749 if (!allocated)
750 mutex_unlock(&c->write_reserve_mutex);
751 else
752 kfree(data);
740 return 0; 753 return 0;
741 754
742out_release: 755out_release:
@@ -745,7 +758,10 @@ out_ro:
745 ubifs_ro_mode(c, err); 758 ubifs_ro_mode(c, err);
746 finish_reservation(c); 759 finish_reservation(c);
747out_free: 760out_free:
748 kfree(data); 761 if (!allocated)
762 mutex_unlock(&c->write_reserve_mutex);
763 else
764 kfree(data);
749 return err; 765 return err;
750} 766}
751 767
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889b..0ee0847f2421 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
1035 struct ubifs_scan_leb *sleb; 1035 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1036 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst; 1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; 1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL;
1039 1040
1040 cat = lp->flags & LPROPS_CAT_MASK; 1041 cat = lp->flags & LPROPS_CAT_MASK;
1041 if (cat != LPROPS_UNCAT) { 1042 if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
1093 } 1094 }
1094 } 1095 }
1095 1096
1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) {
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum);
1100 goto out;
1101 }
1102
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1097 if (IS_ERR(sleb)) { 1104 if (IS_ERR(sleb)) {
1098 /* 1105 /*
1099 * After an unclean unmount, empty and freeable LEBs 1106 * After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
1105 lst->empty_lebs += 1; 1112 lst->empty_lebs += 1;
1106 lst->total_free += c->leb_size; 1113 lst->total_free += c->leb_size;
1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1108 return LPT_SCAN_CONTINUE; 1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1109 } 1117 }
1110 1118
1111 if (lp->free + lp->dirty == c->leb_size && 1119 if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
1115 lst->total_free += lp->free; 1123 lst->total_free += lp->free;
1116 lst->total_dirty += lp->dirty; 1124 lst->total_dirty += lp->dirty;
1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1118 return LPT_SCAN_CONTINUE; 1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1119 } 1128 }
1120 data->err = PTR_ERR(sleb); 1129 data->err = PTR_ERR(sleb);
1121 return LPT_SCAN_STOP; 1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1122 } 1132 }
1123 1133
1124 is_idx = -1; 1134 is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
1236 } 1246 }
1237 1247
1238 ubifs_scan_destroy(sleb); 1248 ubifs_scan_destroy(sleb);
1239 return LPT_SCAN_CONTINUE; 1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf);
1252 return ret;
1240 1253
1241out_print: 1254out_print:
1242 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
1246out_destroy: 1259out_destroy:
1247 ubifs_scan_destroy(sleb); 1260 ubifs_scan_destroy(sleb);
1248out: 1261out:
1262 vfree(buf);
1249 data->err = -EINVAL; 1263 data->err = -EINVAL;
1250 return LPT_SCAN_STOP; 1264 return LPT_SCAN_STOP;
1251} 1265}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0b..0c9c69bd983a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1628{ 1628{
1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1630 int ret; 1630 int ret;
1631 void *buf = c->dbg->buf; 1631 void *buf, *p;
1632 1632
1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1634 return 0; 1634 return 0;
1635 1635
1636 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1637 if (!buf) {
1638 ubifs_err("cannot allocate memory for ltab checking");
1639 return 0;
1640 }
1641
1636 dbg_lp("LEB %d", lnum); 1642 dbg_lp("LEB %d", lnum);
1637 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1643 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1638 if (err) { 1644 if (err) {
1639 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); 1645 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1640 return err; 1646 goto out;
1641 } 1647 }
1642 while (1) { 1648 while (1) {
1643 if (!is_a_node(c, buf, len)) { 1649 if (!is_a_node(c, p, len)) {
1644 int i, pad_len; 1650 int i, pad_len;
1645 1651
1646 pad_len = get_pad_len(c, buf, len); 1652 pad_len = get_pad_len(c, p, len);
1647 if (pad_len) { 1653 if (pad_len) {
1648 buf += pad_len; 1654 p += pad_len;
1649 len -= pad_len; 1655 len -= pad_len;
1650 dirty += pad_len; 1656 dirty += pad_len;
1651 continue; 1657 continue;
1652 } 1658 }
1653 if (!dbg_is_all_ff(buf, len)) { 1659 if (!dbg_is_all_ff(p, len)) {
1654 dbg_msg("invalid empty space in LEB %d at %d", 1660 dbg_msg("invalid empty space in LEB %d at %d",
1655 lnum, c->leb_size - len); 1661 lnum, c->leb_size - len);
1656 err = -EINVAL; 1662 err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1668 lnum, dirty, c->ltab[i].dirty); 1674 lnum, dirty, c->ltab[i].dirty);
1669 err = -EINVAL; 1675 err = -EINVAL;
1670 } 1676 }
1671 return err; 1677 goto out;
1672 } 1678 }
1673 node_type = get_lpt_node_type(c, buf, &node_num); 1679 node_type = get_lpt_node_type(c, p, &node_num);
1674 node_len = get_lpt_node_len(c, node_type); 1680 node_len = get_lpt_node_len(c, node_type);
1675 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); 1681 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1676 if (ret == 1) 1682 if (ret == 1)
1677 dirty += node_len; 1683 dirty += node_len;
1678 buf += node_len; 1684 p += node_len;
1679 len -= node_len; 1685 len -= node_len;
1680 } 1686 }
1687
1688 err = 0;
1689out:
1690 vfree(buf);
1691 return err;
1681} 1692}
1682 1693
1683/** 1694/**
@@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1870static void dump_lpt_leb(const struct ubifs_info *c, int lnum) 1881static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1871{ 1882{
1872 int err, len = c->leb_size, node_type, node_num, node_len, offs; 1883 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1873 void *buf = c->dbg->buf; 1884 void *buf, *p;
1874 1885
1875 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 1886 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1876 current->pid, lnum); 1887 current->pid, lnum);
1888 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1889 if (!buf) {
1890 ubifs_err("cannot allocate memory to dump LPT");
1891 return;
1892 }
1893
1877 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1894 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1878 if (err) { 1895 if (err) {
1879 ubifs_err("cannot read LEB %d, error %d", lnum, err); 1896 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1880 return; 1897 goto out;
1881 } 1898 }
1882 while (1) { 1899 while (1) {
1883 offs = c->leb_size - len; 1900 offs = c->leb_size - len;
1884 if (!is_a_node(c, buf, len)) { 1901 if (!is_a_node(c, p, len)) {
1885 int pad_len; 1902 int pad_len;
1886 1903
1887 pad_len = get_pad_len(c, buf, len); 1904 pad_len = get_pad_len(c, p, len);
1888 if (pad_len) { 1905 if (pad_len) {
1889 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", 1906 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1890 lnum, offs, pad_len); 1907 lnum, offs, pad_len);
1891 buf += pad_len; 1908 p += pad_len;
1892 len -= pad_len; 1909 len -= pad_len;
1893 continue; 1910 continue;
1894 } 1911 }
@@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1898 break; 1915 break;
1899 } 1916 }
1900 1917
1901 node_type = get_lpt_node_type(c, buf, &node_num); 1918 node_type = get_lpt_node_type(c, p, &node_num);
1902 switch (node_type) { 1919 switch (node_type) {
1903 case UBIFS_LPT_PNODE: 1920 case UBIFS_LPT_PNODE:
1904 { 1921 {
@@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1923 else 1940 else
1924 printk(KERN_DEBUG "LEB %d:%d, nnode, ", 1941 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1925 lnum, offs); 1942 lnum, offs);
1926 err = ubifs_unpack_nnode(c, buf, &nnode); 1943 err = ubifs_unpack_nnode(c, p, &nnode);
1927 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1928 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1945 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1929 nnode.nbranch[i].offs); 1946 nnode.nbranch[i].offs);
@@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1944 break; 1961 break;
1945 default: 1962 default:
1946 ubifs_err("LPT node type %d not recognized", node_type); 1963 ubifs_err("LPT node type %d not recognized", node_type);
1947 return; 1964 goto out;
1948 } 1965 }
1949 1966
1950 buf += node_len; 1967 p += node_len;
1951 len -= node_len; 1968 len -= node_len;
1952 } 1969 }
1953 1970
1954 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 1971 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1955 current->pid, lnum); 1972 current->pid, lnum);
1973out:
1974 vfree(buf);
1975 return;
1956} 1976}
1957 1977
1958/** 1978/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a3..09df318e368f 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) 892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
893{ 893{
894 int lnum, err = 0; 894 int lnum, err = 0;
895 void *buf;
895 896
896 /* Check no-orphans flag and skip this if no orphans */ 897 /* Check no-orphans flag and skip this if no orphans */
897 if (c->no_orphs) 898 if (c->no_orphs)
898 return 0; 899 return 0;
899 900
901 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
902 if (!buf) {
903 ubifs_err("cannot allocate memory to check orphans");
904 return 0;
905 }
906
900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 907 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
901 struct ubifs_scan_leb *sleb; 908 struct ubifs_scan_leb *sleb;
902 909
903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 910 sleb = ubifs_scan(c, lnum, 0, buf, 0);
904 if (IS_ERR(sleb)) { 911 if (IS_ERR(sleb)) {
905 err = PTR_ERR(sleb); 912 err = PTR_ERR(sleb);
906 break; 913 break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
912 break; 919 break;
913 } 920 }
914 921
922 vfree(buf);
915 return err; 923 return err;
916} 924}
917 925
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c2..936f2cbfe6b6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case. 30 * read-only, and the flash is not modified in that case.
31 *
32 * The general UBIFS approach to the recovery is that it recovers from
33 * corruptions which could be caused by power cuts, but it refuses to recover
34 * from corruption caused by other reasons. And UBIFS tries to distinguish
35 * between these 2 reasons of corruptions and silently recover in the former
36 * case and loudly complain in the latter case.
37 *
38 * UBIFS writes only to erased LEBs, so it writes only to the flash space
39 * containing only 0xFFs. UBIFS also always writes strictly from the beginning
40 * of the LEB to the end. And UBIFS assumes that the underlying flash media
41 * writes in @c->max_write_size bytes at a time.
42 *
43 * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
44 * I/O unit corresponding to offset X to contain corrupted data, all the
45 * following min. I/O units have to contain empty space (all 0xFFs). If this is
46 * not true, the corruption cannot be the result of a power cut, and UBIFS
47 * refuses to mount.
31 */ 48 */
32 49
33#include <linux/crc32.h> 50#include <linux/crc32.h>
@@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
362 * @offs: offset to check 379 * @offs: offset to check
363 * 380 *
364 * This function returns %1 if @offs was in the last write to the LEB whose data 381 * This function returns %1 if @offs was in the last write to the LEB whose data
365 * is in @buf, otherwise %0 is returned. The determination is made by checking 382 * is in @buf, otherwise %0 is returned. The determination is made by checking
366 * for subsequent empty space starting from the next @c->min_io_size boundary. 383 * for subsequent empty space starting from the next @c->max_write_size
384 * boundary.
367 */ 385 */
368static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 386static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
369{ 387{
@@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
371 uint8_t *p; 389 uint8_t *p;
372 390
373 /* 391 /*
374 * Round up to the next @c->min_io_size boundary i.e. @offs is in the 392 * Round up to the next @c->max_write_size boundary i.e. @offs is in
375 * last wbuf written. After that should be empty space. 393 * the last wbuf written. After that should be empty space.
376 */ 394 */
377 empty_offs = ALIGN(offs + 1, c->min_io_size); 395 empty_offs = ALIGN(offs + 1, c->max_write_size);
378 check_len = c->leb_size - empty_offs; 396 check_len = c->leb_size - empty_offs;
379 p = buf + empty_offs - offs; 397 p = buf + empty_offs - offs;
380 return is_empty(p, check_len); 398 return is_empty(p, check_len);
@@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
429 int skip, dlen = le32_to_cpu(ch->len); 447 int skip, dlen = le32_to_cpu(ch->len);
430 448
431 /* Check for empty space after the corrupt node's common header */ 449 /* Check for empty space after the corrupt node's common header */
432 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; 450 skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
433 if (is_empty(buf + skip, len - skip)) 451 if (is_empty(buf + skip, len - skip))
434 return 1; 452 return 1;
435 /* 453 /*
@@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
441 return 0; 459 return 0;
442 } 460 }
443 /* Now we know the corrupt node's length we can skip over it */ 461 /* Now we know the corrupt node's length we can skip over it */
444 skip = ALIGN(offs + dlen, c->min_io_size) - offs; 462 skip = ALIGN(offs + dlen, c->max_write_size) - offs;
445 /* After which there should be empty space */ 463 /* After which there should be empty space */
446 if (is_empty(buf + skip, len - skip)) 464 if (is_empty(buf + skip, len - skip))
447 return 1; 465 return 1;
@@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
671 } else { 689 } else {
672 int corruption = first_non_ff(buf, len); 690 int corruption = first_non_ff(buf, len);
673 691
692 /*
693 * See header comment for this file for more
694 * explanations about the reasons we have this check.
695 */
674 ubifs_err("corrupt empty space LEB %d:%d, corruption " 696 ubifs_err("corrupt empty space LEB %d:%d, corruption "
675 "starts at %d", lnum, offs, corruption); 697 "starts at %d", lnum, offs, corruption);
676 /* Make sure we dump interesting non-0xFF data */ 698 /* Make sure we dump interesting non-0xFF data */
677 offs = corruption; 699 offs += corruption;
678 buf += corruption; 700 buf += corruption;
679 goto corrupted; 701 goto corrupted;
680 } 702 }
@@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 858static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 859 void *sbuf)
838{ 860{
839 int len, err; 861 int len = c->max_write_size, err;
840 862
841 if (c->min_io_size > 1)
842 len = c->min_io_size;
843 else
844 len = 512;
845 if (offs + len > c->leb_size) 863 if (offs + len > c->leb_size)
846 len = c->leb_size - offs; 864 len = c->leb_size - offs;
847 865
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbeaa..36216b46f772 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
328 if (!quiet) 328 if (!quiet)
329 ubifs_err("empty space starts at non-aligned offset %d", 329 ubifs_err("empty space starts at non-aligned offset %d",
330 offs); 330 offs);
331 goto corrupted;; 331 goto corrupted;
332 } 332 }
333 333
334 ubifs_end_scan(c, sleb, lnum, offs); 334 ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..6ddd9973e681 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c)
512 512
513 c->leb_cnt = c->vi.size; 513 c->leb_cnt = c->vi.size;
514 c->leb_size = c->vi.usable_leb_size; 514 c->leb_size = c->vi.usable_leb_size;
515 c->leb_start = c->di.leb_start;
515 c->half_leb_size = c->leb_size / 2; 516 c->half_leb_size = c->leb_size / 2;
516 c->min_io_size = c->di.min_io_size; 517 c->min_io_size = c->di.min_io_size;
517 c->min_io_shift = fls(c->min_io_size) - 1; 518 c->min_io_shift = fls(c->min_io_size) - 1;
519 c->max_write_size = c->di.max_write_size;
520 c->max_write_shift = fls(c->max_write_size) - 1;
518 521
519 if (c->leb_size < UBIFS_MIN_LEB_SZ) { 522 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
520 ubifs_err("too small LEBs (%d bytes), min. is %d bytes", 523 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c)
534 } 537 }
535 538
536 /* 539 /*
540 * Maximum write size has to be greater or equivalent to min. I/O
541 * size, and be multiple of min. I/O size.
542 */
543 if (c->max_write_size < c->min_io_size ||
544 c->max_write_size % c->min_io_size ||
545 !is_power_of_2(c->max_write_size)) {
546 ubifs_err("bad write buffer size %d for %d min. I/O unit",
547 c->max_write_size, c->min_io_size);
548 return -EINVAL;
549 }
550
551 /*
537 * UBIFS aligns all node to 8-byte boundary, so to make function in 552 * UBIFS aligns all node to 8-byte boundary, so to make function in
538 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is 553 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
539 * less than 8. 554 * less than 8.
@@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c)
541 if (c->min_io_size < 8) { 556 if (c->min_io_size < 8) {
542 c->min_io_size = 8; 557 c->min_io_size = 8;
543 c->min_io_shift = 3; 558 c->min_io_shift = 3;
559 if (c->max_write_size < c->min_io_size) {
560 c->max_write_size = c->min_io_size;
561 c->max_write_shift = c->min_io_shift;
562 }
544 } 563 }
545 564
546 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); 565 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c)
1202 if (c->bulk_read == 1) 1221 if (c->bulk_read == 1)
1203 bu_init(c); 1222 bu_init(c);
1204 1223
1205 /* 1224 if (!c->ro_mount) {
1206 * We have to check all CRCs, even for data nodes, when we mount the FS 1225 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
1207 * (specifically, when we are replaying). 1226 GFP_KERNEL);
1208 */ 1227 if (!c->write_reserve_buf)
1209 c->always_chk_crc = 1; 1228 goto out_free;
1229 }
1230
1231 c->mounting = 1;
1210 1232
1211 err = ubifs_read_superblock(c); 1233 err = ubifs_read_superblock(c);
1212 if (err) 1234 if (err)
@@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c)
1382 if (err) 1404 if (err)
1383 goto out_infos; 1405 goto out_infos;
1384 1406
1385 c->always_chk_crc = 0; 1407 c->mounting = 0;
1386 1408
1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1409 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1410 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c)
1403 1425
1404 dbg_msg("compiled on: " __DATE__ " at " __TIME__); 1426 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1405 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); 1427 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1428 dbg_msg("max. write size: %d bytes", c->max_write_size);
1406 dbg_msg("LEB size: %d bytes (%d KiB)", 1429 dbg_msg("LEB size: %d bytes (%d KiB)",
1407 c->leb_size, c->leb_size >> 10); 1430 c->leb_size, c->leb_size >> 10);
1408 dbg_msg("data journal heads: %d", 1431 dbg_msg("data journal heads: %d",
@@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c)
1432 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); 1455 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1433 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1434 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1435 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", 1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1436 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1437 UBIFS_MAX_DENT_NODE_SZ); 1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1438 dbg_msg("dead watermark: %d", c->dead_wm); 1461 dbg_msg("dead watermark: %d", c->dead_wm);
1439 dbg_msg("dark watermark: %d", c->dark_wm); 1462 dbg_msg("dark watermark: %d", c->dark_wm);
1440 dbg_msg("LEB overhead: %d", c->leb_overhead); 1463 dbg_msg("LEB overhead: %d", c->leb_overhead);
@@ -1474,6 +1497,7 @@ out_wbufs:
1474out_cbuf: 1497out_cbuf:
1475 kfree(c->cbuf); 1498 kfree(c->cbuf);
1476out_free: 1499out_free:
1500 kfree(c->write_reserve_buf);
1477 kfree(c->bu.buf); 1501 kfree(c->bu.buf);
1478 vfree(c->ileb_buf); 1502 vfree(c->ileb_buf);
1479 vfree(c->sbuf); 1503 vfree(c->sbuf);
@@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c)
1512 kfree(c->cbuf); 1536 kfree(c->cbuf);
1513 kfree(c->rcvrd_mst_node); 1537 kfree(c->rcvrd_mst_node);
1514 kfree(c->mst_node); 1538 kfree(c->mst_node);
1539 kfree(c->write_reserve_buf);
1515 kfree(c->bu.buf); 1540 kfree(c->bu.buf);
1516 vfree(c->ileb_buf); 1541 vfree(c->ileb_buf);
1517 vfree(c->sbuf); 1542 vfree(c->sbuf);
@@ -1543,7 +1568,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1543 mutex_lock(&c->umount_mutex); 1568 mutex_lock(&c->umount_mutex);
1544 dbg_save_space_info(c); 1569 dbg_save_space_info(c);
1545 c->remounting_rw = 1; 1570 c->remounting_rw = 1;
1546 c->always_chk_crc = 1;
1547 1571
1548 err = check_free_space(c); 1572 err = check_free_space(c);
1549 if (err) 1573 if (err)
@@ -1598,6 +1622,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1598 goto out; 1622 goto out;
1599 } 1623 }
1600 1624
1625 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
1626 if (!c->write_reserve_buf)
1627 goto out;
1628
1601 err = ubifs_lpt_init(c, 0, 1); 1629 err = ubifs_lpt_init(c, 0, 1);
1602 if (err) 1630 if (err)
1603 goto out; 1631 goto out;
@@ -1650,7 +1678,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1650 dbg_gen("re-mounted read-write"); 1678 dbg_gen("re-mounted read-write");
1651 c->ro_mount = 0; 1679 c->ro_mount = 0;
1652 c->remounting_rw = 0; 1680 c->remounting_rw = 0;
1653 c->always_chk_crc = 0;
1654 err = dbg_check_space_info(c); 1681 err = dbg_check_space_info(c);
1655 mutex_unlock(&c->umount_mutex); 1682 mutex_unlock(&c->umount_mutex);
1656 return err; 1683 return err;
@@ -1663,11 +1690,12 @@ out:
1663 c->bgt = NULL; 1690 c->bgt = NULL;
1664 } 1691 }
1665 free_wbufs(c); 1692 free_wbufs(c);
1693 kfree(c->write_reserve_buf);
1694 c->write_reserve_buf = NULL;
1666 vfree(c->ileb_buf); 1695 vfree(c->ileb_buf);
1667 c->ileb_buf = NULL; 1696 c->ileb_buf = NULL;
1668 ubifs_lpt_free(c, 1); 1697 ubifs_lpt_free(c, 1);
1669 c->remounting_rw = 0; 1698 c->remounting_rw = 0;
1670 c->always_chk_crc = 0;
1671 mutex_unlock(&c->umount_mutex); 1699 mutex_unlock(&c->umount_mutex);
1672 return err; 1700 return err;
1673} 1701}
@@ -1707,6 +1735,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1707 free_wbufs(c); 1735 free_wbufs(c);
1708 vfree(c->orph_buf); 1736 vfree(c->orph_buf);
1709 c->orph_buf = NULL; 1737 c->orph_buf = NULL;
1738 kfree(c->write_reserve_buf);
1739 c->write_reserve_buf = NULL;
1710 vfree(c->ileb_buf); 1740 vfree(c->ileb_buf);
1711 c->ileb_buf = NULL; 1741 c->ileb_buf = NULL;
1712 ubifs_lpt_free(c, 1); 1742 ubifs_lpt_free(c, 1);
@@ -1937,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1937 mutex_init(&c->mst_mutex); 1967 mutex_init(&c->mst_mutex);
1938 mutex_init(&c->umount_mutex); 1968 mutex_init(&c->umount_mutex);
1939 mutex_init(&c->bu_mutex); 1969 mutex_init(&c->bu_mutex);
1970 mutex_init(&c->write_reserve_mutex);
1940 init_waitqueue_head(&c->cmt_wq); 1971 init_waitqueue_head(&c->cmt_wq);
1941 c->buds = RB_ROOT; 1972 c->buds = RB_ROOT;
1942 c->old_idx = RB_ROOT; 1973 c->old_idx = RB_ROOT;
@@ -1954,6 +1985,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1954 INIT_LIST_HEAD(&c->old_buds); 1985 INIT_LIST_HEAD(&c->old_buds);
1955 INIT_LIST_HEAD(&c->orph_list); 1986 INIT_LIST_HEAD(&c->orph_list);
1956 INIT_LIST_HEAD(&c->orph_new); 1987 INIT_LIST_HEAD(&c->orph_new);
1988 c->no_chk_data_crc = 1;
1957 1989
1958 c->vfs_sb = sb; 1990 c->vfs_sb = sb;
1959 c->highest_inum = UBIFS_FIRST_INO; 1991 c->highest_inum = UBIFS_FIRST_INO;
@@ -1979,7 +2011,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1979 */ 2011 */
1980 c->bdi.name = "ubifs", 2012 c->bdi.name = "ubifs",
1981 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2013 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1982 c->bdi.unplug_io_fn = default_unplug_io_fn;
1983 err = bdi_init(&c->bdi); 2014 err = bdi_init(&c->bdi);
1984 if (err) 2015 if (err)
1985 goto out_close; 2016 goto out_close;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf0133622..de485979ca39 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
447 * 447 *
448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc 448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
449 * is true (it is controlled by corresponding mount option). However, if 449 * is true (it is controlled by corresponding mount option). However, if
450 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always 450 * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
451 * checked. 451 * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
452 * because during mounting or re-mounting from R/O mode to R/W mode we may read
453 * journal nodes (when replying the journal or doing the recovery) and the
454 * journal nodes may potentially be corrupted, so checking is required.
452 */ 455 */
453static int try_read_node(const struct ubifs_info *c, void *buf, int type, 456static int try_read_node(const struct ubifs_info *c, void *buf, int type,
454 int len, int lnum, int offs) 457 int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
476 if (node_len != len) 479 if (node_len != len)
477 return 0; 480 return 0;
478 481
479 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) 482 if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
483 !c->remounting_rw)
480 return 1; 484 return 1;
481 485
482 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 486 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a52..8c40ad3c6721 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
151 */ 151 */
152#define WORST_COMPR_FACTOR 2 152#define WORST_COMPR_FACTOR 2
153 153
154/*
155 * How much memory is needed for a buffer where we comress a data node.
156 */
157#define COMPRESSED_DATA_NODE_BUF_SZ \
158 (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
159
154/* Maximum expected tree height for use by bottom_up_buf */ 160/* Maximum expected tree height for use by bottom_up_buf */
155#define BOTTOM_UP_HEIGHT 64 161#define BOTTOM_UP_HEIGHT 64
156 162
@@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
646 * @offs: write-buffer offset in this logical eraseblock 652 * @offs: write-buffer offset in this logical eraseblock
647 * @avail: number of bytes available in the write-buffer 653 * @avail: number of bytes available in the write-buffer
648 * @used: number of used bytes in the write-buffer 654 * @used: number of used bytes in the write-buffer
655 * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
649 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, 656 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
650 * %UBI_UNKNOWN) 657 * %UBI_UNKNOWN)
651 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep 658 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +687,7 @@ struct ubifs_wbuf {
680 int offs; 687 int offs;
681 int avail; 688 int avail;
682 int used; 689 int used;
690 int size;
683 int dtype; 691 int dtype;
684 int jhead; 692 int jhead;
685 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 693 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
@@ -1003,6 +1011,11 @@ struct ubifs_debug_info;
1003 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu 1011 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
1004 * @bu: pre-allocated bulk-read information 1012 * @bu: pre-allocated bulk-read information
1005 * 1013 *
1014 * @write_reserve_mutex: protects @write_reserve_buf
1015 * @write_reserve_buf: on the write path we allocate memory, which might
1016 * sometimes be unavailable, in which case we use this
1017 * write reserve buffer
1018 *
1006 * @log_lebs: number of logical eraseblocks in the log 1019 * @log_lebs: number of logical eraseblocks in the log
1007 * @log_bytes: log size in bytes 1020 * @log_bytes: log size in bytes
1008 * @log_last: last LEB of the log 1021 * @log_last: last LEB of the log
@@ -1024,7 +1037,12 @@ struct ubifs_debug_info;
1024 * 1037 *
1025 * @min_io_size: minimal input/output unit size 1038 * @min_io_size: minimal input/output unit size
1026 * @min_io_shift: number of bits in @min_io_size minus one 1039 * @min_io_shift: number of bits in @min_io_size minus one
1040 * @max_write_size: maximum amount of bytes the underlying flash can write at a
1041 * time (MTD write buffer size)
1042 * @max_write_shift: number of bits in @max_write_size minus one
1027 * @leb_size: logical eraseblock size in bytes 1043 * @leb_size: logical eraseblock size in bytes
1044 * @leb_start: starting offset of logical eraseblocks within physical
1045 * eraseblocks
1028 * @half_leb_size: half LEB size 1046 * @half_leb_size: half LEB size
1029 * @idx_leb_size: how many bytes of an LEB are effectively available when it is 1047 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1030 * used to store indexing nodes (@leb_size - @max_idx_node_sz) 1048 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1166,22 +1184,21 @@ struct ubifs_debug_info;
1166 * @rp_uid: reserved pool user ID 1184 * @rp_uid: reserved pool user ID
1167 * @rp_gid: reserved pool group ID 1185 * @rp_gid: reserved pool group ID
1168 * 1186 *
1169 * @empty: if the UBI device is empty 1187 * @empty: %1 if the UBI device is empty
1188 * @need_recovery: %1 if the file-system needs recovery
1189 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1170 * @replay_tree: temporary tree used during journal replay 1192 * @replay_tree: temporary tree used during journal replay
1171 * @replay_list: temporary list used during journal replay 1193 * @replay_list: temporary list used during journal replay
1172 * @replay_buds: list of buds to replay 1194 * @replay_buds: list of buds to replay
1173 * @cs_sqnum: sequence number of first node in the log (commit start node) 1195 * @cs_sqnum: sequence number of first node in the log (commit start node)
1174 * @replay_sqnum: sequence number of node currently being replayed 1196 * @replay_sqnum: sequence number of node currently being replayed
1175 * @need_recovery: file-system needs recovery
1176 * @replaying: set to %1 during journal replay
1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W 1197 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1178 * mode 1198 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted 1199 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode 1200 * FS to R/W mode
1181 * @size_tree: inode size information for recovery 1201 * @size_tree: inode size information for recovery
1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1185 * @mount_opts: UBIFS-specific mount options 1202 * @mount_opts: UBIFS-specific mount options
1186 * 1203 *
1187 * @dbg: debugging-related information 1204 * @dbg: debugging-related information
@@ -1250,6 +1267,9 @@ struct ubifs_info {
1250 struct mutex bu_mutex; 1267 struct mutex bu_mutex;
1251 struct bu_info bu; 1268 struct bu_info bu;
1252 1269
1270 struct mutex write_reserve_mutex;
1271 void *write_reserve_buf;
1272
1253 int log_lebs; 1273 int log_lebs;
1254 long long log_bytes; 1274 long long log_bytes;
1255 int log_last; 1275 int log_last;
@@ -1271,7 +1291,10 @@ struct ubifs_info {
1271 1291
1272 int min_io_size; 1292 int min_io_size;
1273 int min_io_shift; 1293 int min_io_shift;
1294 int max_write_size;
1295 int max_write_shift;
1274 int leb_size; 1296 int leb_size;
1297 int leb_start;
1275 int half_leb_size; 1298 int half_leb_size;
1276 int idx_leb_size; 1299 int idx_leb_size;
1277 int leb_cnt; 1300 int leb_cnt;
@@ -1402,19 +1425,19 @@ struct ubifs_info {
1402 gid_t rp_gid; 1425 gid_t rp_gid;
1403 1426
1404 /* The below fields are used only during mounting and re-mounting */ 1427 /* The below fields are used only during mounting and re-mounting */
1405 int empty; 1428 unsigned int empty:1;
1429 unsigned int need_recovery:1;
1430 unsigned int replaying:1;
1431 unsigned int mounting:1;
1432 unsigned int remounting_rw:1;
1406 struct rb_root replay_tree; 1433 struct rb_root replay_tree;
1407 struct list_head replay_list; 1434 struct list_head replay_list;
1408 struct list_head replay_buds; 1435 struct list_head replay_buds;
1409 unsigned long long cs_sqnum; 1436 unsigned long long cs_sqnum;
1410 unsigned long long replay_sqnum; 1437 unsigned long long replay_sqnum;
1411 int need_recovery;
1412 int replaying;
1413 struct list_head unclean_leb_list; 1438 struct list_head unclean_leb_list;
1414 struct ubifs_mst_node *rcvrd_mst_node; 1439 struct ubifs_mst_node *rcvrd_mst_node;
1415 struct rb_root size_tree; 1440 struct rb_root size_tree;
1416 int remounting_rw;
1417 int always_chk_crc;
1418 struct ubifs_mount_opts mount_opts; 1441 struct ubifs_mount_opts mount_opts;
1419 1442
1420#ifdef CONFIG_UBIFS_FS_DEBUG 1443#ifdef CONFIG_UBIFS_FS_DEBUG
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..95518a9f589e 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -27,11 +27,10 @@
27#include "udf_i.h" 27#include "udf_i.h"
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 30#define udf_clear_bit __test_and_clear_bit_le
31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 31#define udf_set_bit __test_and_set_bit_le
32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 32#define udf_test_bit test_bit_le
33#define udf_find_next_one_bit(addr, size, offset) \ 33#define udf_find_next_one_bit find_next_bit_le
34 ext2_find_next_bit(addr, size, offset)
35 34
36static int read_block_bitmap(struct super_block *sb, 35static int read_block_bitmap(struct super_block *sb,
37 struct udf_bitmap *bitmap, unsigned int block, 36 struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +296,7 @@ repeat:
297 break; 296 break;
298 } 297 }
299 } else { 298 } else {
300 bit = udf_find_next_one_bit((char *)bh->b_data, 299 bit = udf_find_next_one_bit(bh->b_data,
301 sb->s_blocksize << 3, 300 sb->s_blocksize << 3,
302 group_start << 3); 301 group_start << 3);
303 if (bit < sb->s_blocksize << 3) 302 if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..2a346bb1d9f5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
98const struct address_space_operations udf_adinicb_aops = { 98const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 99 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 100 .writepage = udf_adinicb_writepage,
101 .sync_page = block_sync_page,
102 .write_begin = simple_write_begin, 101 .write_begin = simple_write_begin,
103 .write_end = udf_adinicb_write_end, 102 .write_end = udf_adinicb_write_end,
104}; 103};
@@ -123,8 +122,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
123 if (inode->i_sb->s_blocksize < 122 if (inode->i_sb->s_blocksize <
124 (udf_file_entry_alloc_offset(inode) + 123 (udf_file_entry_alloc_offset(inode) +
125 pos + count)) { 124 pos + count)) {
126 udf_expand_file_adinicb(inode, pos + count, &err); 125 err = udf_expand_file_adinicb(inode);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 126 if (err) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem); 128 up_write(&iinfo->i_data_sem);
130 return err; 129 return err;
@@ -237,7 +236,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
237 236
238 if ((attr->ia_valid & ATTR_SIZE) && 237 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 238 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 239 error = udf_setsize(inode, attr->ia_size);
241 if (error) 240 if (error)
242 return error; 241 return error;
243 } 242 }
@@ -249,5 +248,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
249 248
250const struct inode_operations udf_file_inode_operations = { 249const struct inode_operations udf_file_inode_operations = {
251 .setattr = udf_setattr, 250 .setattr = udf_setattr,
252 .truncate = udf_truncate,
253}; 251};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..1d1358ed80c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
73 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0; 74 int want_delete = 0;
75 75
76 truncate_inode_pages(&inode->i_data, 0);
77
78 if (!inode->i_nlink && !is_bad_inode(inode)) { 76 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1; 77 want_delete = 1;
80 inode->i_size = 0; 78 udf_setsize(inode, 0);
81 udf_truncate(inode);
82 udf_update_inode(inode, IS_SYNC(inode)); 79 udf_update_inode(inode, IS_SYNC(inode));
83 } 80 } else
81 truncate_inode_pages(&inode->i_data, 0);
84 invalidate_inode_buffers(inode); 82 invalidate_inode_buffers(inode);
85 end_writeback(inode); 83 end_writeback(inode);
86 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
117 115
118 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 116 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
119 if (unlikely(ret)) { 117 if (unlikely(ret)) {
120 loff_t isize = mapping->host->i_size; 118 struct inode *inode = mapping->host;
121 if (pos + len > isize) 119 struct udf_inode_info *iinfo = UDF_I(inode);
122 vmtruncate(mapping->host, isize); 120 loff_t isize = inode->i_size;
121
122 if (pos + len > isize) {
123 truncate_pagecache(inode, pos + len, isize);
124 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
125 down_write(&iinfo->i_data_sem);
126 udf_truncate_extents(inode);
127 up_write(&iinfo->i_data_sem);
128 }
129 }
123 } 130 }
124 131
125 return ret; 132 return ret;
@@ -133,36 +140,36 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
133const struct address_space_operations udf_aops = { 140const struct address_space_operations udf_aops = {
134 .readpage = udf_readpage, 141 .readpage = udf_readpage,
135 .writepage = udf_writepage, 142 .writepage = udf_writepage,
136 .sync_page = block_sync_page,
137 .write_begin = udf_write_begin, 143 .write_begin = udf_write_begin,
138 .write_end = generic_write_end, 144 .write_end = generic_write_end,
139 .bmap = udf_bmap, 145 .bmap = udf_bmap,
140}; 146};
141 147
142void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) 148int udf_expand_file_adinicb(struct inode *inode)
143{ 149{
144 struct page *page; 150 struct page *page;
145 char *kaddr; 151 char *kaddr;
146 struct udf_inode_info *iinfo = UDF_I(inode); 152 struct udf_inode_info *iinfo = UDF_I(inode);
153 int err;
147 struct writeback_control udf_wbc = { 154 struct writeback_control udf_wbc = {
148 .sync_mode = WB_SYNC_NONE, 155 .sync_mode = WB_SYNC_NONE,
149 .nr_to_write = 1, 156 .nr_to_write = 1,
150 }; 157 };
151 158
152 /* from now on we have normal address_space methods */
153 inode->i_data.a_ops = &udf_aops;
154
155 if (!iinfo->i_lenAlloc) { 159 if (!iinfo->i_lenAlloc) {
156 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 160 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
157 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 161 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
158 else 162 else
159 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 163 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
164 /* from now on we have normal address_space methods */
165 inode->i_data.a_ops = &udf_aops;
160 mark_inode_dirty(inode); 166 mark_inode_dirty(inode);
161 return; 167 return 0;
162 } 168 }
163 169
164 page = grab_cache_page(inode->i_mapping, 0); 170 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 BUG_ON(!PageLocked(page)); 171 if (!page)
172 return -ENOMEM;
166 173
167 if (!PageUptodate(page)) { 174 if (!PageUptodate(page)) {
168 kaddr = kmap(page); 175 kaddr = kmap(page);
@@ -181,11 +188,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
181 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 188 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
182 else 189 else
183 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 190 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
184 191 /* from now on we have normal address_space methods */
185 inode->i_data.a_ops->writepage(page, &udf_wbc); 192 inode->i_data.a_ops = &udf_aops;
193 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
194 if (err) {
195 /* Restore everything back so that we don't lose data... */
196 lock_page(page);
197 kaddr = kmap(page);
198 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
199 inode->i_size);
200 kunmap(page);
201 unlock_page(page);
202 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
203 inode->i_data.a_ops = &udf_adinicb_aops;
204 }
186 page_cache_release(page); 205 page_cache_release(page);
187
188 mark_inode_dirty(inode); 206 mark_inode_dirty(inode);
207
208 return err;
189} 209}
190 210
191struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, 211struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +368,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
348} 368}
349 369
350/* Extend the file by 'blocks' blocks, return the number of extents added */ 370/* Extend the file by 'blocks' blocks, return the number of extents added */
351int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 371static int udf_do_extend_file(struct inode *inode,
352 struct kernel_long_ad *last_ext, sector_t blocks) 372 struct extent_position *last_pos,
373 struct kernel_long_ad *last_ext,
374 sector_t blocks)
353{ 375{
354 sector_t add; 376 sector_t add;
355 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 377 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +379,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
357 struct kernel_lb_addr prealloc_loc = {}; 379 struct kernel_lb_addr prealloc_loc = {};
358 int prealloc_len = 0; 380 int prealloc_len = 0;
359 struct udf_inode_info *iinfo; 381 struct udf_inode_info *iinfo;
382 int err;
360 383
361 /* The previous extent is fake and we should not extend by anything 384 /* The previous extent is fake and we should not extend by anything
362 * - there's nothing to do... */ 385 * - there's nothing to do... */
@@ -422,26 +445,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
422 /* Create enough extents to cover the whole hole */ 445 /* Create enough extents to cover the whole hole */
423 while (blocks > add) { 446 while (blocks > add) {
424 blocks -= add; 447 blocks -= add;
425 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 448 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
426 last_ext->extLength, 1) == -1) 449 last_ext->extLength, 1);
427 return -1; 450 if (err)
451 return err;
428 count++; 452 count++;
429 } 453 }
430 if (blocks) { 454 if (blocks) {
431 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 455 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
432 (blocks << sb->s_blocksize_bits); 456 (blocks << sb->s_blocksize_bits);
433 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 457 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
434 last_ext->extLength, 1) == -1) 458 last_ext->extLength, 1);
435 return -1; 459 if (err)
460 return err;
436 count++; 461 count++;
437 } 462 }
438 463
439out: 464out:
440 /* Do we have some preallocated blocks saved? */ 465 /* Do we have some preallocated blocks saved? */
441 if (prealloc_len) { 466 if (prealloc_len) {
442 if (udf_add_aext(inode, last_pos, &prealloc_loc, 467 err = udf_add_aext(inode, last_pos, &prealloc_loc,
443 prealloc_len, 1) == -1) 468 prealloc_len, 1);
444 return -1; 469 if (err)
470 return err;
445 last_ext->extLocation = prealloc_loc; 471 last_ext->extLocation = prealloc_loc;
446 last_ext->extLength = prealloc_len; 472 last_ext->extLength = prealloc_len;
447 count++; 473 count++;
@@ -453,11 +479,68 @@ out:
453 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 479 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
454 last_pos->offset -= sizeof(struct long_ad); 480 last_pos->offset -= sizeof(struct long_ad);
455 else 481 else
456 return -1; 482 return -EIO;
457 483
458 return count; 484 return count;
459} 485}
460 486
487static int udf_extend_file(struct inode *inode, loff_t newsize)
488{
489
490 struct extent_position epos;
491 struct kernel_lb_addr eloc;
492 uint32_t elen;
493 int8_t etype;
494 struct super_block *sb = inode->i_sb;
495 sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
496 int adsize;
497 struct udf_inode_info *iinfo = UDF_I(inode);
498 struct kernel_long_ad extent;
499 int err;
500
501 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
502 adsize = sizeof(struct short_ad);
503 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
504 adsize = sizeof(struct long_ad);
505 else
506 BUG();
507
508 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
509
510 /* File has extent covering the new size (could happen when extending
511 * inside a block)? */
512 if (etype != -1)
513 return 0;
514 if (newsize & (sb->s_blocksize - 1))
515 offset++;
516 /* Extended file just to the boundary of the last file block? */
517 if (offset == 0)
518 return 0;
519
520 /* Truncate is extending the file by 'offset' blocks */
521 if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
522 (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
523 /* File has no extents at all or has empty last
524 * indirect extent! Create a fake extent... */
525 extent.extLocation.logicalBlockNum = 0;
526 extent.extLocation.partitionReferenceNum = 0;
527 extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
528 } else {
529 epos.offset -= adsize;
530 etype = udf_next_aext(inode, &epos, &extent.extLocation,
531 &extent.extLength, 0);
532 extent.extLength |= etype << 30;
533 }
534 err = udf_do_extend_file(inode, &epos, &extent, offset);
535 if (err < 0)
536 goto out;
537 err = 0;
538 iinfo->i_lenExtents = newsize;
539out:
540 brelse(epos.bh);
541 return err;
542}
543
461static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 544static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
462 int *err, sector_t *phys, int *new) 545 int *err, sector_t *phys, int *new)
463{ 546{
@@ -540,7 +623,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
540 elen = EXT_RECORDED_ALLOCATED | 623 elen = EXT_RECORDED_ALLOCATED |
541 ((elen + inode->i_sb->s_blocksize - 1) & 624 ((elen + inode->i_sb->s_blocksize - 1) &
542 ~(inode->i_sb->s_blocksize - 1)); 625 ~(inode->i_sb->s_blocksize - 1));
543 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); 626 udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
544 } 627 }
545 brelse(prev_epos.bh); 628 brelse(prev_epos.bh);
546 brelse(cur_epos.bh); 629 brelse(cur_epos.bh);
@@ -564,19 +647,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
564 memset(&laarr[0].extLocation, 0x00, 647 memset(&laarr[0].extLocation, 0x00,
565 sizeof(struct kernel_lb_addr)); 648 sizeof(struct kernel_lb_addr));
566 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 649 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
567 /* Will udf_extend_file() create real extent from 650 /* Will udf_do_extend_file() create real extent from
568 a fake one? */ 651 a fake one? */
569 startnum = (offset > 0); 652 startnum = (offset > 0);
570 } 653 }
571 /* Create extents for the hole between EOF and offset */ 654 /* Create extents for the hole between EOF and offset */
572 ret = udf_extend_file(inode, &prev_epos, laarr, offset); 655 ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
573 if (ret == -1) { 656 if (ret < 0) {
574 brelse(prev_epos.bh); 657 brelse(prev_epos.bh);
575 brelse(cur_epos.bh); 658 brelse(cur_epos.bh);
576 brelse(next_epos.bh); 659 brelse(next_epos.bh);
577 /* We don't really know the error here so we just make 660 *err = ret;
578 * something up */
579 *err = -ENOSPC;
580 return NULL; 661 return NULL;
581 } 662 }
582 c = 0; 663 c = 0;
@@ -1005,52 +1086,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
1005 return NULL; 1086 return NULL;
1006} 1087}
1007 1088
1008void udf_truncate(struct inode *inode) 1089int udf_setsize(struct inode *inode, loff_t newsize)
1009{ 1090{
1010 int offset;
1011 int err; 1091 int err;
1012 struct udf_inode_info *iinfo; 1092 struct udf_inode_info *iinfo;
1093 int bsize = 1 << inode->i_blkbits;
1013 1094
1014 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1095 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1015 S_ISLNK(inode->i_mode))) 1096 S_ISLNK(inode->i_mode)))
1016 return; 1097 return -EINVAL;
1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1098 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1018 return; 1099 return -EPERM;
1019 1100
1020 iinfo = UDF_I(inode); 1101 iinfo = UDF_I(inode);
1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1102 if (newsize > inode->i_size) {
1022 down_write(&iinfo->i_data_sem); 1103 down_write(&iinfo->i_data_sem);
1023 if (inode->i_sb->s_blocksize < 1104 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1024 (udf_file_entry_alloc_offset(inode) + 1105 if (bsize <
1025 inode->i_size)) { 1106 (udf_file_entry_alloc_offset(inode) + newsize)) {
1026 udf_expand_file_adinicb(inode, inode->i_size, &err); 1107 err = udf_expand_file_adinicb(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1108 if (err) {
1028 inode->i_size = iinfo->i_lenAlloc; 1109 up_write(&iinfo->i_data_sem);
1029 up_write(&iinfo->i_data_sem); 1110 return err;
1030 return; 1111 }
1031 } else 1112 } else
1032 udf_truncate_extents(inode); 1113 iinfo->i_lenAlloc = newsize;
1033 } else { 1114 }
1034 offset = inode->i_size & (inode->i_sb->s_blocksize - 1); 1115 err = udf_extend_file(inode, newsize);
1035 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, 1116 if (err) {
1036 0x00, inode->i_sb->s_blocksize - 1117 up_write(&iinfo->i_data_sem);
1037 offset - udf_file_entry_alloc_offset(inode)); 1118 return err;
1038 iinfo->i_lenAlloc = inode->i_size;
1039 } 1119 }
1120 truncate_setsize(inode, newsize);
1040 up_write(&iinfo->i_data_sem); 1121 up_write(&iinfo->i_data_sem);
1041 } else { 1122 } else {
1042 block_truncate_page(inode->i_mapping, inode->i_size, 1123 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1043 udf_get_block); 1124 down_write(&iinfo->i_data_sem);
1125 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
1126 0x00, bsize - newsize -
1127 udf_file_entry_alloc_offset(inode));
1128 iinfo->i_lenAlloc = newsize;
1129 truncate_setsize(inode, newsize);
1130 up_write(&iinfo->i_data_sem);
1131 goto update_time;
1132 }
1133 err = block_truncate_page(inode->i_mapping, newsize,
1134 udf_get_block);
1135 if (err)
1136 return err;
1044 down_write(&iinfo->i_data_sem); 1137 down_write(&iinfo->i_data_sem);
1138 truncate_setsize(inode, newsize);
1045 udf_truncate_extents(inode); 1139 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem); 1140 up_write(&iinfo->i_data_sem);
1047 } 1141 }
1048 1142update_time:
1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1143 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1050 if (IS_SYNC(inode)) 1144 if (IS_SYNC(inode))
1051 udf_sync_inode(inode); 1145 udf_sync_inode(inode);
1052 else 1146 else
1053 mark_inode_dirty(inode); 1147 mark_inode_dirty(inode);
1148 return 0;
1054} 1149}
1055 1150
1056static void __udf_read_inode(struct inode *inode) 1151static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1732,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1637 return NULL; 1732 return NULL;
1638} 1733}
1639 1734
1640int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1735int udf_add_aext(struct inode *inode, struct extent_position *epos,
1641 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1736 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1642{ 1737{
1643 int adsize; 1738 int adsize;
1644 struct short_ad *sad = NULL; 1739 struct short_ad *sad = NULL;
1645 struct long_ad *lad = NULL; 1740 struct long_ad *lad = NULL;
1646 struct allocExtDesc *aed; 1741 struct allocExtDesc *aed;
1647 int8_t etype;
1648 uint8_t *ptr; 1742 uint8_t *ptr;
1649 struct udf_inode_info *iinfo = UDF_I(inode); 1743 struct udf_inode_info *iinfo = UDF_I(inode);
1650 1744
@@ -1660,7 +1754,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1660 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1754 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1661 adsize = sizeof(struct long_ad); 1755 adsize = sizeof(struct long_ad);
1662 else 1756 else
1663 return -1; 1757 return -EIO;
1664 1758
1665 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1759 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1666 unsigned char *sptr, *dptr; 1760 unsigned char *sptr, *dptr;
@@ -1672,12 +1766,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 obloc.partitionReferenceNum, 1766 obloc.partitionReferenceNum,
1673 obloc.logicalBlockNum, &err); 1767 obloc.logicalBlockNum, &err);
1674 if (!epos->block.logicalBlockNum) 1768 if (!epos->block.logicalBlockNum)
1675 return -1; 1769 return -ENOSPC;
1676 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1770 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1677 &epos->block, 1771 &epos->block,
1678 0)); 1772 0));
1679 if (!nbh) 1773 if (!nbh)
1680 return -1; 1774 return -EIO;
1681 lock_buffer(nbh); 1775 lock_buffer(nbh);
1682 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); 1776 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
1683 set_buffer_uptodate(nbh); 1777 set_buffer_uptodate(nbh);
@@ -1746,7 +1840,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1746 epos->bh = nbh; 1840 epos->bh = nbh;
1747 } 1841 }
1748 1842
1749 etype = udf_write_aext(inode, epos, eloc, elen, inc); 1843 udf_write_aext(inode, epos, eloc, elen, inc);
1750 1844
1751 if (!epos->bh) { 1845 if (!epos->bh) {
1752 iinfo->i_lenAlloc += adsize; 1846 iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1858,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1764 mark_buffer_dirty_inode(epos->bh, inode); 1858 mark_buffer_dirty_inode(epos->bh, inode);
1765 } 1859 }
1766 1860
1767 return etype; 1861 return 0;
1768} 1862}
1769 1863
1770int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1864void udf_write_aext(struct inode *inode, struct extent_position *epos,
1771 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1865 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1772{ 1866{
1773 int adsize; 1867 int adsize;
1774 uint8_t *ptr; 1868 uint8_t *ptr;
@@ -1798,7 +1892,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1798 adsize = sizeof(struct long_ad); 1892 adsize = sizeof(struct long_ad);
1799 break; 1893 break;
1800 default: 1894 default:
1801 return -1; 1895 return;
1802 } 1896 }
1803 1897
1804 if (epos->bh) { 1898 if (epos->bh) {
@@ -1817,8 +1911,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1817 1911
1818 if (inc) 1912 if (inc)
1819 epos->offset += adsize; 1913 epos->offset += adsize;
1820
1821 return (elen >> 30);
1822} 1914}
1823 1915
1824int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1916int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
32#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
33#include <linux/exportfs.h> 33#include <linux/exportfs.h>
34 34
35enum { UDF_MAX_LINKS = 0xffff };
36
35static inline int udf_match(int len1, const unsigned char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
36 const unsigned char *name2) 38 const unsigned char *name2)
37{ 39{
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
650 struct udf_inode_info *iinfo; 652 struct udf_inode_info *iinfo;
651 653
652 err = -EMLINK; 654 err = -EMLINK;
653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 655 if (dir->i_nlink >= UDF_MAX_LINKS)
654 goto out; 656 goto out;
655 657
656 err = -EIO; 658 err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1034 struct fileIdentDesc cfi, *fi; 1036 struct fileIdentDesc cfi, *fi;
1035 int err; 1037 int err;
1036 1038
1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1039 if (inode->i_nlink >= UDF_MAX_LINKS)
1038 return -EMLINK; 1040 return -EMLINK;
1039 }
1040 1041
1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1042 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1042 if (!fi) { 1043 if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 goto end_rename; 1132 goto end_rename;
1132 1133
1133 retval = -EMLINK; 1134 retval = -EMLINK;
1134 if (!new_inode && 1135 if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
1135 new_dir->i_nlink >=
1136 (256 << sizeof(new_dir->i_nlink)) - 1)
1137 goto end_rename; 1136 goto end_rename;
1138 } 1137 }
1139 if (!nfi) { 1138 if (!nfi) {
@@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1287 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1288 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1289 1288
1290 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255; 1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1294 return 255;
1295 }
1292 1296
1293 *lenp = 3; 1297 *lenp = 3;
1294 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
197 mark_buffer_dirty_inode(epos->bh, inode); 197 mark_buffer_dirty_inode(epos->bh, inode);
198} 198}
199 199
200/*
201 * Truncate extents of inode to inode->i_size. This function can be used only
202 * for making file shorter. For making file longer, udf_extend_file() has to
203 * be used.
204 */
200void udf_truncate_extents(struct inode *inode) 205void udf_truncate_extents(struct inode *inode)
201{ 206{
202 struct extent_position epos; 207 struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
219 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); 224 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
220 byte_offset = (offset << sb->s_blocksize_bits) + 225 byte_offset = (offset << sb->s_blocksize_bits) +
221 (inode->i_size & (sb->s_blocksize - 1)); 226 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 227 if (etype == -1) {
223 epos.offset -= adsize; 228 /* We should extend the file? */
224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); 229 WARN_ON(byte_offset);
225 epos.offset += adsize; 230 return;
226 if (byte_offset) 231 }
227 lenalloc = epos.offset; 232 epos.offset -= adsize;
228 else 233 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
229 lenalloc = epos.offset - adsize; 234 epos.offset += adsize;
230 235 if (byte_offset)
231 if (!epos.bh) 236 lenalloc = epos.offset;
232 lenalloc -= udf_file_entry_alloc_offset(inode); 237 else
233 else 238 lenalloc = epos.offset - adsize;
234 lenalloc -= sizeof(struct allocExtDesc);
235
236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) {
241 /* We managed to free all extents in the
242 * indirect extent - free it too */
243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len);
246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc;
248 mark_inode_dirty(inode);
249 } else
250 udf_update_alloc_ext_desc(inode,
251 &epos, lenalloc);
252 brelse(epos.bh);
253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc;
255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen)
258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >>
260 sb->s_blocksize_bits;
261 else
262 indirect_ext_len = 1;
263 } else {
264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0);
266 epos.offset += adsize;
267 }
268 }
269 239
270 if (indirect_ext_len) { 240 if (!epos.bh)
271 BUG_ON(!epos.bh); 241 lenalloc -= udf_file_entry_alloc_offset(inode);
272 udf_free_blocks(sb, inode, &epos.block, 0, 242 else
273 indirect_ext_len); 243 lenalloc -= sizeof(struct allocExtDesc);
274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc;
276 mark_inode_dirty(inode);
277 } else
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) {
280 if (byte_offset) {
281 struct kernel_long_ad extent;
282 244
283 /* 245 while ((etype = udf_current_aext(inode, &epos, &eloc,
284 * OK, there is not extent covering inode->i_size and 246 &elen, 0)) != -1) {
285 * no extent above inode->i_size => truncate is 247 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
286 * extending the file by 'offset' blocks. 248 udf_write_aext(inode, &epos, &neloc, nelen, 0);
287 */ 249 if (indirect_ext_len) {
288 if ((!epos.bh && 250 /* We managed to free all extents in the
289 epos.offset == 251 * indirect extent - free it too */
290 udf_file_entry_alloc_offset(inode)) || 252 BUG_ON(!epos.bh);
291 (epos.bh && epos.offset == 253 udf_free_blocks(sb, inode, &epos.block,
292 sizeof(struct allocExtDesc))) { 254 0, indirect_ext_len);
293 /* File has no extents at all or has empty last 255 } else if (!epos.bh) {
294 * indirect extent! Create a fake extent... */ 256 iinfo->i_lenAlloc = lenalloc;
295 extent.extLocation.logicalBlockNum = 0; 257 mark_inode_dirty(inode);
296 extent.extLocation.partitionReferenceNum = 0; 258 } else
297 extent.extLength = 259 udf_update_alloc_ext_desc(inode,
298 EXT_NOT_RECORDED_NOT_ALLOCATED; 260 &epos, lenalloc);
299 } else { 261 brelse(epos.bh);
300 epos.offset -= adsize; 262 epos.offset = sizeof(struct allocExtDesc);
301 etype = udf_next_aext(inode, &epos, 263 epos.block = eloc;
302 &extent.extLocation, 264 epos.bh = udf_tread(sb,
303 &extent.extLength, 0); 265 udf_get_lb_pblock(sb, &eloc, 0));
304 extent.extLength |= etype << 30; 266 if (elen)
305 } 267 indirect_ext_len =
306 udf_extend_file(inode, &epos, &extent, 268 (elen + sb->s_blocksize - 1) >>
307 offset + 269 sb->s_blocksize_bits;
308 ((inode->i_size & 270 else
309 (sb->s_blocksize - 1)) != 0)); 271 indirect_ext_len = 1;
272 } else {
273 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
274 epos.offset += adsize;
310 } 275 }
311 } 276 }
277
278 if (indirect_ext_len) {
279 BUG_ON(!epos.bh);
280 udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
281 } else if (!epos.bh) {
282 iinfo->i_lenAlloc = lenalloc;
283 mark_inode_dirty(inode);
284 } else
285 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
312 iinfo->i_lenExtents = inode->i_size; 286 iinfo->i_lenExtents = inode->i_size;
313 287
314 brelse(epos.bh); 288 brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
136extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
137/* inode.c */ 137/* inode.c */
138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
139extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern int udf_expand_file_adinicb(struct inode *);
140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
141extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
142extern void udf_truncate(struct inode *); 142extern int udf_setsize(struct inode *, loff_t);
143extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
144extern void udf_evict_inode(struct inode *); 144extern void udf_evict_inode(struct inode *);
145extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t);
149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 147extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t *, sector_t *); 148 struct kernel_lb_addr *, uint32_t *, sector_t *);
151extern int8_t udf_add_aext(struct inode *, struct extent_position *, 149extern int udf_add_aext(struct inode *, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t, int);
151extern void udf_write_aext(struct inode *, struct extent_position *,
152 struct kernel_lb_addr *, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
154 struct kernel_lb_addr *, uint32_t, int);
155extern int8_t udf_delete_aext(struct inode *, struct extent_position, 153extern int8_t udf_delete_aext(struct inode *, struct extent_position,
156 struct kernel_lb_addr, uint32_t); 154 struct kernel_lb_addr, uint32_t);
157extern int8_t udf_next_aext(struct inode *, struct extent_position *, 155extern int8_t udf_next_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253d..e4f10a40768a 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
5 help 4 help
6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
7 OpenBSD and NeXTstep) use a file system called UFS. Some System V 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..27a4babe7df0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/stat.h> 34#include <linux/stat.h>
35#include <linux/string.h> 35#include <linux/string.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
40 39
@@ -43,7 +42,7 @@
43#include "swab.h" 42#include "swab.h"
44#include "util.h" 43#include "util.h"
45 44
46static u64 ufs_frag_map(struct inode *inode, sector_t frag); 45static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
47 46
48static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
49{ 48{
@@ -82,7 +81,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
82 * the begining of the filesystem. 81 * the begining of the filesystem.
83 */ 82 */
84 83
85static u64 ufs_frag_map(struct inode *inode, sector_t frag) 84static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
86{ 85{
87 struct ufs_inode_info *ufsi = UFS_I(inode); 86 struct ufs_inode_info *ufsi = UFS_I(inode);
88 struct super_block *sb = inode->i_sb; 87 struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
107 106
108 p = offsets; 107 p = offsets;
109 108
110 lock_kernel(); 109 if (needs_lock)
110 lock_ufs(sb);
111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
112 goto ufs2; 112 goto ufs2;
113 113
@@ -152,7 +152,8 @@ ufs2:
152 ret = temp + (u64) (frag & uspi->s_fpbmask); 152 ret = temp + (u64) (frag & uspi->s_fpbmask);
153 153
154out: 154out:
155 unlock_kernel(); 155 if (needs_lock)
156 unlock_ufs(sb);
156 return ret; 157 return ret;
157} 158}
158 159
@@ -415,14 +416,16 @@ out:
415int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 416int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
416{ 417{
417 struct super_block * sb = inode->i_sb; 418 struct super_block * sb = inode->i_sb;
418 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 419 struct ufs_sb_info * sbi = UFS_SB(sb);
420 struct ufs_sb_private_info * uspi = sbi->s_uspi;
419 struct buffer_head * bh; 421 struct buffer_head * bh;
420 int ret, err, new; 422 int ret, err, new;
421 unsigned long ptr,phys; 423 unsigned long ptr,phys;
422 u64 phys64 = 0; 424 u64 phys64 = 0;
425 bool needs_lock = (sbi->mutex_owner != current);
423 426
424 if (!create) { 427 if (!create) {
425 phys64 = ufs_frag_map(inode, fragment); 428 phys64 = ufs_frag_map(inode, fragment, needs_lock);
426 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 429 UFSD("phys64 = %llu\n", (unsigned long long)phys64);
427 if (phys64) 430 if (phys64)
428 map_bh(bh_result, sb, phys64); 431 map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
436 ret = 0; 439 ret = 0;
437 bh = NULL; 440 bh = NULL;
438 441
439 lock_kernel(); 442 if (needs_lock)
443 lock_ufs(sb);
440 444
441 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
442 if (fragment > 446 if (fragment >
@@ -498,7 +502,9 @@ out:
498 set_buffer_new(bh_result); 502 set_buffer_new(bh_result);
499 map_bh(bh_result, sb, phys); 503 map_bh(bh_result, sb, phys);
500abort: 504abort:
501 unlock_kernel(); 505 if (needs_lock)
506 unlock_ufs(sb);
507
502 return err; 508 return err;
503 509
504abort_too_big: 510abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
506 goto abort; 512 goto abort;
507} 513}
508 514
509static struct buffer_head *ufs_getfrag(struct inode *inode,
510 unsigned int fragment,
511 int create, int *err)
512{
513 struct buffer_head dummy;
514 int error;
515
516 dummy.b_state = 0;
517 dummy.b_blocknr = -1000;
518 error = ufs_getfrag_block(inode, fragment, &dummy, create);
519 *err = error;
520 if (!error && buffer_mapped(&dummy)) {
521 struct buffer_head *bh;
522 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
523 if (buffer_new(&dummy)) {
524 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
525 set_buffer_uptodate(bh);
526 mark_buffer_dirty(bh);
527 }
528 return bh;
529 }
530 return NULL;
531}
532
533struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
534 int create, int * err)
535{
536 struct buffer_head * bh;
537
538 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
539 bh = ufs_getfrag (inode, fragment, create, err);
540 if (!bh || buffer_uptodate(bh))
541 return bh;
542 ll_rw_block (READ, 1, &bh);
543 wait_on_buffer (bh);
544 if (buffer_uptodate(bh))
545 return bh;
546 brelse (bh);
547 *err = -EIO;
548 return NULL;
549}
550
551static int ufs_writepage(struct page *page, struct writeback_control *wbc) 515static int ufs_writepage(struct page *page, struct writeback_control *wbc)
552{ 516{
553 return block_write_full_page(page,ufs_getfrag_block,wbc); 517 return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -588,7 +552,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
588const struct address_space_operations ufs_aops = { 552const struct address_space_operations ufs_aops = {
589 .readpage = ufs_readpage, 553 .readpage = ufs_readpage,
590 .writepage = ufs_writepage, 554 .writepage = ufs_writepage,
591 .sync_page = block_sync_page,
592 .write_begin = ufs_write_begin, 555 .write_begin = ufs_write_begin,
593 .write_end = generic_write_end, 556 .write_end = generic_write_end,
594 .bmap = ufs_bmap 557 .bmap = ufs_bmap
@@ -900,9 +863,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
900int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 863int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
901{ 864{
902 int ret; 865 int ret;
903 lock_kernel(); 866 lock_ufs(inode->i_sb);
904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 867 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
905 unlock_kernel(); 868 unlock_ufs(inode->i_sb);
906 return ret; 869 return ret;
907} 870}
908 871
@@ -922,22 +885,22 @@ void ufs_evict_inode(struct inode * inode)
922 if (want_delete) { 885 if (want_delete) {
923 loff_t old_i_size; 886 loff_t old_i_size;
924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 887 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
925 lock_kernel(); 888 lock_ufs(inode->i_sb);
926 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
927 ufs_update_inode(inode, IS_SYNC(inode)); 890 ufs_update_inode(inode, IS_SYNC(inode));
928 old_i_size = inode->i_size; 891 old_i_size = inode->i_size;
929 inode->i_size = 0; 892 inode->i_size = 0;
930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 893 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 894 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
932 unlock_kernel(); 895 unlock_ufs(inode->i_sb);
933 } 896 }
934 897
935 invalidate_inode_buffers(inode); 898 invalidate_inode_buffers(inode);
936 end_writeback(inode); 899 end_writeback(inode);
937 900
938 if (want_delete) { 901 if (want_delete) {
939 lock_kernel(); 902 lock_ufs(inode->i_sb);
940 ufs_free_inode (inode); 903 ufs_free_inode (inode);
941 unlock_kernel(); 904 unlock_ufs(inode->i_sb);
942 } 905 }
943} 906}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h>
33 32
34#include "ufs_fs.h" 33#include "ufs_fs.h"
35#include "ufs.h" 34#include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
55 if (dentry->d_name.len > UFS_MAXNAMLEN) 54 if (dentry->d_name.len > UFS_MAXNAMLEN)
56 return ERR_PTR(-ENAMETOOLONG); 55 return ERR_PTR(-ENAMETOOLONG);
57 56
58 lock_kernel(); 57 lock_ufs(dir->i_sb);
59 ino = ufs_inode_by_name(dir, &dentry->d_name); 58 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 59 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 60 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 61 if (IS_ERR(inode)) {
63 unlock_kernel(); 62 unlock_ufs(dir->i_sb);
64 return ERR_CAST(inode); 63 return ERR_CAST(inode);
65 } 64 }
66 } 65 }
67 unlock_kernel(); 66 unlock_ufs(dir->i_sb);
68 d_add(dentry, inode); 67 d_add(dentry, inode);
69 return NULL; 68 return NULL;
70} 69}
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
93 inode->i_fop = &ufs_file_operations; 92 inode->i_fop = &ufs_file_operations;
94 inode->i_mapping->a_ops = &ufs_aops; 93 inode->i_mapping->a_ops = &ufs_aops;
95 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
96 lock_kernel(); 95 lock_ufs(dir->i_sb);
97 err = ufs_add_nondir(dentry, inode); 96 err = ufs_add_nondir(dentry, inode);
98 unlock_kernel(); 97 unlock_ufs(dir->i_sb);
99 } 98 }
100 UFSD("END: err=%d\n", err); 99 UFSD("END: err=%d\n", err);
101 return err; 100 return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
115 init_special_inode(inode, mode, rdev); 114 init_special_inode(inode, mode, rdev);
116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 115 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
117 mark_inode_dirty(inode); 116 mark_inode_dirty(inode);
118 lock_kernel(); 117 lock_ufs(dir->i_sb);
119 err = ufs_add_nondir(dentry, inode); 118 err = ufs_add_nondir(dentry, inode);
120 unlock_kernel(); 119 unlock_ufs(dir->i_sb);
121 } 120 }
122 return err; 121 return err;
123} 122}
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
133 if (l > sb->s_blocksize) 132 if (l > sb->s_blocksize)
134 goto out_notlocked; 133 goto out_notlocked;
135 134
136 lock_kernel(); 135 lock_ufs(dir->i_sb);
137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 136 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
138 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
139 if (IS_ERR(inode)) 138 if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
156 155
157 err = ufs_add_nondir(dentry, inode); 156 err = ufs_add_nondir(dentry, inode);
158out: 157out:
159 unlock_kernel(); 158 unlock_ufs(dir->i_sb);
160out_notlocked: 159out_notlocked:
161 return err; 160 return err;
162 161
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
172 struct inode *inode = old_dentry->d_inode; 171 struct inode *inode = old_dentry->d_inode;
173 int error; 172 int error;
174 173
175 lock_kernel(); 174 lock_ufs(dir->i_sb);
176 if (inode->i_nlink >= UFS_LINK_MAX) { 175 if (inode->i_nlink >= UFS_LINK_MAX) {
177 unlock_kernel(); 176 unlock_ufs(dir->i_sb);
178 return -EMLINK; 177 return -EMLINK;
179 } 178 }
180 179
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
183 ihold(inode); 182 ihold(inode);
184 183
185 error = ufs_add_nondir(dentry, inode); 184 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 185 unlock_ufs(dir->i_sb);
187 return error; 186 return error;
188} 187}
189 188
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
195 if (dir->i_nlink >= UFS_LINK_MAX) 194 if (dir->i_nlink >= UFS_LINK_MAX)
196 goto out; 195 goto out;
197 196
198 lock_kernel(); 197 lock_ufs(dir->i_sb);
199 inode_inc_link_count(dir); 198 inode_inc_link_count(dir);
200 199
201 inode = ufs_new_inode(dir, S_IFDIR|mode); 200 inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 err = ufs_add_link(dentry, inode); 215 err = ufs_add_link(dentry, inode);
217 if (err) 216 if (err)
218 goto out_fail; 217 goto out_fail;
219 unlock_kernel(); 218 unlock_ufs(dir->i_sb);
220 219
221 d_instantiate(dentry, inode); 220 d_instantiate(dentry, inode);
222out: 221out:
@@ -228,7 +227,7 @@ out_fail:
228 iput (inode); 227 iput (inode);
229out_dir: 228out_dir:
230 inode_dec_link_count(dir); 229 inode_dec_link_count(dir);
231 unlock_kernel(); 230 unlock_ufs(dir->i_sb);
232 goto out; 231 goto out;
233} 232}
234 233
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
259 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
260 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
261 260
262 lock_kernel(); 261 lock_ufs(dir->i_sb);
263 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
264 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
265 if (!err) { 264 if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
268 inode_dec_link_count(dir); 267 inode_dec_link_count(dir);
269 } 268 }
270 } 269 }
271 unlock_kernel(); 270 unlock_ufs(dir->i_sb);
272 return err; 271 return err;
273} 272}
274 273
@@ -306,7 +305,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); 305 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
307 if (!new_de) 306 if (!new_de)
308 goto out_dir; 307 goto out_dir;
309 inode_inc_link_count(old_inode);
310 ufs_set_link(new_dir, new_de, new_page, old_inode); 308 ufs_set_link(new_dir, new_de, new_page, old_inode);
311 new_inode->i_ctime = CURRENT_TIME_SEC; 309 new_inode->i_ctime = CURRENT_TIME_SEC;
312 if (dir_de) 310 if (dir_de)
@@ -318,12 +316,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
318 if (new_dir->i_nlink >= UFS_LINK_MAX) 316 if (new_dir->i_nlink >= UFS_LINK_MAX)
319 goto out_dir; 317 goto out_dir;
320 } 318 }
321 inode_inc_link_count(old_inode);
322 err = ufs_add_link(new_dentry, old_inode); 319 err = ufs_add_link(new_dentry, old_inode);
323 if (err) { 320 if (err)
324 inode_dec_link_count(old_inode);
325 goto out_dir; 321 goto out_dir;
326 }
327 if (dir_de) 322 if (dir_de)
328 inode_inc_link_count(new_dir); 323 inode_inc_link_count(new_dir);
329 } 324 }
@@ -331,12 +326,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
331 /* 326 /*
332 * Like most other Unix systems, set the ctime for inodes on a 327 * Like most other Unix systems, set the ctime for inodes on a
333 * rename. 328 * rename.
334 * inode_dec_link_count() will mark the inode dirty.
335 */ 329 */
336 old_inode->i_ctime = CURRENT_TIME_SEC; 330 old_inode->i_ctime = CURRENT_TIME_SEC;
337 331
338 ufs_delete_entry(old_dir, old_de, old_page); 332 ufs_delete_entry(old_dir, old_de, old_page);
339 inode_dec_link_count(old_inode); 333 mark_inode_dirty(old_inode);
340 334
341 if (dir_de) { 335 if (dir_de) {
342 ufs_set_link(old_inode, dir_de, dir_page, new_dir); 336 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e48..7693d6293404 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
84#include <linux/blkdev.h> 84#include <linux/blkdev.h>
85#include <linux/init.h> 85#include <linux/init.h>
86#include <linux/parser.h> 86#include <linux/parser.h>
87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 87#include <linux/buffer_head.h>
89#include <linux/vfs.h> 88#include <linux/vfs.h>
90#include <linux/log2.h> 89#include <linux/log2.h>
@@ -96,6 +95,26 @@
96#include "swab.h" 95#include "swab.h"
97#include "util.h" 96#include "util.h"
98 97
98void lock_ufs(struct super_block *sb)
99{
100#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
101 struct ufs_sb_info *sbi = UFS_SB(sb);
102
103 mutex_lock(&sbi->mutex);
104 sbi->mutex_owner = current;
105#endif
106}
107
108void unlock_ufs(struct super_block *sb)
109{
110#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
111 struct ufs_sb_info *sbi = UFS_SB(sb);
112
113 sbi->mutex_owner = NULL;
114 mutex_unlock(&sbi->mutex);
115#endif
116}
117
99static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 118static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
100{ 119{
101 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 120 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
313 struct ufs_super_block_first * usb1; 332 struct ufs_super_block_first * usb1;
314 va_list args; 333 va_list args;
315 334
316 lock_kernel();
317 uspi = UFS_SB(sb)->s_uspi; 335 uspi = UFS_SB(sb)->s_uspi;
318 usb1 = ubh_get_usb_first(uspi); 336 usb1 = ubh_get_usb_first(uspi);
319 337
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
521 */ 539 */
522 size = uspi->s_cssize; 540 size = uspi->s_cssize;
523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 541 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
524 base = space = kmalloc(size, GFP_KERNEL); 542 base = space = kmalloc(size, GFP_NOFS);
525 if (!base) 543 if (!base)
526 goto failed; 544 goto failed;
527 sbi->s_csp = (struct ufs_csum *)space; 545 sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
546 * Read cylinder group (we read only first fragment from block 564 * Read cylinder group (we read only first fragment from block
547 * at this time) and prepare internal data structures for cg caching. 565 * at this time) and prepare internal data structures for cg caching.
548 */ 566 */
549 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) 567 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
550 goto failed; 568 goto failed;
551 for (i = 0; i < uspi->s_ncg; i++) 569 for (i = 0; i < uspi->s_ncg; i++)
552 sbi->s_ucg[i] = NULL; 570 sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
564 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 582 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
565 } 583 }
566 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 584 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
567 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 585 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
568 goto failed; 586 goto failed;
569 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 587 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
570 } 588 }
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
646 664
647 UFSD("ENTER\n"); 665 UFSD("ENTER\n");
648 666
649 lock_kernel();
650
651 ufs_put_cstotal(sb); 667 ufs_put_cstotal(sb);
652 size = uspi->s_cssize; 668 size = uspi->s_cssize;
653 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 669 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
676 kfree (sbi->s_ucg); 692 kfree (sbi->s_ucg);
677 kfree (base); 693 kfree (base);
678 694
679 unlock_kernel();
680
681 UFSD("EXIT\n"); 695 UFSD("EXIT\n");
682} 696}
683 697
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 710 unsigned maxsymlen;
697 int ret = -EINVAL; 711 int ret = -EINVAL;
698 712
699 lock_kernel();
700
701 uspi = NULL; 713 uspi = NULL;
702 ubh = NULL; 714 ubh = NULL;
703 flags = 0; 715 flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
718 goto failed; 730 goto failed;
719 } 731 }
720#endif 732#endif
733 mutex_init(&sbi->mutex);
721 /* 734 /*
722 * Set default mount options 735 * Set default mount options
723 * Parse mount options 736 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
1165 goto failed; 1178 goto failed;
1166 1179
1167 UFSD("EXIT\n"); 1180 UFSD("EXIT\n");
1168 unlock_kernel();
1169 return 0; 1181 return 0;
1170 1182
1171dalloc_failed: 1183dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
1177 kfree(sbi); 1189 kfree(sbi);
1178 sb->s_fs_info = NULL; 1190 sb->s_fs_info = NULL;
1179 UFSD("EXIT (FAILED)\n"); 1191 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1181 return ret; 1192 return ret;
1182 1193
1183failed_nomem: 1194failed_nomem:
1184 UFSD("EXIT (NOMEM)\n"); 1195 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1186 return -ENOMEM; 1196 return -ENOMEM;
1187} 1197}
1188 1198
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1193 struct ufs_super_block_third * usb3; 1203 struct ufs_super_block_third * usb3;
1194 unsigned flags; 1204 unsigned flags;
1195 1205
1206 lock_ufs(sb);
1196 lock_super(sb); 1207 lock_super(sb);
1197 lock_kernel();
1198 1208
1199 UFSD("ENTER\n"); 1209 UFSD("ENTER\n");
1200 1210
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1213 sb->s_dirt = 0; 1223 sb->s_dirt = 0;
1214 1224
1215 UFSD("EXIT\n"); 1225 UFSD("EXIT\n");
1216 unlock_kernel();
1217 unlock_super(sb); 1226 unlock_super(sb);
1227 unlock_ufs(sb);
1218 1228
1219 return 0; 1229 return 0;
1220} 1230}
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1256 unsigned new_mount_opt, ufstype; 1266 unsigned new_mount_opt, ufstype;
1257 unsigned flags; 1267 unsigned flags;
1258 1268
1259 lock_kernel(); 1269 lock_ufs(sb);
1260 lock_super(sb); 1270 lock_super(sb);
1261 uspi = UFS_SB(sb)->s_uspi; 1271 uspi = UFS_SB(sb)->s_uspi;
1262 flags = UFS_SB(sb)->s_flags; 1272 flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1272 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1282 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1273 if (!ufs_parse_options (data, &new_mount_opt)) { 1283 if (!ufs_parse_options (data, &new_mount_opt)) {
1274 unlock_super(sb); 1284 unlock_super(sb);
1275 unlock_kernel(); 1285 unlock_ufs(sb);
1276 return -EINVAL; 1286 return -EINVAL;
1277 } 1287 }
1278 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1288 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1290 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1281 printk("ufstype can't be changed during remount\n"); 1291 printk("ufstype can't be changed during remount\n");
1282 unlock_super(sb); 1292 unlock_super(sb);
1283 unlock_kernel(); 1293 unlock_ufs(sb);
1284 return -EINVAL; 1294 return -EINVAL;
1285 } 1295 }
1286 1296
1287 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1297 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1288 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1298 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1289 unlock_super(sb); 1299 unlock_super(sb);
1290 unlock_kernel(); 1300 unlock_ufs(sb);
1291 return 0; 1301 return 0;
1292 } 1302 }
1293 1303
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1313 printk("ufs was compiled with read-only support, " 1323 printk("ufs was compiled with read-only support, "
1314 "can't be mounted as read-write\n"); 1324 "can't be mounted as read-write\n");
1315 unlock_super(sb); 1325 unlock_super(sb);
1316 unlock_kernel(); 1326 unlock_ufs(sb);
1317 return -EINVAL; 1327 return -EINVAL;
1318#else 1328#else
1319 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1329 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1323 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1333 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1324 printk("this ufstype is read-only supported\n"); 1334 printk("this ufstype is read-only supported\n");
1325 unlock_super(sb); 1335 unlock_super(sb);
1326 unlock_kernel(); 1336 unlock_ufs(sb);
1327 return -EINVAL; 1337 return -EINVAL;
1328 } 1338 }
1329 if (!ufs_read_cylinder_structures(sb)) { 1339 if (!ufs_read_cylinder_structures(sb)) {
1330 printk("failed during remounting\n"); 1340 printk("failed during remounting\n");
1331 unlock_super(sb); 1341 unlock_super(sb);
1332 unlock_kernel(); 1342 unlock_ufs(sb);
1333 return -EPERM; 1343 return -EPERM;
1334 } 1344 }
1335 sb->s_flags &= ~MS_RDONLY; 1345 sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1337 } 1347 }
1338 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1348 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1339 unlock_super(sb); 1349 unlock_super(sb);
1340 unlock_kernel(); 1350 unlock_ufs(sb);
1341 return 0; 1351 return 0;
1342} 1352}
1343 1353
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1371 struct ufs_super_block_third *usb3; 1381 struct ufs_super_block_third *usb3;
1372 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1382 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1373 1383
1374 lock_kernel(); 1384 lock_ufs(sb);
1375 1385
1376 usb1 = ubh_get_usb_first(uspi); 1386 usb1 = ubh_get_usb_first(uspi);
1377 usb2 = ubh_get_usb_second(uspi); 1387 usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1395 buf->f_fsid.val[0] = (u32)id; 1405 buf->f_fsid.val[0] = (u32)id;
1396 buf->f_fsid.val[1] = (u32)(id >> 32); 1406 buf->f_fsid.val[1] = (u32)(id >> 32);
1397 1407
1398 unlock_kernel(); 1408 unlock_ufs(sb);
1399 1409
1400 return 0; 1410 return 0;
1401} 1411}
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
1405static struct inode *ufs_alloc_inode(struct super_block *sb) 1415static struct inode *ufs_alloc_inode(struct super_block *sb)
1406{ 1416{
1407 struct ufs_inode_info *ei; 1417 struct ufs_inode_info *ei;
1408 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); 1418 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
1409 if (!ei) 1419 if (!ei)
1410 return NULL; 1420 return NULL;
1411 ei->vfs_inode.i_version = 1; 1421 ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..11014302c9ca 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
40#include <linux/time.h> 40#include <linux/time.h>
41#include <linux/stat.h> 41#include <linux/stat.h>
42#include <linux/string.h> 42#include <linux/string.h>
43#include <linux/smp_lock.h>
44#include <linux/buffer_head.h> 43#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 44#include <linux/blkdev.h>
46#include <linux/sched.h> 45#include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
467 466
468 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 467 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
469 468
470 lock_kernel();
471 while (1) { 469 while (1) {
472 retry = ufs_trunc_direct(inode); 470 retry = ufs_trunc_direct(inode);
473 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, 471 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -481,13 +479,12 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
481 break; 479 break;
482 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) 480 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
483 ufs_sync_inode (inode); 481 ufs_sync_inode (inode);
484 blk_run_address_space(inode->i_mapping); 482 blk_flush_plug(current);
485 yield(); 483 yield();
486 } 484 }
487 485
488 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 486 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
489 ufsi->i_lastfrag = DIRECT_FRAGMENT; 487 ufsi->i_lastfrag = DIRECT_FRAGMENT;
490 unlock_kernel();
491 mark_inode_dirty(inode); 488 mark_inode_dirty(inode);
492out: 489out:
493 UFSD("EXIT: err %d\n", err); 490 UFSD("EXIT: err %d\n", err);
@@ -510,7 +507,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
510 /* XXX(truncate): truncate_setsize should be called last */ 507 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size); 508 truncate_setsize(inode, attr->ia_size);
512 509
510 lock_ufs(inode->i_sb);
513 error = ufs_truncate(inode, old_i_size); 511 error = ufs_truncate(inode, old_i_size);
512 unlock_ufs(inode->i_sb);
514 if (error) 513 if (error)
515 return error; 514 return error;
516 } 515 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48a..5be2755dd715 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
18 unsigned s_cgno[UFS_MAX_GROUP_LOADED]; 18 unsigned s_cgno[UFS_MAX_GROUP_LOADED];
19 unsigned short s_cg_loaded; 19 unsigned short s_cg_loaded;
20 unsigned s_mount_opt; 20 unsigned s_mount_opt;
21 struct mutex mutex;
22 struct task_struct *mutex_owner;
21}; 23};
22 24
23struct ufs_inode_info { 25struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 111extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 112extern int ufs_sync_inode (struct inode *);
111extern void ufs_evict_inode (struct inode *); 113extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 114extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 115
115/* namei.c */ 116/* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
154 return do_div(b, uspi->s_fpg); 155 return do_div(b, uspi->s_fpg);
155} 156}
156 157
158extern void lock_ufs(struct super_block *sb);
159extern void unlock_ufs(struct super_block *sb);
160
157#endif /* _UFS_UFS_H */ 161#endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe66..95425b59ce0a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 if (count > UFS_MAXFRAG) 27 if (count > UFS_MAXFRAG)
28 return NULL; 28 return NULL;
29 ubh = (struct ufs_buffer_head *) 29 ubh = (struct ufs_buffer_head *)
30 kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); 30 kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
31 if (!ubh) 31 if (!ubh)
32 return NULL; 32 return NULL;
33 ubh->fragment = fragment; 33 ubh->fragment = fragment;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9f8775ce381c..954175928240 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -408,7 +408,7 @@ static inline unsigned _ubh_find_next_zero_bit_(
408 for (;;) { 408 for (;;) {
409 count = min_t(unsigned int, size + offset, uspi->s_bpf); 409 count = min_t(unsigned int, size + offset, uspi->s_bpf);
410 size -= count - offset; 410 size -= count - offset;
411 pos = ext2_find_next_zero_bit (ubh->bh[base]->b_data, count, offset); 411 pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
412 if (pos < count || !size) 412 if (pos < count || !size)
413 break; 413 break;
414 base++; 414 base++;
diff --git a/fs/utimes.c b/fs/utimes.c
index 179b58690657..ba653f3dc1bc 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times)
95 if (IS_IMMUTABLE(inode)) 95 if (IS_IMMUTABLE(inode))
96 goto mnt_drop_write_and_out; 96 goto mnt_drop_write_and_out;
97 97
98 if (!is_owner_or_cap(inode)) { 98 if (!inode_owner_or_capable(inode)) {
99 error = inode_permission(inode, MAY_WRITE); 99 error = inode_permission(inode, MAY_WRITE);
100 if (error) 100 if (error)
101 goto mnt_drop_write_and_out; 101 goto mnt_drop_write_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 01bb8135e14a..a19acdb81cd1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 60 return -EPERM;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !is_owner_or_cap(inode)) 62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 } 64 }
65 65
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca44997099..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -105,11 +102,10 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 102 xfs_globals.o \
106 xfs_ioctl.o \ 103 xfs_ioctl.o \
107 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
108 xfs_super.o \ 106 xfs_super.o \
109 xfs_sync.o \ 107 xfs_sync.o \
110 xfs_xattr.o) 108 xfs_xattr.o)
111 109
112# Objects in support/ 110# Objects in support/
113xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
114 debug.o \
115 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b6..52dbd14260ba 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
413 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
415 415
416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
417 WRITE_SYNC_PLUG : WRITE, bio);
418} 417}
419 418
420STATIC struct bio * 419STATIC struct bio *
@@ -854,7 +853,7 @@ xfs_aops_discard_page(
854 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
855 goto out_invalidate; 854 goto out_invalidate;
856 855
857 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
858 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
859 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
860 859
@@ -872,7 +871,7 @@ xfs_aops_discard_page(
872 if (error) { 871 if (error) {
873 /* something screwed, just bail */ 872 /* something screwed, just bail */
874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
875 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
876 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
877 } 876 }
878 break; 877 break;
@@ -1411,7 +1410,7 @@ xfs_vm_write_failed(
1411 if (error) { 1410 if (error) {
1412 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1413 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1414 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1415 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1416 ip->i_ino); 1415 ip->i_ino);
1417 } 1416 }
@@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1495 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1496 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1497 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1498 .sync_page = block_sync_page,
1499 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1500 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1501 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..c05324d3282c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -401,9 +401,8 @@ _xfs_buf_lookup_pages(
401 * handle buffer allocation failures we can't do much. 401 * handle buffer allocation failures we can't do much.
402 */ 402 */
403 if (!(++retries % 100)) 403 if (!(++retries % 100))
404 printk(KERN_ERR 404 xfs_err(NULL,
405 "XFS: possible memory allocation " 405 "possible memory allocation deadlock in %s (mode:0x%x)",
406 "deadlock in %s (mode:0x%x)\n",
407 __func__, gfp_mask); 406 __func__, gfp_mask);
408 407
409 XFS_STATS_INC(xb_page_retries); 408 XFS_STATS_INC(xb_page_retries);
@@ -615,8 +614,8 @@ xfs_buf_get(
615 if (!(bp->b_flags & XBF_MAPPED)) { 614 if (!(bp->b_flags & XBF_MAPPED)) {
616 error = _xfs_buf_map_pages(bp, flags); 615 error = _xfs_buf_map_pages(bp, flags);
617 if (unlikely(error)) { 616 if (unlikely(error)) {
618 printk(KERN_WARNING "%s: failed to map pages\n", 617 xfs_warn(target->bt_mount,
619 __func__); 618 "%s: failed to map pages\n", __func__);
620 goto no_buffer; 619 goto no_buffer;
621 } 620 }
622 } 621 }
@@ -850,8 +849,8 @@ xfs_buf_get_uncached(
850 849
851 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 850 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
852 if (unlikely(error)) { 851 if (unlikely(error)) {
853 printk(KERN_WARNING "%s: failed to map pages\n", 852 xfs_warn(target->bt_mount,
854 __func__); 853 "%s: failed to map pages\n", __func__);
855 goto fail_free_mem; 854 goto fail_free_mem;
856 } 855 }
857 856
@@ -991,7 +990,7 @@ xfs_buf_lock(
991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 990 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
992 xfs_log_force(bp->b_target->bt_mount, 0); 991 xfs_log_force(bp->b_target->bt_mount, 0);
993 if (atomic_read(&bp->b_io_remaining)) 992 if (atomic_read(&bp->b_io_remaining))
994 blk_run_address_space(bp->b_target->bt_mapping); 993 blk_flush_plug(current);
995 down(&bp->b_sema); 994 down(&bp->b_sema);
996 XB_SET_OWNER(bp); 995 XB_SET_OWNER(bp);
997 996
@@ -1035,9 +1034,7 @@ xfs_buf_wait_unpin(
1035 set_current_state(TASK_UNINTERRUPTIBLE); 1034 set_current_state(TASK_UNINTERRUPTIBLE);
1036 if (atomic_read(&bp->b_pin_count) == 0) 1035 if (atomic_read(&bp->b_pin_count) == 0)
1037 break; 1036 break;
1038 if (atomic_read(&bp->b_io_remaining)) 1037 io_schedule();
1039 blk_run_address_space(bp->b_target->bt_mapping);
1040 schedule();
1041 } 1038 }
1042 remove_wait_queue(&bp->b_waiters, &wait); 1039 remove_wait_queue(&bp->b_waiters, &wait);
1043 set_current_state(TASK_RUNNING); 1040 set_current_state(TASK_RUNNING);
@@ -1443,7 +1440,7 @@ xfs_buf_iowait(
1443 trace_xfs_buf_iowait(bp, _RET_IP_); 1440 trace_xfs_buf_iowait(bp, _RET_IP_);
1444 1441
1445 if (atomic_read(&bp->b_io_remaining)) 1442 if (atomic_read(&bp->b_io_remaining))
1446 blk_run_address_space(bp->b_target->bt_mapping); 1443 blk_flush_plug(current);
1447 wait_for_completion(&bp->b_iowait); 1444 wait_for_completion(&bp->b_iowait);
1448 1445
1449 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1446 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1617,8 +1614,8 @@ xfs_setsize_buftarg_flags(
1617 btp->bt_smask = sectorsize - 1; 1614 btp->bt_smask = sectorsize - 1;
1618 1615
1619 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1616 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1620 printk(KERN_WARNING 1617 xfs_warn(btp->bt_mount,
1621 "XFS: Cannot set_blocksize to %u on device %s\n", 1618 "Cannot set_blocksize to %u on device %s\n",
1622 sectorsize, XFS_BUFTARG_NAME(btp)); 1619 sectorsize, XFS_BUFTARG_NAME(btp));
1623 return EINVAL; 1620 return EINVAL;
1624 } 1621 }
@@ -1667,7 +1664,6 @@ xfs_mapping_buftarg(
1667 struct inode *inode; 1664 struct inode *inode;
1668 struct address_space *mapping; 1665 struct address_space *mapping;
1669 static const struct address_space_operations mapping_aops = { 1666 static const struct address_space_operations mapping_aops = {
1670 .sync_page = block_sync_page,
1671 .migratepage = fail_migrate_page, 1667 .migratepage = fail_migrate_page,
1672 }; 1668 };
1673 1669
@@ -1948,7 +1944,7 @@ xfsbufd(
1948 count++; 1944 count++;
1949 } 1945 }
1950 if (count) 1946 if (count)
1951 blk_run_address_space(target->bt_mapping); 1947 blk_flush_plug(current);
1952 1948
1953 } while (!kthread_should_stop()); 1949 } while (!kthread_should_stop());
1954 1950
@@ -1996,7 +1992,7 @@ xfs_flush_buftarg(
1996 1992
1997 if (wait) { 1993 if (wait) {
1998 /* Expedite and wait for IO to complete. */ 1994 /* Expedite and wait for IO to complete. */
1999 blk_run_address_space(target->bt_mapping); 1995 blk_flush_plug(current);
2000 while (!list_empty(&wait_list)) { 1996 while (!list_empty(&wait_list)) {
2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1997 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2002 1998
@@ -2022,11 +2018,12 @@ xfs_buf_init(void)
2022 if (!xfslogd_workqueue) 2018 if (!xfslogd_workqueue)
2023 goto out_free_buf_zone; 2019 goto out_free_buf_zone;
2024 2020
2025 xfsdatad_workqueue = create_workqueue("xfsdatad"); 2021 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
2026 if (!xfsdatad_workqueue) 2022 if (!xfsdatad_workqueue)
2027 goto out_destroy_xfslogd_workqueue; 2023 goto out_destroy_xfslogd_workqueue;
2028 2024
2029 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 2025 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
2026 WQ_MEM_RECLAIM, 1);
2030 if (!xfsconvertd_workqueue) 2027 if (!xfsconvertd_workqueue)
2031 goto out_destroy_xfsdatad_workqueue; 2028 goto out_destroy_xfsdatad_workqueue;
2032 2029
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
152 152
153 if (!capable(CAP_SYS_ADMIN)) 153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM); 154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
155 if (copy_from_user(&range, urange, sizeof(range))) 157 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT); 158 return -XFS_ERROR(EFAULT);
157 159
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
89 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
90 */ 90 */
91 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
92 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
93 return 255; 94 return 255;
95 }
94 *max_len = len; 96 *max_len = len;
95 97
96 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index b06ede1d0bed..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
695 xfs_mount_t *mp, 695 xfs_mount_t *mp,
696 void __user *arg) 696 void __user *arg)
697{ 697{
698 xfs_fsop_geom_v1_t fsgeo; 698 xfs_fsop_geom_t fsgeo;
699 int error; 699 int error;
700 700
701 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 701 error = xfs_fs_geometry(mp, &fsgeo, 3);
702 if (error) 702 if (error)
703 return -error; 703 return -error;
704 704
705 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 705 /*
706 * Caller should have passed an argument of type
707 * xfs_fsop_geom_v1_t. This is a proper subset of the
708 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
709 */
710 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
706 return -XFS_ERROR(EFAULT); 711 return -XFS_ERROR(EFAULT);
707 return 0; 712 return 0;
708} 713}
@@ -985,10 +990,22 @@ xfs_ioctl_setattr(
985 990
986 /* 991 /*
987 * Extent size must be a multiple of the appropriate block 992 * Extent size must be a multiple of the appropriate block
988 * size, if set at all. 993 * size, if set at all. It must also be smaller than the
994 * maximum extent size supported by the filesystem.
995 *
996 * Also, for non-realtime files, limit the extent size hint to
997 * half the size of the AGs in the filesystem so alignment
998 * doesn't result in extents larger than an AG.
989 */ 999 */
990 if (fa->fsx_extsize != 0) { 1000 if (fa->fsx_extsize != 0) {
991 xfs_extlen_t size; 1001 xfs_extlen_t size;
1002 xfs_fsblock_t extsize_fsb;
1003
1004 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1005 if (extsize_fsb > MAXEXTLEN) {
1006 code = XFS_ERROR(EINVAL);
1007 goto error_return;
1008 }
992 1009
993 if (XFS_IS_REALTIME_INODE(ip) || 1010 if (XFS_IS_REALTIME_INODE(ip) ||
994 ((mask & FSX_XFLAGS) && 1011 ((mask & FSX_XFLAGS) &&
@@ -997,6 +1014,10 @@ xfs_ioctl_setattr(
997 mp->m_sb.sb_blocklog; 1014 mp->m_sb.sb_blocklog;
998 } else { 1015 } else {
999 size = mp->m_sb.sb_blocksize; 1016 size = mp->m_sb.sb_blocksize;
1017 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1018 code = XFS_ERROR(EINVAL);
1019 goto error_return;
1020 }
1000 } 1021 }
1001 1022
1002 if (fa->fsx_extsize % size) { 1023 if (fa->fsx_extsize % size) {
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index bd5727852fd6..9ff7fc603d2f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -102,7 +102,8 @@ xfs_mark_inode_dirty(
102STATIC int 102STATIC int
103xfs_init_security( 103xfs_init_security(
104 struct inode *inode, 104 struct inode *inode,
105 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
106{ 107{
107 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
108 size_t length; 109 size_t length;
@@ -110,7 +111,7 @@ xfs_init_security(
110 unsigned char *name; 111 unsigned char *name;
111 int error; 112 int error;
112 113
113 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
114 &value, &length); 115 &value, &length);
115 if (error) { 116 if (error) {
116 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -194,7 +195,7 @@ xfs_vn_mknod(
194 195
195 inode = VFS_I(ip); 196 inode = VFS_I(ip);
196 197
197 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
198 if (unlikely(error)) 199 if (unlikely(error))
199 goto out_cleanup_inode; 200 goto out_cleanup_inode;
200 201
@@ -367,7 +368,7 @@ xfs_vn_symlink(
367 368
368 inode = VFS_I(cip); 369 inode = VFS_I(cip);
369 370
370 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
371 if (unlikely(error)) 372 if (unlikely(error))
372 goto out_cleanup_inode; 373 goto out_cleanup_inode;
373 374
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 096494997747..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -39,7 +39,6 @@
39#include <mrlock.h> 39#include <mrlock.h>
40#include <time.h> 40#include <time.h>
41 41
42#include <support/debug.h>
43#include <support/uuid.h> 42#include <support/uuid.h>
44 43
45#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -86,6 +85,7 @@
86#include <xfs_aops.h> 85#include <xfs_aops.h>
87#include <xfs_super.h> 86#include <xfs_super.h>
88#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
89 89
90/* 90/*
91 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -280,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
280#define __arch_pack 280#define __arch_pack
281#endif 281#endif
282 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
283#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..508e06fd7d1e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,133 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static int
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname)
38 return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return printk("%sXFS: %pV\n", level, vaf);
40}
41
42int xfs_printk(
43 const char *level,
44 const struct xfs_mount *mp,
45 const char *fmt, ...)
46{
47 struct va_format vaf;
48 va_list args;
49 int r;
50
51 va_start(args, fmt);
52
53 vaf.fmt = fmt;
54 vaf.va = &args;
55
56 r = __xfs_printk(level, mp, &vaf);
57 va_end(args);
58
59 return r;
60}
61
62#define define_xfs_printk_level(func, kern_level) \
63int func(const struct xfs_mount *mp, const char *fmt, ...) \
64{ \
65 struct va_format vaf; \
66 va_list args; \
67 int r; \
68 \
69 va_start(args, fmt); \
70 \
71 vaf.fmt = fmt; \
72 vaf.va = &args; \
73 \
74 r = __xfs_printk(kern_level, mp, &vaf); \
75 va_end(args); \
76 \
77 return r; \
78} \
79
80define_xfs_printk_level(xfs_emerg, KERN_EMERG);
81define_xfs_printk_level(xfs_alert, KERN_ALERT);
82define_xfs_printk_level(xfs_crit, KERN_CRIT);
83define_xfs_printk_level(xfs_err, KERN_ERR);
84define_xfs_printk_level(xfs_warn, KERN_WARNING);
85define_xfs_printk_level(xfs_notice, KERN_NOTICE);
86define_xfs_printk_level(xfs_info, KERN_INFO);
87#ifdef DEBUG
88define_xfs_printk_level(xfs_debug, KERN_DEBUG);
89#endif
90
91int
92xfs_alert_tag(
93 const struct xfs_mount *mp,
94 int panic_tag,
95 const char *fmt, ...)
96{
97 struct va_format vaf;
98 va_list args;
99 int do_panic = 0;
100 int r;
101
102 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
103 xfs_printk(KERN_ALERT, mp,
104 "XFS: Transforming an alert into a BUG.");
105 do_panic = 1;
106 }
107
108 va_start(args, fmt);
109
110 vaf.fmt = fmt;
111 vaf.va = &args;
112
113 r = __xfs_printk(KERN_ALERT, mp, &vaf);
114 va_end(args);
115
116 BUG_ON(do_panic);
117
118 return r;
119}
120
121void
122assfail(char *expr, char *file, int line)
123{
124 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
125 expr, file, line);
126 BUG();
127}
128
129void
130xfs_hex_dump(void *p, int length)
131{
132 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
133}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..e77ffa16745b
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,38 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern int xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern int xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31#define xfs_debug(mp, fmt, ...) (0)
32#endif
33
34extern void assfail(char *expr, char *f, int l);
35
36extern void xfs_hex_dump(void *p, int length);
37
38#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9731898083ae..818c4cf2de86 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -610,7 +599,7 @@ xfs_blkdev_get(
610 mp); 599 mp);
611 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
612 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
613 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
614 } 603 }
615 604
616 return -error; 605 return -error;
@@ -664,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
664 int error; 653 int error;
665 654
666 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
667 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
668 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
669 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
670 return; 659 return;
671 } 660 }
672 661
673 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
674 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
675 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
676 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
677 return; 666 return;
678 } 667 }
679 668
680 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
681 if (error) { 670 if (error) {
682 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
683 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
684 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
685 return; 674 return;
686 } 675 }
@@ -743,8 +732,8 @@ xfs_open_devices(
743 goto out_close_logdev; 732 goto out_close_logdev;
744 733
745 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
746 cmn_err(CE_WARN, 735 xfs_warn(mp,
747 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
748 error = EINVAL; 737 error = EINVAL;
749 goto out_close_rtdev; 738 goto out_close_rtdev;
750 } 739 }
@@ -1345,8 +1334,8 @@ xfs_fs_remount(
1345 * options that we can't actually change. 1334 * options that we can't actually change.
1346 */ 1335 */
1347#if 0 1336#if 0
1348 printk(KERN_INFO 1337 xfs_info(mp,
1349 "XFS: mount option \"%s\" not supported for remount\n", p); 1338 "mount option \"%s\" not supported for remount\n", p);
1350 return -EINVAL; 1339 return -EINVAL;
1351#else 1340#else
1352 break; 1341 break;
@@ -1367,8 +1356,7 @@ xfs_fs_remount(
1367 if (mp->m_update_flags) { 1356 if (mp->m_update_flags) {
1368 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1357 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1369 if (error) { 1358 if (error) {
1370 cmn_err(CE_WARN, 1359 xfs_warn(mp, "failed to write sb changes");
1371 "XFS: failed to write sb changes");
1372 return error; 1360 return error;
1373 } 1361 }
1374 mp->m_update_flags = 0; 1362 mp->m_update_flags = 0;
@@ -1452,15 +1440,15 @@ xfs_finish_flags(
1452 mp->m_logbsize = mp->m_sb.sb_logsunit; 1440 mp->m_logbsize = mp->m_sb.sb_logsunit;
1453 } else if (mp->m_logbsize > 0 && 1441 } else if (mp->m_logbsize > 0 &&
1454 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1442 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1455 cmn_err(CE_WARN, 1443 xfs_warn(mp,
1456 "XFS: logbuf size must be greater than or equal to log stripe size"); 1444 "logbuf size must be greater than or equal to log stripe size");
1457 return XFS_ERROR(EINVAL); 1445 return XFS_ERROR(EINVAL);
1458 } 1446 }
1459 } else { 1447 } else {
1460 /* Fail a mount if the logbuf is larger than 32K */ 1448 /* Fail a mount if the logbuf is larger than 32K */
1461 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1449 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1462 cmn_err(CE_WARN, 1450 xfs_warn(mp,
1463 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1451 "logbuf size for version 1 logs must be 16K or 32K");
1464 return XFS_ERROR(EINVAL); 1452 return XFS_ERROR(EINVAL);
1465 } 1453 }
1466 } 1454 }
@@ -1477,8 +1465,8 @@ xfs_finish_flags(
1477 * prohibit r/w mounts of read-only filesystems 1465 * prohibit r/w mounts of read-only filesystems
1478 */ 1466 */
1479 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1467 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1480 cmn_err(CE_WARN, 1468 xfs_warn(mp,
1481 "XFS: cannot mount a read-only filesystem as read-write"); 1469 "cannot mount a read-only filesystem as read-write");
1482 return XFS_ERROR(EROFS); 1470 return XFS_ERROR(EROFS);
1483 } 1471 }
1484 1472
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..6c10f1d2e3d3 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -425,8 +425,7 @@ xfs_quiesce_attr(
425 /* Push the superblock and write an unmount record */ 425 /* Push the superblock and write an unmount record */
426 error = xfs_log_sbcount(mp, 1); 426 error = xfs_log_sbcount(mp, 1);
427 if (error) 427 if (error)
428 xfs_fs_cmn_err(CE_WARN, mp, 428 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
429 "xfs_attr_quiesce: failed to log sb changes. "
430 "Frozen image may not be consistent."); 429 "Frozen image may not be consistent.");
431 xfs_log_unmount_write(mp); 430 xfs_log_unmount_write(mp);
432 xfs_unmountfs_writesb(mp); 431 xfs_unmountfs_writesb(mp);
@@ -806,7 +805,7 @@ xfs_reclaim_inode(
806 * pass on the error. 805 * pass on the error.
807 */ 806 */
808 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 807 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
809 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 808 xfs_warn(ip->i_mount,
810 "inode 0x%llx background reclaim flush failed with %d", 809 "inode 0x%llx background reclaim flush failed with %d",
811 (long long)ip->i_ino, error); 810 (long long)ip->i_ino, error);
812 } 811 }
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index ee3cee097e7e..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 38
39 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
41 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
42 preempt_disable(); 42 preempt_disable();
43 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d22aa3103106..7e2416478503 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -544,9 +544,10 @@ xfs_qm_dqtobp(
544 /* 544 /*
545 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
546 */ 546 */
547 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
549 "dqtobp")) { 549 "dqtobp");
550 if (error) {
550 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
551 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
552 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -827,7 +828,7 @@ xfs_qm_dqget(
827 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
828 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
829 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
830 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
831 return (EIO); 832 return (EIO);
832 } 833 }
833 } 834 }
@@ -1207,8 +1208,9 @@ xfs_qm_dqflush(
1207 /* 1208 /*
1208 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1209 */ 1210 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1212 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1391,8 +1393,8 @@ xfs_qm_dqpurge(
1391 */ 1393 */
1392 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1393 if (error) 1395 if (error)
1394 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1395 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1396 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1397 } 1399 }
1398 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1425,36 +1427,38 @@ xfs_qm_dqpurge(
1425void 1427void
1426xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1427{ 1429{
1428 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1429 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1430 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1431 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1432 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1433 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1434 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1435 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1436 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1437 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1438 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1439 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1440 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1441 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1442 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1443 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1444 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1445 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1446 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1447 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1448 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1449 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1450 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1451 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1452 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1453 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1454 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1455 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1456 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1457 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1458} 1462}
1459#endif 1463#endif
1460 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..254ee062bd7d 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -1230,13 +1227,6 @@ xfs_qm_qino_alloc(
1230 } 1227 }
1231 1228
1232 /* 1229 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1230 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1231 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1232 * VERSIONNUM for example.
@@ -1264,7 +1254,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1254 xfs_mod_sb(tp, sbfields);
1265 1255
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1256 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1257 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1258 return error;
1269 } 1259 }
1270 return 0; 1260 return 0;
@@ -1299,7 +1289,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1289 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1290 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1291 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1292 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1293 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1294 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1295 ddq->d_icount = 0;
@@ -1676,7 +1666,7 @@ xfs_qm_quotacheck(
1676 */ 1666 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1667 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1668
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1669 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1670
1681 /* 1671 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1672 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1744,9 @@ xfs_qm_quotacheck(
1754 1744
1755 error_return: 1745 error_return:
1756 if (error) { 1746 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1747 xfs_warn(mp,
1758 "Disabling quotas.", 1748 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1749 error);
1760 /* 1750 /*
1761 * We must turn off quotas. 1751 * We must turn off quotas.
1762 */ 1752 */
@@ -1764,12 +1754,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1754 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1755 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1756 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1757 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1758 "Quotacheck: Failed to reset quota flags.");
1769 } 1759 }
1770 } else { 1760 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1761 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1762 return (error);
1774} 1763}
1775 1764
@@ -1863,12 +1852,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1852 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1853 xfs_dquot_t *dqp;
1865 int restarts; 1854 int restarts;
1855 int startagain;
1866 1856
1867 restarts = 0; 1857 restarts = 0;
1868 dqpout = NULL; 1858 dqpout = NULL;
1869 1859
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1860 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1861again:
1862 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1863 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1864
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1865 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1876,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1876 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1877
1887 trace_xfs_dqreclaim_want(dqp); 1878 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1879 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1880 restarts++;
1881 startagain = 1;
1882 goto dqunlock;
1895 } 1883 }
1896 1884
1897 /* 1885 /*
@@ -1906,23 +1894,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1894 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1895 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1896 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1897 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1898 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1899 goto dqunlock;
1913 } 1900 }
1914 1901
1915 ASSERT(dqp->q_hash); 1902 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1903 ASSERT(!list_empty(&dqp->q_mplist));
1917 1904
1918 /* 1905 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1906 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1907 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1908 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1909 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1910 goto dqunlock;
1924 continue;
1925 }
1926 1911
1927 /* 1912 /*
1928 * We have the flush lock so we know that this is not in the 1913 * We have the flush lock so we know that this is not in the
@@ -1941,11 +1926,10 @@ startagain:
1941 */ 1926 */
1942 error = xfs_qm_dqflush(dqp, 0); 1927 error = xfs_qm_dqflush(dqp, 0);
1943 if (error) { 1928 if (error) {
1944 xfs_fs_cmn_err(CE_WARN, mp, 1929 xfs_warn(mp, "%s: dquot %p flush failed",
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1930 __func__, dqp);
1946 } 1931 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1932 goto dqunlock;
1948 continue;
1949 } 1933 }
1950 1934
1951 /* 1935 /*
@@ -1967,13 +1951,8 @@ startagain:
1967 */ 1951 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1952 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1953 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1954 startagain = 1;
1971 xfs_dqfunlock(dqp); 1955 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1956 }
1978 1957
1979 ASSERT(dqp->q_nrefs == 0); 1958 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1965,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1965 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1966 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1967 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1968qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1969 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1970dqfunlock:
1991 xfs_dqfunlock(dqp); 1971 xfs_dqfunlock(dqp);
1972dqunlock:
1992 xfs_dqunlock(dqp); 1973 xfs_dqunlock(dqp);
1993 if (dqpout) 1974 if (dqpout)
1994 break; 1975 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1976 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1977 break;
1978 if (startagain) {
1979 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1980 goto again;
1981 }
1997 } 1982 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1983 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1984 return dqpout;
@@ -2119,7 +2104,7 @@ xfs_qm_write_sb_changes(
2119 int error; 2104 int error;
2120 2105
2121#ifdef QUOTADEBUG 2106#ifdef QUOTADEBUG
2122 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2107 xfs_notice(mp, "Writing superblock quota changes");
2123#endif 2108#endif
2124 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2109 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2125 if ((error = xfs_trans_reserve(tp, 0, 2110 if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..774d7ec6df8e 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..c82f06778a27 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -331,7 +326,8 @@ xfs_qm_scall_quotaon(
331 sbflags = 0; 326 sbflags = 0;
332 327
333 if (flags == 0) { 328 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 329 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
330 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 331 return XFS_ERROR(EINVAL);
336 } 332 }
337 333
@@ -352,8 +348,9 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 348 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 349 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 350 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 351 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 352 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
353 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 354 return XFS_ERROR(EINVAL);
358 } 355 }
359 /* 356 /*
@@ -541,7 +538,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 538 q->qi_bsoftlimit = soft;
542 } 539 }
543 } else { 540 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 541 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 542 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 543 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 544 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +554,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 554 q->qi_rtbsoftlimit = soft;
558 } 555 }
559 } else { 556 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 557 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 558 }
562 559
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 560 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +571,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 571 q->qi_isoftlimit = soft;
575 } 572 }
576 } else { 573 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 574 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 575 }
579 576
580 /* 577 /*
@@ -939,10 +936,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 936#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 937{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 938 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 939 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 940 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 941 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 942 xfs_debug(dqp->q_mount, \
943 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 944 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 945 dqp->d_bcount, dqp->d_icount); } \
948} 946}
@@ -966,16 +964,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 964}
967STATIC void 965STATIC void
968xfs_qm_dqtest_print( 966xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 967 struct xfs_mount *mp,
968 struct dqtest *d)
970{ 969{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 970 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 971 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 972 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 973 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 974 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 975 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 976 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 977 xfs_debug(mp, "---------------------------");
979} 978}
980 979
981STATIC void 980STATIC void
@@ -989,12 +988,14 @@ xfs_qm_dqtest_failed(
989{ 988{
990 qmtest_nfails++; 989 qmtest_nfails++;
991 if (error) 990 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 991 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 992 "quotacheck failed id=%d, err=%d\nreason: %s",
993 d->d_id, error, reason);
994 else 994 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 995 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 996 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 997 d->d_id, reason, (int)a, (int)b);
998 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 999 if (dqp)
999 xfs_qm_dqprint(dqp); 1000 xfs_qm_dqprint(dqp);
1000} 1001}
@@ -1021,9 +1022,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1022 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1023 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1024 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1025 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1026 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1027 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1028 err++;
1028 } 1029 }
1029 } 1030 }
@@ -1031,16 +1032,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1032 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1033 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1034 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1035 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1036 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1037 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1038 err++;
1038 } 1039 }
1039 } 1040 }
1040#ifdef QUOTADEBUG 1041#ifdef QUOTADEBUG
1041 if (!err) { 1042 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1043 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1044 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1045 }
1045#endif 1046#endif
1046 return (err); 1047 return (err);
@@ -1137,8 +1138,8 @@ xfs_qm_internalqcheck_adjust(
1137 1138
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1139 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1140 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1141 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1142 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1143 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1144 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1145 return XFS_ERROR(EINVAL);
@@ -1223,12 +1224,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1224 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1225 0, NULL, &done);
1225 if (error) { 1226 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1227 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1228 break;
1228 } 1229 }
1229 } while (!done); 1230 } while (!done);
1230 1231
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1232 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1233 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1234 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1235 xfs_dqhash_t *h;
@@ -1246,10 +1247,10 @@ xfs_qm_internalqcheck(
1246 } 1247 }
1247 1248
1248 if (qmtest_nfails) { 1249 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1250 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1251 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1252 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1253 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1254 }
1254 kmem_free(qmtest_udqtab); 1255 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1256 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 0df88897ef84..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,107 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28void
29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
33{
34 struct va_format vaf;
35 va_list args;
36
37 va_start(args, fmt);
38 vaf.fmt = fmt;
39 vaf.va = &args;
40
41 printk("%s%pV", lvl, &vaf);
42 va_end(args);
43
44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
45}
46
47void
48xfs_fs_cmn_err(
49 const char *lvl,
50 struct xfs_mount *mp,
51 const char *fmt,
52 ...)
53{
54 struct va_format vaf;
55 va_list args;
56
57 va_start(args, fmt);
58 vaf.fmt = fmt;
59 vaf.va = &args;
60
61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
63
64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86 vaf.fmt = fmt;
87 vaf.va = &args;
88
89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
90 va_end(args);
91
92 BUG_ON(do_panic);
93}
94
95void
96assfail(char *expr, char *file, int line)
97{
98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
100 BUG();
101}
102
103void
104xfs_hex_dump(void *p, int length)
105{
106 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
107}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index 05699f67d475..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23struct xfs_mount;
24
25#define CE_DEBUG KERN_DEBUG
26#define CE_CONT KERN_INFO
27#define CE_NOTE KERN_NOTICE
28#define CE_WARN KERN_WARNING
29#define CE_ALERT KERN_ALERT
30#define CE_PANIC KERN_EMERG
31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
39extern void assfail(char *expr, char *f, int l);
40
41#define ASSERT_ALWAYS(expr) \
42 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
43
44#ifndef DEBUG
45#define ASSERT(expr) ((void)0)
46
47#ifndef STATIC
48# define STATIC static noinline
49#endif
50
51#else /* DEBUG */
52
53#define ASSERT(expr) \
54 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
55
56#ifndef STATIC
57# define STATIC noinline
58#endif
59
60#endif /* DEBUG */
61#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9bf..4bc3c649aee4 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -147,10 +147,9 @@ xfs_alloc_get_rec(
147 */ 147 */
148STATIC void 148STATIC void
149xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
150 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
151 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
152 xfs_extlen_t alignment, /* alignment for allocation */
153 xfs_extlen_t minlen, /* minimum length for allocation */
154 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
155 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
156{ 155{
@@ -158,8 +157,8 @@ xfs_alloc_compute_aligned(
158 xfs_extlen_t diff; 157 xfs_extlen_t diff;
159 xfs_extlen_t len; 158 xfs_extlen_t len;
160 159
161 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
162 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
163 diff = bno - foundbno; 162 diff = bno - foundbno;
164 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
165 } else { 164 } else {
@@ -464,6 +463,27 @@ xfs_alloc_read_agfl(
464 return 0; 463 return 0;
465} 464}
466 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
467/* 487/*
468 * Allocation group level functions. 488 * Allocation group level functions.
469 */ 489 */
@@ -505,49 +525,44 @@ xfs_alloc_ag_vextent(
505 ASSERT(0); 525 ASSERT(0);
506 /* NOTREACHED */ 526 /* NOTREACHED */
507 } 527 }
508 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
509 return error; 530 return error;
510 /*
511 * If the allocation worked, need to change the agf structure
512 * (and log it), and the superblock.
513 */
514 if (args->agbno != NULLAGBLOCK) {
515 xfs_agf_t *agf; /* allocation group freelist header */
516 long slen = (long)args->len;
517 531
518 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
519 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
520 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
521 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
522 536
523 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
524 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
525 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
526 -((long)(args->len))); 540 -((long)(args->len)));
527 args->pag->pagf_freeblks -= args->len; 541 if (error)
528 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
529 be32_to_cpu(agf->agf_length)); 543
530 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
531 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
532 /* 546 * transaction as synchronous if blocks are found. This
533 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
534 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
535 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
536 * force to ensure correct ordering as the synchronous 550 */
537 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
538 */ 552 args->agbno, args->len))
539 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
540 args->agbno, args->len))
541 xfs_trans_set_sync(args->tp);
542 }
543 if (!args->isfl)
544 xfs_trans_mod_sb(args->tp,
545 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
546 XFS_TRANS_SB_FDBLOCKS, -slen);
547 XFS_STATS_INC(xs_allocx);
548 XFS_STATS_ADD(xs_allocb, args->len);
549 } 554 }
550 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
551} 566}
552 567
553/* 568/*
@@ -693,8 +708,7 @@ xfs_alloc_find_best_extent(
693 if (error) 708 if (error)
694 goto error0; 709 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, 711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
697 args->minlen, &bno, slena);
698 712
699 /* 713 /*
700 * The good extent is closer than this one. 714 * The good extent is closer than this one.
@@ -866,8 +880,8 @@ xfs_alloc_ag_vextent_near(
866 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
867 goto error0; 881 goto error0;
868 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
869 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
870 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
871 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
872 continue; 886 continue;
873 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -987,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
987 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
988 goto error0; 1002 goto error0;
989 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
990 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
991 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
992 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
993 break; 1007 break;
994 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
1003 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1004 goto error0; 1018 goto error0;
1005 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1006 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1007 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
1008 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
1009 break; 1023 break;
1010 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1183,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1183 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1184 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1185 */ 1199 */
1186 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1187 &rbno, &rlen);
1188 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1189 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1190 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1209 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1210 if (flen < bestrlen) 1223 if (flen < bestrlen)
1211 break; 1224 break;
1212 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1213 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1214 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1215 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1216 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1388,6 +1401,7 @@ xfs_free_ag_extent(
1388 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1389 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1390 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1391 1405
1392 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1393 /* 1407 /*
@@ -1586,30 +1600,20 @@ xfs_free_ag_extent(
1586 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1587 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1588 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1589 /* 1604 /*
1590 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1591 */ 1606 */
1592 { 1607 pag = xfs_perag_get(mp, agno);
1593 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1594 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1595 1610 if (error)
1596 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1597 pag->pagf_freeblks += len; 1612
1598 xfs_perag_put(pag); 1613 if (!isfl)
1599 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1600 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1601 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1602 xfs_trans_agblocks_delta(tp, len);
1603 XFS_WANT_CORRUPTED_GOTO(
1604 be32_to_cpu(agf->agf_freeblks) <=
1605 be32_to_cpu(agf->agf_length),
1606 error0);
1607 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1608 if (!isfl)
1609 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1610 XFS_STATS_INC(xs_freex);
1611 XFS_STATS_ADD(xs_freeb, len);
1612 }
1613 1617
1614 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1615 1619
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 0ab56b32c7eb..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -75,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
76 76
77/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
79 * allocation maximum size to the size the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
78 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
79 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
80 * down several levels of the stack. 96 * down several levels of the stack.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
@@ -2348,6 +2365,13 @@ xfs_bmap_rtalloc(
2348 */ 2365 */
2349 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2350 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2351 /* 2375 /*
2352 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2353 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -2430,7 +2454,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2454 startag = ag = 0;
2431 2455
2432 pag = xfs_perag_get(mp, ag); 2456 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2457 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2458 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2459 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2460 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2476,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2476 notinit = 1;
2453 2477
2454 if (xfs_inode_is_filestream(ap->ip)) { 2478 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2479 if (*blen >= args->maxlen)
2456 break; 2480 break;
2457 2481
2458 if (ap->userdata) { 2482 if (ap->userdata) {
@@ -2498,14 +2522,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2522 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2523 * length, use the best as the minimum.
2500 */ 2524 */
2501 else if (*blen < ap->alen) 2525 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2526 args->minlen = *blen;
2503 /* 2527 /*
2504 * Otherwise we've seen an extent as big as alen, 2528 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2529 * use that as the minimum.
2506 */ 2530 */
2507 else 2531 else
2508 args->minlen = ap->alen; 2532 args->minlen = args->maxlen;
2509 2533
2510 /* 2534 /*
2511 * set the failure fallback case to look in the selected 2535 * set the failure fallback case to look in the selected
@@ -2573,7 +2597,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2597 args.tp = ap->tp;
2574 args.mp = mp; 2598 args.mp = mp;
2575 args.fsbno = ap->rval; 2599 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2600
2601 /* Trim the allocation back to the maximum an AG can fit. */
2602 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2603 args.firstblock = ap->firstblock;
2578 blen = 0; 2604 blen = 0;
2579 if (nullfb) { 2605 if (nullfb) {
@@ -2621,7 +2647,7 @@ xfs_bmap_btalloc(
2621 /* 2647 /*
2622 * Adjust for alignment 2648 * Adjust for alignment
2623 */ 2649 */
2624 if (blen > args.alignment && blen <= ap->alen) 2650 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2651 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2652 args.minalignslop = 0;
2627 } else { 2653 } else {
@@ -2640,7 +2666,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2666 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2667 * between the calls.
2642 */ 2668 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2669 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2670 nextminlen = blen - mp->m_dalign;
2645 else 2671 else
2646 nextminlen = args.minlen; 2672 nextminlen = args.minlen;
@@ -3500,7 +3526,7 @@ xfs_bmap_search_extents(
3500 3526
3501 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3502 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3503 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3504 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3505 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3506 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4174,12 +4200,11 @@ xfs_bmap_read_extents(
4174 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4175 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4176 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4177 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4178 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4179 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4180 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4181 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4182 ip->i_mount);
4183 goto error0; 4208 goto error0;
4184 } 4209 }
4185 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -4485,6 +4510,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4510 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4511 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4512 if (extsz) {
4513 /*
4514 * make sure we don't exceed a single
4515 * extent length when we align the
4516 * extent by reducing length we are
4517 * going to allocate by the maximum
4518 * amount extent size aligment may
4519 * require.
4520 */
4521 alen = XFS_FILBLKS_MIN(len,
4522 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4523 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4524 &got, &prev, extsz,
4490 rt, eof, 4525 rt, eof,
@@ -5743,7 +5778,7 @@ xfs_check_block(
5743 else 5778 else
5744 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5745 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5746 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5747 __func__, j, i, 5782 __func__, j, i,
5748 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5749 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5908,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5908 return; 5943 return;
5909 5944
5910error0: 5945error0:
5911 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5912 if (bp_release) 5947 if (bp_release)
5913 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5914error_norelse: 5949error_norelse:
5915 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5916 __func__, i); 5951 __func__, i);
5917 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5918 return; 5953 return;
@@ -6115,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6115 if (error) { 6150 if (error) {
6116 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6121 } 6156 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 98c6f73b6752..e5413d96f1af 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -427,13 +429,15 @@ xfs_buf_item_unpin(
427 429
428 if (remove) { 430 if (remove) {
429 /* 431 /*
430 * We have to remove the log item from the transaction 432 * If we are in a transaction context, we have to
431 * as we are about to release our reference to the 433 * remove the log item from the transaction as we are
432 * buffer. If we don't, the unlock that occurs later 434 * about to release our reference to the buffer. If we
433 * in xfs_trans_uncommit() will ry to reference the 435 * don't, the unlock that occurs later in
436 * xfs_trans_uncommit() will try to reference the
434 * buffer which we no longer have a hold on. 437 * buffer which we no longer have a hold on.
435 */ 438 */
436 xfs_trans_del_item(lip); 439 if (lip->li_desc)
440 xfs_trans_del_item(lip);
437 441
438 /* 442 /*
439 * Since the transaction no longer refers to the buffer, 443 * Since the transaction no longer refers to the buffer,
@@ -981,10 +985,9 @@ xfs_buf_iodone_callbacks(
981 if (XFS_BUF_TARGET(bp) != lasttarg || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
982 time_after(jiffies, (lasttime + 5*HZ))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
983 lasttime = jiffies; 987 lasttime = jiffies;
984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
985 " block 0x%llx in %s",
986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 990 (__uint64_t)XFS_BUF_ADDR(bp));
988 } 991 }
989 lasttarg = XFS_BUF_TARGET(bp); 992 lasttarg = XFS_BUF_TARGET(bp);
990 993
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 4c7db74a05f7..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,9 +144,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
@@ -162,9 +160,8 @@ xfs_error_report(
162 inst_t *ra) 160 inst_t *ra)
163{ 161{
164 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
165 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
166 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
167 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
168 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
169 166
170 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -184,4 +181,5 @@ xfs_corruption_error(
184 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
185 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
186 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
187} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 10dce5475f02..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,17 +158,4 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_hex_dump(void *p, int length);
166
167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
169
170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
175
176#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 75f2ef60e579..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -138,7 +138,8 @@ xfs_efi_item_unpin(
138 138
139 if (remove) { 139 if (remove) {
140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
141 xfs_trans_del_item(lip); 141 if (lip->li_desc)
142 xfs_trans_del_item(lip);
142 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
143 return; 144 return;
144 } 145 }
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -382,8 +385,8 @@ xfs_growfs_data_private(
382 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
383 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
384 if (error) { 387 if (error) {
385 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
386 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
387 error, agno); 390 error, agno);
388 break; 391 break;
389 } 392 }
@@ -396,7 +399,7 @@ xfs_growfs_data_private(
396 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
397 continue; 400 continue;
398 } else { 401 } else {
399 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
400 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
401 error, agno); 404 error, agno);
402 break; /* no point in continuing */ 405 break; /* no point in continuing */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index be7cf625421f..da871f532236 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2939,16 +2926,16 @@ xfs_iflush_int(
2939 2926
2940 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2941 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2944 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2945 goto corrupt_out; 2932 goto corrupt_out;
2946 } 2933 }
2947 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2948 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2949 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2950 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2951 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2952 goto corrupt_out; 2939 goto corrupt_out;
2953 } 2940 }
2954 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2956,9 +2943,9 @@ xfs_iflush_int(
2956 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2957 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2958 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2959 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2960 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2961 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2962 goto corrupt_out; 2949 goto corrupt_out;
2963 } 2950 }
2964 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2967,28 +2954,28 @@ xfs_iflush_int(
2967 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2968 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2969 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2970 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2971 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2972 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2973 goto corrupt_out; 2960 goto corrupt_out;
2974 } 2961 }
2975 } 2962 }
2976 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2977 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2978 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2979 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2980 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2981 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2982 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2983 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2984 ip);
2985 goto corrupt_out; 2972 goto corrupt_out;
2986 } 2973 }
2987 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2988 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2989 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2990 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2991 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2992 goto corrupt_out; 2979 goto corrupt_out;
2993 } 2980 }
2994 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 5c95fa8ec11d..f753200cef8d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -409,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
409/* 409/*
410 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
411 * 411 *
412 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
413 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
414 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
415 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
416 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
417 * 420 *
418 * XFS_LOCK_INUMORDER - for locking several inodes at the some time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
419 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
420 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
421 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
422 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
423 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
424 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
425 */ 428 */
426#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
427#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
428 433
429#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
430#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
431 436
432#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
433#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
434 441
435#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
436#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55582bd66659..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -101,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
101} 101}
102 102
103STATIC int 103STATIC int
104xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
105 xfs_inode_t *ip, 105 xfs_inode_t *ip,
106 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
107{ 107{
108 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
109 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
110 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
111 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -246,7 +246,7 @@ xfs_iomap_write_direct(
246 } 246 }
247 247
248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
249 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
250 goto error_out; 250 goto error_out;
251 } 251 }
252 252
@@ -337,7 +337,12 @@ xfs_iomap_prealloc_size(
337 int shift = 0; 337 int shift = 0;
338 int64_t freesp; 338 int64_t freesp;
339 339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); 340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks)); 347 rounddown_pow_of_two(alloc_blocks));
343 348
@@ -459,7 +464,7 @@ retry:
459 } 464 }
460 465
461 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
462 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
463 468
464 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
465 return 0; 470 return 0;
@@ -609,7 +614,7 @@ xfs_iomap_write_allocate(
609 * covers at least part of the callers request 614 * covers at least part of the callers request
610 */ 615 */
611 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
612 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
613 618
614 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
615 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
@@ -719,7 +724,7 @@ xfs_iomap_write_unwritten(
719 return XFS_ERROR(error); 724 return XFS_ERROR(error);
720 725
721 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
722 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
723 728
724 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
725 /* 730 /*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae6fef1ff563..25efa9b8a602 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -374,11 +374,10 @@ xfs_log_mount(
374 int error; 374 int error;
375 375
376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
378 else { 378 else {
379 cmn_err(CE_NOTE, 379 xfs_notice(mp,
380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
381 mp->m_fsname);
382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
383 } 382 }
384 383
@@ -393,7 +392,7 @@ xfs_log_mount(
393 */ 392 */
394 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
395 if (error) { 394 if (error) {
396 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
397 goto out_free_log; 396 goto out_free_log;
398 } 397 }
399 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -413,7 +412,8 @@ xfs_log_mount(
413 if (readonly) 412 if (readonly)
414 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
415 if (error) { 414 if (error) {
416 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
417 goto out_destroy_ail; 417 goto out_destroy_ail;
418 } 418 }
419 } 419 }
@@ -542,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
542 */ 542 */
543 } 543 }
544 544
545 if (error) { 545 if (error)
546 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
547 "xfs_log_unmount: unmount record failed");
548 }
549 547
550 548
551 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -852,7 +850,7 @@ xlog_space_left(
852 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
853 * log as the amount of space left. 851 * log as the amount of space left.
854 */ 852 */
855 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
856 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
857 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
858 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
@@ -1001,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1001 999
1002 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1003 if (!log) { 1001 if (!log) {
1004 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1005 goto out; 1003 goto out;
1006 } 1004 }
1007 1005
@@ -1029,24 +1027,24 @@ xlog_alloc_log(xfs_mount_t *mp,
1029 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1030 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1031 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1032 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1033 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1034 goto out_free_log; 1032 goto out_free_log;
1035 } 1033 }
1036 1034
1037 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1038 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1039 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1040 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1041 goto out_free_log; 1039 goto out_free_log;
1042 } 1040 }
1043 1041
1044 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1045 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1046 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1047 1045 xfs_warn(mp,
1048 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1049 "for configuration.", log2_size); 1047 log2_size);
1050 goto out_free_log; 1048 goto out_free_log;
1051 } 1049 }
1052 } 1050 }
@@ -1563,38 +1561,36 @@ xlog_print_tic_res(
1563 "SWAPEXT" 1561 "SWAPEXT"
1564 }; 1562 };
1565 1563
1566 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1567 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1568 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1569 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1570 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1571 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1572 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1573 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1574 " num regions = %u\n", 1572 " num regions = %u\n",
1575 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1576 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1577 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1578 ticket->t_trans_type, 1576 ticket->t_trans_type,
1579 ticket->t_unit_res, 1577 ticket->t_unit_res,
1580 ticket->t_curr_res, 1578 ticket->t_curr_res,
1581 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1582 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1583 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1584 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1585 ticket->t_res_num); 1583 ticket->t_res_num);
1586 1584
1587 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1588 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1589 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1590 "region[%u]: %s - %u bytes\n",
1591 i,
1592 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1593 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1594 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1595 } 1591 }
1596 1592
1597 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1598 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1599 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1600} 1596}
@@ -1682,7 +1678,7 @@ xlog_write_setup_ophdr(
1682 case XFS_LOG: 1678 case XFS_LOG:
1683 break; 1679 break;
1684 default: 1680 default:
1685 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1686 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1687 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1688 return NULL; 1684 return NULL;
@@ -2264,7 +2260,7 @@ xlog_state_do_callback(
2264 if (repeats > 5000) { 2260 if (repeats > 5000) {
2265 flushcnt += repeats; 2261 flushcnt += repeats;
2266 repeats = 0; 2262 repeats = 0;
2267 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2268 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2269 __func__, flushcnt); 2265 __func__, flushcnt);
2270 } 2266 }
@@ -3052,10 +3048,8 @@ xfs_log_force(
3052 int error; 3048 int error;
3053 3049
3054 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3055 if (error) { 3051 if (error)
3056 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3057 "error %d returned.", error);
3058 }
3059} 3053}
3060 3054
3061/* 3055/*
@@ -3204,10 +3198,8 @@ xfs_log_force_lsn(
3204 int error; 3198 int error;
3205 3199
3206 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3207 if (error) { 3201 if (error)
3208 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3209 "error %d returned.", error);
3210 }
3211} 3203}
3212 3204
3213/* 3205/*
@@ -3412,7 +3404,7 @@ xlog_verify_dest_ptr(
3412 } 3404 }
3413 3405
3414 if (!good_ptr) 3406 if (!good_ptr)
3415 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3416} 3408}
3417 3409
3418STATIC void 3410STATIC void
@@ -3448,16 +3440,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3448 blocks = 3440 blocks =
3449 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3441 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3450 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3442 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3451 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3443 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3452 } else { 3444 } else {
3453 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3445 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3454 3446
3455 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3447 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3456 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3448 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3457 3449
3458 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3450 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3459 if (blocks < BTOBB(iclog->ic_offset) + 1) 3451 if (blocks < BTOBB(iclog->ic_offset) + 1)
3460 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3452 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3461 } 3453 }
3462} /* xlog_verify_tail_lsn */ 3454} /* xlog_verify_tail_lsn */
3463 3455
@@ -3497,22 +3489,23 @@ xlog_verify_iclog(xlog_t *log,
3497 icptr = log->l_iclog; 3489 icptr = log->l_iclog;
3498 for (i=0; i < log->l_iclog_bufs; i++) { 3490 for (i=0; i < log->l_iclog_bufs; i++) {
3499 if (icptr == NULL) 3491 if (icptr == NULL)
3500 xlog_panic("xlog_verify_iclog: invalid ptr"); 3492 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3501 icptr = icptr->ic_next; 3493 icptr = icptr->ic_next;
3502 } 3494 }
3503 if (icptr != log->l_iclog) 3495 if (icptr != log->l_iclog)
3504 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3496 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3505 spin_unlock(&log->l_icloglock); 3497 spin_unlock(&log->l_icloglock);
3506 3498
3507 /* check log magic numbers */ 3499 /* check log magic numbers */
3508 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3500 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3509 xlog_panic("xlog_verify_iclog: invalid magic num"); 3501 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3510 3502
3511 ptr = (xfs_caddr_t) &iclog->ic_header; 3503 ptr = (xfs_caddr_t) &iclog->ic_header;
3512 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3504 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3513 ptr += BBSIZE) { 3505 ptr += BBSIZE) {
3514 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3506 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3515 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3507 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3508 __func__);
3516 } 3509 }
3517 3510
3518 /* check fields */ 3511 /* check fields */
@@ -3542,9 +3535,10 @@ xlog_verify_iclog(xlog_t *log,
3542 } 3535 }
3543 } 3536 }
3544 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3537 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3545 cmn_err(CE_WARN, "xlog_verify_iclog: " 3538 xfs_warn(log->l_mp,
3546 "invalid clientid %d op 0x%p offset 0x%lx", 3539 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3547 clientid, ophead, (unsigned long)field_offset); 3540 __func__, clientid, ophead,
3541 (unsigned long)field_offset);
3548 3542
3549 /* check length */ 3543 /* check length */
3550 field_offset = (__psint_t) 3544 field_offset = (__psint_t)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9dc8125d04e5..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -543,7 +543,7 @@ xlog_cil_push(
543 543
544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
545 if (error) 545 if (error)
546 goto out_abort; 546 goto out_abort_free_ticket;
547 547
548 /* 548 /*
549 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -569,8 +569,9 @@ restart:
569 } 569 }
570 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
571 571
572 /* xfs_log_done always frees the ticket on error. */
572 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
573 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
574 goto out_abort; 575 goto out_abort;
575 576
576 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -600,6 +601,8 @@ out_free_ticket:
600 kmem_free(new_ctx); 601 kmem_free(new_ctx);
601 return 0; 602 return 0;
602 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
603out_abort: 606out_abort:
604 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
605 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -622,7 +625,7 @@ out_abort:
622 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
623 * allowed again. 626 * allowed again.
624 */ 627 */
625int 628void
626xfs_log_commit_cil( 629xfs_log_commit_cil(
627 struct xfs_mount *mp, 630 struct xfs_mount *mp,
628 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -637,11 +640,6 @@ xfs_log_commit_cil(
637 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
638 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
639 642
640 if (XLOG_FORCED_SHUTDOWN(log)) {
641 xlog_cil_free_logvec(log_vector);
642 return XFS_ERROR(EIO);
643 }
644
645 /* 643 /*
646 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
647 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -701,7 +699,6 @@ xfs_log_commit_cil(
701 */ 699 */
702 if (push) 700 if (push)
703 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
704 return 0;
705} 702}
706 703
707/* 704/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d5f8be8f4bf6..15dbf1f9c2be 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -87,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
87 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
88} 88}
89 89
90#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
91#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
92#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
93
94/* 90/*
95 * In core log state 91 * In core log state
96 */ 92 */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aa0ebb776903..0c4a5618e7af 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -92,7 +92,7 @@ xlog_get_bp(
92 int nbblks) 92 int nbblks)
93{ 93{
94 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
95 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
96 nbblks); 96 nbblks);
97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
98 return NULL; 98 return NULL;
@@ -160,7 +160,7 @@ xlog_bread_noalign(
160 int error; 160 int error;
161 161
162 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
163 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
164 nbblks); 164 nbblks);
165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
166 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -219,7 +219,7 @@ xlog_bwrite(
219 int error; 219 int error;
220 220
221 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
222 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
223 nbblks); 223 nbblks);
224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
225 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -254,9 +254,9 @@ xlog_header_check_dump(
254 xfs_mount_t *mp, 254 xfs_mount_t *mp,
255 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
256{ 256{
257 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
259 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
261} 261}
262#else 262#else
@@ -279,15 +279,15 @@ xlog_header_check_recover(
279 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
280 */ 280 */
281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
282 xlog_warn( 282 xfs_warn(mp,
283 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
284 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
286 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
287 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
289 xlog_warn( 289 xfs_warn(mp,
290 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
291 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
292 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
293 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -312,9 +312,9 @@ xlog_header_check_mount(
312 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
313 * by IRIX and continue. 313 * by IRIX and continue.
314 */ 314 */
315 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
317 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
318 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
319 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
320 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -490,8 +490,8 @@ xlog_find_verify_log_record(
490 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
491 if (i < start_blk) { 491 if (i < start_blk) {
492 /* valid log record not found */ 492 /* valid log record not found */
493 xlog_warn( 493 xfs_warn(log->l_mp,
494 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
495 ASSERT(0); 495 ASSERT(0);
496 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
497 goto out; 497 goto out;
@@ -591,12 +591,12 @@ xlog_find_head(
591 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
592 * log so we can store the uuid in there 592 * log so we can store the uuid in there
593 */ 593 */
594 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } else if (error) { 598 } else if (error) {
599 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
600 return error; 600 return error;
601 } 601 }
602 602
@@ -819,7 +819,7 @@ validate_head:
819 xlog_put_bp(bp); 819 xlog_put_bp(bp);
820 820
821 if (error) 821 if (error)
822 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
823 return error; 823 return error;
824} 824}
825 825
@@ -912,7 +912,7 @@ xlog_find_tail(
912 } 912 }
913 } 913 }
914 if (!found) { 914 if (!found) {
915 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
916 ASSERT(0); 916 ASSERT(0);
917 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
918 } 918 }
@@ -1028,7 +1028,7 @@ done:
1028 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1029 1029
1030 if (error) 1030 if (error)
1031 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1032 return error; 1032 return error;
1033} 1033}
1034 1034
@@ -1092,7 +1092,8 @@ xlog_find_zeroed(
1092 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1093 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1094 */ 1094 */
1095 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1096 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1097 } 1098 }
1098 1099
@@ -1506,8 +1507,8 @@ xlog_recover_add_to_trans(
1506 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1507 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1508 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1509 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1510 "bad header magic number"); 1511 __func__);
1511 ASSERT(0); 1512 ASSERT(0);
1512 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1513 } 1514 }
@@ -1534,8 +1535,8 @@ xlog_recover_add_to_trans(
1534 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1535 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1536 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1537 xlog_warn( 1538 xfs_warn(log->l_mp,
1538 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1539 in_f->ilf_size); 1540 in_f->ilf_size);
1540 ASSERT(0); 1541 ASSERT(0);
1541 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1592,8 +1593,9 @@ xlog_recover_reorder_trans(
1592 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1593 break; 1594 break;
1594 default: 1595 default:
1595 xlog_warn( 1596 xfs_warn(log->l_mp,
1596 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1597 ASSERT(0); 1599 ASSERT(0);
1598 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1599 } 1601 }
@@ -1803,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1803 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1804 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1805 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1806 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1807 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1808 item, bp); 1811 item, bp);
1809 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1810 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1863,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1863 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1864 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1865 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1866 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1867 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1868 goto next; 1871 goto next;
1869 } 1872 }
1870 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1871 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1872 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1873 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1874 goto next; 1877 goto next;
1875 } 1878 }
1876 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1877 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1878 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1879 if (error) 1882 if (error)
@@ -1898,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1898 */ 1901 */
1899int 1902int
1900xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1901 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1902 xfs_dqid_t id, 1906 xfs_dqid_t id,
1903 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -1924,14 +1928,14 @@ xfs_qm_dqcheck(
1924 */ 1928 */
1925 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1926 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
1927 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
1928 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1929 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1930 errs++; 1934 errs++;
1931 } 1935 }
1932 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
1933 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
1934 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
1935 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1936 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1937 errs++; 1941 errs++;
@@ -1941,7 +1945,7 @@ xfs_qm_dqcheck(
1941 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
1942 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
1943 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
1944 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
1945 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1946 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
1947 errs++; 1951 errs++;
@@ -1949,7 +1953,7 @@ xfs_qm_dqcheck(
1949 1953
1950 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1951 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
1952 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
1953 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
1954 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
1955 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -1962,9 +1966,8 @@ xfs_qm_dqcheck(
1962 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
1963 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
1964 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
1965 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
1966 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
1967 "BLK TIMER NOT STARTED",
1968 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
1969 errs++; 1972 errs++;
1970 } 1973 }
@@ -1974,9 +1977,8 @@ xfs_qm_dqcheck(
1974 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
1975 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
1976 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
1977 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
1978 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
1979 "INODE TIMER NOT STARTED",
1980 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
1981 errs++; 1983 errs++;
1982 } 1984 }
@@ -1986,9 +1988,8 @@ xfs_qm_dqcheck(
1986 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
1987 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
1988 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
1990 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
1991 "RTBLK TIMER NOT STARTED",
1992 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
1993 errs++; 1994 errs++;
1994 } 1995 }
@@ -1999,7 +2000,7 @@ xfs_qm_dqcheck(
1999 return errs; 2000 return errs;
2000 2001
2001 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2002 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2003 2004
2004 /* 2005 /*
2005 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2218,9 +2219,9 @@ xlog_recover_inode_pass2(
2218 */ 2219 */
2219 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2220 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2221 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2223 dip, bp, in_f->ilf_ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2225 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2226 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
@@ -2229,9 +2230,9 @@ xlog_recover_inode_pass2(
2229 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2230 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2231 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2232 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2234 item, in_f->ilf_ino); 2235 __func__, item, in_f->ilf_ino);
2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2236 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2237 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
@@ -2263,9 +2264,10 @@ xlog_recover_inode_pass2(
2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2264 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2265 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2266 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2268 item, dip, bp, in_f->ilf_ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2269 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2270 goto error; 2272 goto error;
2271 } 2273 }
@@ -2276,9 +2278,10 @@ xlog_recover_inode_pass2(
2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2277 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2278 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2279 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2281 item, dip, bp, in_f->ilf_ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2282 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2283 goto error; 2286 goto error;
2284 } 2287 }
@@ -2287,9 +2290,10 @@ xlog_recover_inode_pass2(
2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2288 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2289 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2290 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2292 item, dip, bp, in_f->ilf_ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2293 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2294 dicp->di_nblocks); 2298 dicp->di_nblocks);
2295 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
@@ -2299,8 +2303,9 @@ xlog_recover_inode_pass2(
2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2300 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2301 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2302 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2305 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2306 goto error; 2311 goto error;
@@ -2309,9 +2314,9 @@ xlog_recover_inode_pass2(
2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2310 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2311 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2312 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2313 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2314 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2315 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2316 goto error; 2321 goto error;
2317 } 2322 }
@@ -2398,7 +2403,7 @@ xlog_recover_inode_pass2(
2398 break; 2403 break;
2399 2404
2400 default: 2405 default:
2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2402 ASSERT(0); 2407 ASSERT(0);
2403 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2404 error = EIO; 2409 error = EIO;
@@ -2467,13 +2472,11 @@ xlog_recover_dquot_pass2(
2467 2472
2468 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2469 if (recddq == NULL) { 2474 if (recddq == NULL) {
2470 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2471 "XFS: NULL dquot in %s.", __func__);
2472 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2473 } 2477 }
2474 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2475 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2476 "XFS: dquot too small (%d) in %s.",
2477 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2478 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2479 } 2482 }
@@ -2498,12 +2501,10 @@ xlog_recover_dquot_pass2(
2498 */ 2501 */
2499 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2500 ASSERT(dq_f); 2503 ASSERT(dq_f);
2501 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2502 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2503 0, XFS_QMOPT_DOWARN, 2506 if (error)
2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2505 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2506 }
2507 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2508 2509
2509 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2523,8 +2524,9 @@ xlog_recover_dquot_pass2(
2523 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2524 * minimal initialization then. 2525 * minimal initialization then.
2525 */ 2526 */
2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2527 "xlog_recover_dquot_pass2")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2528 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2529 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2530 } 2532 }
@@ -2676,9 +2678,8 @@ xlog_recover_commit_pass1(
2676 /* nothing to do in pass 1 */ 2678 /* nothing to do in pass 1 */
2677 return 0; 2679 return 0;
2678 default: 2680 default:
2679 xlog_warn( 2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1", 2682 __func__, ITEM_TYPE(item));
2681 ITEM_TYPE(item));
2682 ASSERT(0); 2683 ASSERT(0);
2683 return XFS_ERROR(EIO); 2684 return XFS_ERROR(EIO);
2684 } 2685 }
@@ -2707,9 +2708,8 @@ xlog_recover_commit_pass2(
2707 /* nothing to do in pass2 */ 2708 /* nothing to do in pass2 */
2708 return 0; 2709 return 0;
2709 default: 2710 default:
2710 xlog_warn( 2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2", 2712 __func__, ITEM_TYPE(item));
2712 ITEM_TYPE(item));
2713 ASSERT(0); 2713 ASSERT(0);
2714 return XFS_ERROR(EIO); 2714 return XFS_ERROR(EIO);
2715 } 2715 }
@@ -2751,10 +2751,11 @@ xlog_recover_commit_trans(
2751 2751
2752STATIC int 2752STATIC int
2753xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2754 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2755{ 2756{
2756 /* Do nothing now */ 2757 /* Do nothing now */
2757 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2758 return 0; 2759 return 0;
2759} 2760}
2760 2761
@@ -2797,8 +2798,8 @@ xlog_recover_process_data(
2797 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2798 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2799 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2800 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2801 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2802 ASSERT(0); 2803 ASSERT(0);
2803 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2804 } 2805 }
@@ -2811,8 +2812,8 @@ xlog_recover_process_data(
2811 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2812 } else { 2813 } else {
2813 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2814 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2815 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2816 WARN_ON(1); 2817 WARN_ON(1);
2817 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2818 } 2819 }
@@ -2825,7 +2826,7 @@ xlog_recover_process_data(
2825 trans, pass); 2826 trans, pass);
2826 break; 2827 break;
2827 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2828 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2829 break; 2830 break;
2830 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2831 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2833,8 +2834,8 @@ xlog_recover_process_data(
2833 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2834 break; 2835 break;
2835 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2836 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2837 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2838 ASSERT(0); 2839 ASSERT(0);
2839 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2840 break; 2841 break;
@@ -2844,8 +2845,8 @@ xlog_recover_process_data(
2844 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2845 break; 2846 break;
2846 default: 2847 default:
2847 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2848 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2849 ASSERT(0); 2850 ASSERT(0);
2850 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2851 break; 2852 break;
@@ -3030,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3030out_abort: 3031out_abort:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032out_error: 3033out_error:
3033 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3034 "failed to clear agi %d. Continuing.", agno);
3035 return; 3035 return;
3036} 3036}
3037 3037
@@ -3282,7 +3282,7 @@ xlog_valid_rec_header(
3282 if (unlikely( 3282 if (unlikely(
3283 (!rhead->h_version || 3283 (!rhead->h_version ||
3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3285 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3286 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3287 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3288 } 3288 }
@@ -3740,10 +3740,9 @@ xlog_recover(
3740 return error; 3740 return error;
3741 } 3741 }
3742 3742
3743 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3744 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3745 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3746 log->l_mp->m_logname : "internal");
3747 3746
3748 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3749 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3776,9 +3775,7 @@ xlog_recover_finish(
3776 int error; 3775 int error;
3777 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3778 if (error) { 3777 if (error) {
3779 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3780 "Failed to recover EFIs on filesystem: %s",
3781 log->l_mp->m_fsname);
3782 return error; 3779 return error;
3783 } 3780 }
3784 /* 3781 /*
@@ -3793,15 +3790,12 @@ xlog_recover_finish(
3793 3790
3794 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3795 3792
3796 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3797 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3798 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3799 log->l_mp->m_logname : "internal");
3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3801 } else { 3797 } else {
3802 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3803 "Ending clean XFS mount for filesystem: %s\n",
3804 log->l_mp->m_fsname);
3805 } 3799 }
3806 return 0; 3800 return 0;
3807} 3801}
@@ -3834,10 +3828,8 @@ xlog_recover_check_summary(
3834 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3835 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3836 if (error) { 3830 if (error) {
3837 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3838 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3839 "agf read failed agno %d error %d",
3840 agno, error);
3841 } else { 3833 } else {
3842 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3843 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3846,7 +3838,10 @@ xlog_recover_check_summary(
3846 } 3838 }
3847 3839
3848 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3849 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3850 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3851 3846
3852 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d447aef84bc3..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -1026,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1026 1037
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1030 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1031 } 1042 }
1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1033 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1035 if (!bp) { 1046 if (!bp) {
1036 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1037 return EIO; 1048 return EIO;
1038 } 1049 }
1039 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1041,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1041 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1044 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1045 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1046 } 1057 }
1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1048 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1049 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1050 if (!bp) { 1061 if (!bp) {
1051 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1052 return EIO; 1063 return EIO;
1053 } 1064 }
1054 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1086,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1086 return 0; 1097 return 0;
1087 1098
1088#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1089 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1090#endif 1101#endif
1091 1102
1092 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1094,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1094 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1095 if (error) { 1106 if (error) {
1096 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1097 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1098 "xfs_mount_reset_sbqflags: Superblock update failed!");
1099 return error; 1109 return error;
1100 } 1110 }
1101 1111
@@ -1161,8 +1171,7 @@ xfs_mountfs(
1161 * transaction subsystem is online. 1171 * transaction subsystem is online.
1162 */ 1172 */
1163 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1164 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1165 "XFS: correcting sb_features alignment problem");
1166 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1167 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1168 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1241,7 +1250,7 @@ xfs_mountfs(
1241 */ 1250 */
1242 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1243 if (error) { 1252 if (error) {
1244 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1245 goto out_remove_uuid; 1254 goto out_remove_uuid;
1246 } 1255 }
1247 1256
@@ -1272,12 +1281,12 @@ xfs_mountfs(
1272 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1273 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1274 if (error) { 1283 if (error) {
1275 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1276 goto out_remove_uuid; 1285 goto out_remove_uuid;
1277 } 1286 }
1278 1287
1279 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1280 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1281 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1282 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1283 goto out_free_perag; 1292 goto out_free_perag;
@@ -1290,7 +1299,7 @@ xfs_mountfs(
1290 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1291 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1292 if (error) { 1301 if (error) {
1293 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1294 goto out_free_perag; 1303 goto out_free_perag;
1295 } 1304 }
1296 1305
@@ -1327,16 +1336,14 @@ xfs_mountfs(
1327 */ 1336 */
1328 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1329 if (error) { 1338 if (error) {
1330 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1331 goto out_log_dealloc; 1340 goto out_log_dealloc;
1332 } 1341 }
1333 1342
1334 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1335 1344
1336 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1337 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1338 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1339 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1340 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1341 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1342 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1356,7 +1363,7 @@ xfs_mountfs(
1356 /* 1363 /*
1357 * Free up the root inode. 1364 * Free up the root inode.
1358 */ 1365 */
1359 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1360 goto out_rele_rip; 1367 goto out_rele_rip;
1361 } 1368 }
1362 1369
@@ -1368,7 +1375,7 @@ xfs_mountfs(
1368 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1369 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1370 if (error) { 1377 if (error) {
1371 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1372 goto out_rtunmount; 1379 goto out_rtunmount;
1373 } 1380 }
1374 } 1381 }
@@ -1389,10 +1396,7 @@ xfs_mountfs(
1389 * quotachecked license. 1396 * quotachecked license.
1390 */ 1397 */
1391 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1392 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1393 "XFS: resetting qflags for filesystem %s",
1394 mp->m_fsname);
1395
1396 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1397 if (error) 1401 if (error)
1398 return error; 1402 return error;
@@ -1406,7 +1410,7 @@ xfs_mountfs(
1406 */ 1410 */
1407 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1408 if (error) { 1412 if (error) {
1409 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1410 goto out_rtunmount; 1414 goto out_rtunmount;
1411 } 1415 }
1412 1416
@@ -1435,8 +1439,8 @@ xfs_mountfs(
1435 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1436 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1437 if (error) 1441 if (error)
1438 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1439 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1440 } 1444 }
1441 1445
1442 return 0; 1446 return 0;
@@ -1525,12 +1529,12 @@ xfs_unmountfs(
1525 resblks = 0; 1529 resblks = 0;
1526 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1527 if (error) 1531 if (error)
1528 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1529 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1530 1534
1531 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1532 if (error) 1536 if (error)
1533 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1534 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1535 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1536 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -2013,10 +2017,8 @@ xfs_dev_is_read_only(
2013 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2014 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2015 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2016 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
2017 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
2018 cmn_err(CE_NOTE,
2019 "XFS: write access unavailable, cannot proceed.");
2020 return EROFS; 2022 return EROFS;
2021 } 2023 }
2022 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb6..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 33dbc4e0ad62..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert(
1446 * Bulk operation version of xfs_trans_committed that takes a log vector of 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic. 1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
1452 * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1449 */ 1457 */
1450void 1458void
1451xfs_trans_committed_bulk( 1459xfs_trans_committed_bulk(
@@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk(
1472 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) 1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1473 continue; 1481 continue;
1474 1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1475 if (item_lsn != commit_lsn) { 1493 if (item_lsn != commit_lsn) {
1476 1494
1477 /* 1495 /*
@@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk(
1503} 1521}
1504 1522
1505/* 1523/*
1506 * Called from the trans_commit code when we notice that 1524 * Called from the trans_commit code when we notice that the filesystem is in
1507 * the filesystem is in the middle of a forced shutdown. 1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1508 */ 1533 */
1509STATIC void 1534STATIC void
1510xfs_trans_uncommit( 1535xfs_trans_uncommit(
1511 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1512 uint flags) 1537 uint flags)
1513{ 1538{
1514 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1515 1540
1516 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1517 /*
1518 * Unpin all but those that aren't dirty.
1519 */
1520 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1521 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1522 } 1544 }
@@ -1733,7 +1755,6 @@ xfs_trans_commit_cil(
1733 int flags) 1755 int flags)
1734{ 1756{
1735 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1736 int error;
1737 1758
1738 /* 1759 /*
1739 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1744,9 +1765,7 @@ xfs_trans_commit_cil(
1744 if (!log_vector) 1765 if (!log_vector)
1745 return ENOMEM; 1766 return ENOMEM;
1746 1767
1747 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1748 if (error)
1749 return error;
1750 1769
1751 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1752 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c2042b736b81..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c5bbbc45db91..12aff9584e29 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -563,7 +563,7 @@ xfs_trans_ail_delete_bulk(
563 563
564 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 566 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
567 "%s: attempting to delete a log item that is not in the AIL", 567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__); 568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..3bea66132334 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -403,7 +403,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 403 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 404 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 405 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 406 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 407 return XFS_ERROR(EIO);
408 } 408 }
409 } 409 }
@@ -427,7 +427,7 @@ shutdown_abort:
427 */ 427 */
428#if defined(DEBUG) 428#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 430 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 431#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 433 (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..16084d8ea231 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d8e6f8cd6f0c..37d8146ee15b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1189,9 +1189,8 @@ xfs_inactive(
1189 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1190 */ 1190 */
1191 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1192 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1193 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1194 error, mp->m_fsname);
1195 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1196 } 1195 }
1197 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1208,12 +1207,12 @@ xfs_inactive(
1208 */ 1207 */
1209 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1210 if (error) 1209 if (error)
1211 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1212 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1213 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1214 if (error) 1213 if (error)
1215 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1216 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1217 } 1216 }
1218 1217
1219 /* 1218 /*
@@ -1310,7 +1309,7 @@ xfs_create(
1310 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1311 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1312 if (error) 1311 if (error)
1313 goto std_return; 1312 return error;
1314 1313
1315 if (is_dir) { 1314 if (is_dir) {
1316 rdev = 0; 1315 rdev = 0;
@@ -1390,12 +1389,6 @@ xfs_create(
1390 } 1389 }
1391 1390
1392 /* 1391 /*
1393 * At this point, we've gotten a newly allocated inode.
1394 * It is locked (and joined to the transaction).
1395 */
1396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1397
1398 /*
1399 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1400 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1401 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1440,22 +1433,13 @@ xfs_create(
1440 */ 1433 */
1441 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1442 1435
1443 /*
1444 * xfs_trans_commit normally decrements the vnode ref count
1445 * when it unlocks the inode. Since we want to return the
1446 * vnode to the caller, we bump the vnode ref count now.
1447 */
1448 IHOLD(ip);
1449
1450 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1451 if (error) 1437 if (error)
1452 goto out_abort_rele; 1438 goto out_bmap_cancel;
1453 1439
1454 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1455 if (error) { 1441 if (error)
1456 IRELE(ip); 1442 goto out_release_inode;
1457 goto out_dqrele;
1458 }
1459 1443
1460 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1461 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1469,27 +1453,21 @@ xfs_create(
1469 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1470 out_trans_cancel: 1454 out_trans_cancel:
1471 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1472 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1473 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1474 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1475 1467
1476 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1477 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1478 std_return:
1479 return error; 1470 return error;
1480
1481 out_abort_rele:
1482 /*
1483 * Wait until after the current transaction is aborted to
1484 * release the inode. This prevents recursive transactions
1485 * and deadlocks from xfs_inactive.
1486 */
1487 xfs_bmap_cancel(&free_list);
1488 cancel_flags |= XFS_TRANS_ABORT;
1489 xfs_trans_cancel(tp, cancel_flags);
1490 IRELE(ip);
1491 unlock_dp_on_error = B_FALSE;
1492 goto out_dqrele;
1493} 1471}
1494 1472
1495#ifdef DEBUG 1473#ifdef DEBUG
@@ -2114,9 +2092,8 @@ xfs_symlink(
2114 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2115 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2116 &free_list); 2094 &free_list);
2117 if (error) { 2095 if (error)
2118 goto error1; 2096 goto error2;
2119 }
2120 2097
2121 if (resblks) 2098 if (resblks)
2122 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2148,7 +2125,7 @@ xfs_symlink(
2148 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2149 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2150 if (error) 2127 if (error)
2151 goto error1; 2128 goto error2;
2152 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2153 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2154 2131
@@ -2161,13 +2138,6 @@ xfs_symlink(
2161 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2162 } 2139 }
2163 2140
2164 /*
2165 * xfs_trans_commit normally decrements the vnode ref count
2166 * when it unlocks the inode. Since we want to return the
2167 * vnode to the caller, we bump the vnode ref count now.
2168 */
2169 IHOLD(ip);
2170
2171 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2172 if (error) { 2142 if (error) {
2173 goto error2; 2143 goto error2;